string: add SH4AL-DSP optimizations for memchr (DONE)

This commit is contained in:
Lephenixnoir 2021-05-23 15:51:09 +02:00
parent 591e453717
commit a48c163e55
No known key found for this signature in database
GPG key ID: 1BBA026E13FC0495

View file

@ -1,38 +1,52 @@
#include <bits/asm/cpucap.h>
.global _memchr .global _memchr
.type _memchr, @function .type _memchr, @function
_memchr: _memchr:
mov r4, r0 mov r4, r1
exts.b r5, r5 exts.b r5, r5
/* For small inputs, simply check bytes individually */ /* For small inputs, simply check bytes individually */
mov #64, r2 mov #64, r2
cmp/hi r6, r2 cmp/hi r6, r2
bt .last bt .naive
.large: /* Make a 4-byte version of r5 for cmp/str */ /* Make a 4-byte version of r5 for cmp/str */
extu.b r5, r3 extu.b r5, r3
swap.b r3, r2 swap.b r3, r2
or r3, r2 or r3, r2
swap.w r2, r3 swap.w r2, r3
or r3, r2 or r3, r2
mov.l .___cpucap, r0
mov.l @r0, r0
tst #__CPUCAP_SH4ALDSP, r0
bf .sh4aldsp
/*
** Fast memchr() method on SH3:
** -> Align to 4 bytes with single-byte reads
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
** -> Use a somewhat tight longword-based loop with dt
*/
.sh3:
/* First check 3 bytes to ensure we don't skip bytes when aligning */ /* First check 3 bytes to ensure we don't skip bytes when aligning */
mov.b @r0+, r1 mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
mov.b @r0+, r1 mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
mov.b @r0+, r1 mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
/* Align to a 4-byte boundary */ /* Align to a 4-byte boundary */
shlr2 r0 shlr2 r1
shll2 r0 shll2 r1
add r4, r6 add r4, r6
sub r0, r6 sub r1, r6
mov r6, r7 mov r6, r7
shlr2 r7 shlr2 r7
@ -40,29 +54,70 @@ _memchr:
and r3, r6 and r3, r6
/* Read longwords */ /* Read longwords */
1: mov.l @r0+, r1 1: mov.l @r1+, r0
cmp/str r1, r2 cmp/str r0, r2
bt .found bt .found
dt r7 dt r7
bf 1b bf 1b
.last: /* Don't read if there are no bytes left */ /* Finish the last bytes with a naive method */
bra .naive
nop
/*
** Fast memchr() method on SH4AL-DSP:
** -> Align with an unaligned read
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
** -> Use an extremely tight loop with the DSP repeat function
*/
.sh4aldsp:
/* Check a couple of unaligned bytes first */
movua.l @r1+, r0
cmp/str r0, r2
bt .found
/* Align to a 4-byte boundary */
shlr2 r1
shll2 r1
add r4, r6
sub r1, r6
mov r6, r7
shlr2 r7
ldrs 2f
ldre 3f
ldrc r7
mov #3, r3
and r3, r6
/* Read longwords super efficiently */
2: mov.l @r1+, r0
cmp/str r0, r2
3: bt .found
/* Finish the last few bytes with the naive method */
.naive: /* Don't read if there are no bytes left */
tst r6, r6 tst r6, r6
bt .none bt .none
2: mov.b @r0+, r1 4: mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
dt r6 dt r6
bf 2b bf 4b
.none: rts .none: rts
mov #0, r0 mov #0, r0
.found: /* Go back to find out which of the last 4 bytes is r5 */ .found: /* Go back to find out which of the last 4 bytes is r5 */
add #-4, r0 add #-4, r1
bra 2b bra 4b
mov #4, r6 mov #4, r6
.end: rts .end: mov r1, r0
rts
add #-1, r0 add #-1, r0
.___cpucap:
.long ___cpucap