mirror of
https://git.planet-casio.com/Vhex-Kernel-Core/fxlibc.git
synced 2025-01-01 14:33:36 +01:00
string: add SH4AL-DSP optimizations for memchr (DONE)
This commit is contained in:
parent
591e453717
commit
a48c163e55
1 changed file with 76 additions and 21 deletions
|
@ -1,38 +1,52 @@
|
||||||
|
#include <bits/asm/cpucap.h>
|
||||||
|
|
||||||
.global _memchr
|
.global _memchr
|
||||||
.type _memchr, @function
|
.type _memchr, @function
|
||||||
|
|
||||||
_memchr:
|
_memchr:
|
||||||
mov r4, r0
|
mov r4, r1
|
||||||
exts.b r5, r5
|
exts.b r5, r5
|
||||||
|
|
||||||
/* For small inputs, simply check bytes individually */
|
/* For small inputs, simply check bytes individually */
|
||||||
mov #64, r2
|
mov #64, r2
|
||||||
cmp/hi r6, r2
|
cmp/hi r6, r2
|
||||||
bt .last
|
bt .naive
|
||||||
|
|
||||||
.large: /* Make a 4-byte version of r5 for cmp/str */
|
/* Make a 4-byte version of r5 for cmp/str */
|
||||||
extu.b r5, r3
|
extu.b r5, r3
|
||||||
swap.b r3, r2
|
swap.b r3, r2
|
||||||
or r3, r2
|
or r3, r2
|
||||||
swap.w r2, r3
|
swap.w r2, r3
|
||||||
or r3, r2
|
or r3, r2
|
||||||
|
|
||||||
|
mov.l .___cpucap, r0
|
||||||
|
mov.l @r0, r0
|
||||||
|
tst #__CPUCAP_SH4ALDSP, r0
|
||||||
|
bf .sh4aldsp
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Fast memchr() method on SH3:
|
||||||
|
** -> Align to 4 bytes with single-byte reads
|
||||||
|
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
|
||||||
|
** -> Use a somewhat tight longword-based loop with dt
|
||||||
|
*/
|
||||||
|
.sh3:
|
||||||
/* First check 3 bytes to ensure we don't skip bytes when aligning */
|
/* First check 3 bytes to ensure we don't skip bytes when aligning */
|
||||||
mov.b @r0+, r1
|
mov.b @r1+, r0
|
||||||
cmp/eq r1, r5
|
cmp/eq r0, r5
|
||||||
bt .end
|
bt .end
|
||||||
mov.b @r0+, r1
|
mov.b @r1+, r0
|
||||||
cmp/eq r1, r5
|
cmp/eq r0, r5
|
||||||
bt .end
|
bt .end
|
||||||
mov.b @r0+, r1
|
mov.b @r1+, r0
|
||||||
cmp/eq r1, r5
|
cmp/eq r0, r5
|
||||||
bt .end
|
bt .end
|
||||||
|
|
||||||
/* Align to a 4-byte boundary */
|
/* Align to a 4-byte boundary */
|
||||||
shlr2 r0
|
shlr2 r1
|
||||||
shll2 r0
|
shll2 r1
|
||||||
add r4, r6
|
add r4, r6
|
||||||
sub r0, r6
|
sub r1, r6
|
||||||
|
|
||||||
mov r6, r7
|
mov r6, r7
|
||||||
shlr2 r7
|
shlr2 r7
|
||||||
|
@ -40,29 +54,70 @@ _memchr:
|
||||||
and r3, r6
|
and r3, r6
|
||||||
|
|
||||||
/* Read longwords */
|
/* Read longwords */
|
||||||
1: mov.l @r0+, r1
|
1: mov.l @r1+, r0
|
||||||
cmp/str r1, r2
|
cmp/str r0, r2
|
||||||
bt .found
|
bt .found
|
||||||
dt r7
|
dt r7
|
||||||
bf 1b
|
bf 1b
|
||||||
|
|
||||||
.last: /* Don't read if there are no bytes left */
|
/* Finish the last bytes with a naive method */
|
||||||
|
bra .naive
|
||||||
|
nop
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Fast memchr() method on SH4AL-DSP:
|
||||||
|
** -> Align with an unaligned read
|
||||||
|
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
|
||||||
|
** -> Use an extremely tight loop with the DSP repeat function
|
||||||
|
*/
|
||||||
|
.sh4aldsp:
|
||||||
|
/* Check a couple of unaligned bytes first */
|
||||||
|
movua.l @r1+, r0
|
||||||
|
cmp/str r0, r2
|
||||||
|
bt .found
|
||||||
|
|
||||||
|
/* Align to a 4-byte boundary */
|
||||||
|
shlr2 r1
|
||||||
|
shll2 r1
|
||||||
|
add r4, r6
|
||||||
|
sub r1, r6
|
||||||
|
|
||||||
|
mov r6, r7
|
||||||
|
shlr2 r7
|
||||||
|
ldrs 2f
|
||||||
|
ldre 3f
|
||||||
|
ldrc r7
|
||||||
|
mov #3, r3
|
||||||
|
and r3, r6
|
||||||
|
|
||||||
|
/* Read longwords super efficiently */
|
||||||
|
2: mov.l @r1+, r0
|
||||||
|
cmp/str r0, r2
|
||||||
|
3: bt .found
|
||||||
|
|
||||||
|
/* Finish the last few bytes with the naive method */
|
||||||
|
|
||||||
|
.naive: /* Don't read if there are no bytes left */
|
||||||
tst r6, r6
|
tst r6, r6
|
||||||
bt .none
|
bt .none
|
||||||
|
|
||||||
2: mov.b @r0+, r1
|
4: mov.b @r1+, r0
|
||||||
cmp/eq r1, r5
|
cmp/eq r0, r5
|
||||||
bt .end
|
bt .end
|
||||||
dt r6
|
dt r6
|
||||||
bf 2b
|
bf 4b
|
||||||
|
|
||||||
.none: rts
|
.none: rts
|
||||||
mov #0, r0
|
mov #0, r0
|
||||||
|
|
||||||
.found: /* Go back to find out which of the last 4 bytes is r5 */
|
.found: /* Go back to find out which of the last 4 bytes is r5 */
|
||||||
add #-4, r0
|
add #-4, r1
|
||||||
bra 2b
|
bra 4b
|
||||||
mov #4, r6
|
mov #4, r6
|
||||||
|
|
||||||
.end: rts
|
.end: mov r1, r0
|
||||||
|
rts
|
||||||
add #-1, r0
|
add #-1, r0
|
||||||
|
|
||||||
|
.___cpucap:
|
||||||
|
.long ___cpucap
|
||||||
|
|
Loading…
Reference in a new issue