string: add SH4AL-DSP optimizations for memchr (DONE)

This commit is contained in:
Lephenixnoir 2021-05-23 15:51:09 +02:00
parent 591e453717
commit a48c163e55
No known key found for this signature in database
GPG key ID: 1BBA026E13FC0495

View file

@ -1,38 +1,52 @@
#include <bits/asm/cpucap.h>
.global _memchr .global _memchr
.type _memchr, @function .type _memchr, @function
_memchr: _memchr:
mov r4, r0 mov r4, r1
exts.b r5, r5 exts.b r5, r5
/* For small inputs, simply check bytes individually */ /* For small inputs, simply check bytes individually */
mov #64, r2 mov #64, r2
cmp/hi r6, r2 cmp/hi r6, r2
bt .last bt .naive
.large: /* Make a 4-byte version of r5 for cmp/str */ /* Make a 4-byte version of r5 for cmp/str */
extu.b r5, r3 extu.b r5, r3
swap.b r3, r2 swap.b r3, r2
or r3, r2 or r3, r2
swap.w r2, r3 swap.w r2, r3
or r3, r2 or r3, r2
mov.l .___cpucap, r0
mov.l @r0, r0
tst #__CPUCAP_SH4ALDSP, r0
bf .sh4aldsp
/*
** Fast memchr() method on SH3:
** -> Align to 4 bytes with single-byte reads
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
** -> Use a somewhat tight longword-based loop with dt
*/
.sh3:
/* First check 3 bytes to ensure we don't skip bytes when aligning */ /* First check 3 bytes to ensure we don't skip bytes when aligning */
mov.b @r0+, r1 mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
mov.b @r0+, r1 mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
mov.b @r0+, r1 mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
/* Align to a 4-byte boundary */ /* Align to a 4-byte boundary */
shlr2 r0 shlr2 r1
shll2 r0 shll2 r1
add r4, r6 add r4, r6
sub r0, r6 sub r1, r6
mov r6, r7 mov r6, r7
shlr2 r7 shlr2 r7
@ -40,29 +54,70 @@ _memchr:
and r3, r6 and r3, r6
/* Read longwords */ /* Read longwords */
1: mov.l @r0+, r1 1: mov.l @r1+, r0
cmp/str r1, r2 cmp/str r0, r2
bt .found bt .found
dt r7 dt r7
bf 1b bf 1b
.last: /* Don't read if there are no bytes left */ /* Finish the last bytes with a naive method */
bra .naive
nop
/*
** Fast memchr() method on SH4AL-DSP:
** -> Align with an unaligned read
** -> Then read 4 bytes at a time, and check for r5 with cmp/str
** -> Use an extremely tight loop with the DSP repeat function
*/
.sh4aldsp:
/* Check a couple of unaligned bytes first */
movua.l @r1+, r0
cmp/str r0, r2
bt .found
/* Align to a 4-byte boundary */
shlr2 r1
shll2 r1
add r4, r6
sub r1, r6
mov r6, r7
shlr2 r7
ldrs 2f
ldre 3f
ldrc r7
mov #3, r3
and r3, r6
/* Read longwords super efficiently */
2: mov.l @r1+, r0
cmp/str r0, r2
3: bt .found
/* Finish the last few bytes with the naive method */
.naive: /* Don't read if there are no bytes left */
tst r6, r6 tst r6, r6
bt .none bt .none
2: mov.b @r0+, r1 4: mov.b @r1+, r0
cmp/eq r1, r5 cmp/eq r0, r5
bt .end bt .end
dt r6 dt r6
bf 2b bf 4b
.none: rts .none: rts
mov #0, r0 mov #0, r0
.found: /* Go back to find out which of the last 4 bytes is r5 */ .found: /* Go back to find out which of the last 4 bytes is r5 */
add #-4, r0 add #-4, r1
bra 2b bra 4b
mov #4, r6 mov #4, r6
.end: rts .end: mov r1, r0
rts
add #-1, r0 add #-1, r0
.___cpucap:
.long ___cpucap