string: use gint's optimized memcpy (DONE)

2025-06-06 07:55:10 +02:00 · 2021-05-23 16:22:25 +02:00 · 2021-05-23 16:22:25 +02:00 · a354e38ccf
commit a354e38ccf
parent b69e0fd299
4 changed files with 135 additions and 8 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -172,6 +172,7 @@ if(sh-generic IN_LIST TARGET_FOLDERS)
    src/libc/setjmp/target/sh-generic/setjmp.S
    src/libc/setjmp/target/sh-generic/longjmp.S
    src/libc/string/target/sh-generic/memchr.S
    src/libc/string/target/sh-generic/memcpy.S
    src/libc/string/target/sh-generic/memset.S
    src/libc/string/target/sh-generic/strlen.S
    src/target/sh-generic/cpucap.c)
--- a/src/libc/string/memcpy.c
+++ b/src/libc/string/memcpy.c
@ -1,17 +1,13 @@
 #include <string.h>
 #include <stdint.h>
-/*
+#ifndef __SUPPORT_ARCH_SH
-** The memcpy() function copies n bytes from memory area src to memory area dest.
+
 ** The memory areas must not overlap.  Use memmove(3) if the memory areas do
 ** overlap.
 **
 ** TODO: use DMA ?
 ** TODO: use DSP ?
 */
 /*
 ** Portable fallback: copy `count` bytes from `src` to `dest` one byte at a
 ** time, in ascending address order, and return `dest`. A `count` of zero
 ** copies nothing. The two areas must not overlap.
 */
 void *memcpy(void *dest, const void *src, size_t count)
 {
 	const uint8_t *in = src;
 	uint8_t *out = dest;
 	/* Walk both buffers with cursors; `dest` stays intact for the return */
 	while (count-- > 0)
 		*out++ = *in++;
 	return (dest);
 }
 #endif /*__SUPPORT_ARCH_SH*/
--- a/src/libc/string/target/sh-generic/memchr.S
+++ b/src/libc/string/target/sh-generic/memchr.S
@ -119,5 +119,7 @@ _memchr:
 	rts
 	add	#-1, r0
 .align 4
 .___cpucap:
 	.long	___cpucap
--- a/src/libc/string/target/sh-generic/memcpy.S
+++ b/src/libc/string/target/sh-generic/memcpy.S
@ -0,0 +1,128 @@
 #include <bits/asm/cpucap.h>
 .global _memcpy
 .text
 /*
 ** _memcpy - SuperH implementation of memcpy().
 **
 ** Register roles as used by the code below:
 **   r4  destination cursor (advanced as bytes are stored)
 **   r5  source cursor (advanced by post-increment loads)
 **   r6  number of bytes still to copy
 **   r3  copy of the original destination, returned in r0
 **   r2  scratch constant: alignment mask (3), then loop thresholds (4 / 36)
 **   r0, r1, r7  data scratch
 ** Modifies r0-r7 and the T bit; no stack is used.
 **
 ** Strategy: copies of at most 64 bytes go straight to the byte loop. Larger
 ** copies first advance the destination to a 4-byte boundary, then pick a
 ** loop according to the source alignment (mov.l, movua.l, mov.w or bytes).
 ** Every fast loop leaves at least one byte for the final byte loop.
 **
 ** Note: bf/s, bt/s, bra and rts are delayed branches; the instruction that
 ** follows each of them executes before the branch takes effect.
 */
 _memcpy:
 	/* Nothing to copy: return the destination unchanged */
 	tst	r6, r6
 	bt	.zero
 	/* Save the destination for the return value; r2 = 4-byte alignment mask */
 	mov	r4, r3
 	mov	#3, r2
 	/* When copying at most 64 bytes, use the naive method.
 	   cmp/ge is a signed test of (64 >= r6), so counts with the top bit set
 	   also take the naive path. */
 	mov	#64, r0
 	cmp/ge	r6, r0
 	bt	_naive_memcpy
 _memcpy_align_dst:
 	/* 4-align the destination. The alignment test comes after the store, so
 	   this copies 1 to 4 bytes (4 when the destination was already aligned).
 	   The branch uses the T bit from tst; dt r6 sits in the delay slot and
 	   therefore counts every byte copied, including the last one. */
 	mov.b	@r5+, r0
 	mov.b	r0, @r4
 	add	#1, r4
 	tst	r2, r4
 	bf/s	_memcpy_align_dst
 	dt	r6
 	/* If source is 4-aligned, use mov.l. The delay slot sets r2 = 4 on both
 	   outcomes; it is the loop threshold for .unaligned4 and .aligned2. */
 	tst	r2, r5
 	bt/s	.aligned4_32
 	mov	#4, r2
 	/* If unaligned but SH4, use movua.l: the branch is taken when the
 	   __CPUCAP_SH4ALDSP bit is set in the word stored at ___cpucap */
 	mov.l	.___cpucap, r0
 	mov.l	@r0, r0
 	tst	#__CPUCAP_SH4ALDSP, r0
 	bf	.unaligned4
 	/* If source is 2-aligned, use mov.w */
 	mov	r5, r0
 	tst	#1, r0
 	bt	.aligned2
 	/* Otherwise use a naive copy */
 	bra	_naive_memcpy
 	nop
 .aligned4_32:
 	/* Loop threshold; reloaded on every iteration because the label is the
 	   loop target */
 	mov	#36, r2
 	/* Copy 32 bytes at a time until at most 36 bytes are left. An iteration
 	   is only repeated while r6 > 36, so at least 5 bytes remain on exit:
 	   enough for one pass of .aligned4_4 plus one byte for the naive tail.
 	   Loads and stores are interleaved in groups of three registers. */
 	mov.l	@r5+, r0
 	mov.l	@r5+, r1
 	mov.l	@r5+, r7
 	mov.l	r0, @r4
 	mov.l	r1, @(4,r4)
 	mov.l	r7, @(8,r4)
 	mov.l	@r5+, r0
 	mov.l	@r5+, r1
 	mov.l	@r5+, r7
 	mov.l	r0, @(12,r4)
 	mov.l	r1, @(16,r4)
 	mov.l	r7, @(20,r4)
 	mov.l	@r5+, r0
 	mov.l	@r5+, r1
 	add	#-32, r6
 	mov.l	r0, @(24,r4)
 	mov.l	r1, @(28,r4)
 	/* T = (36 >= r6); loop again while more than 36 bytes remain. The
 	   destination is advanced in the delay slot. */
 	cmp/ge	r6, r2
 	bf/s	.aligned4_32
 	add	#32, r4
 .aligned4_4:
 	mov	#4, r2
 	/* Copy 4 bytes at a time until at most 4 bytes are left (1 to 4 bytes
 	   remain on exit) */
 	mov.l	@r5+, r0
 	mov.l	r0, @r4
 	add	#-4, r6
 	cmp/ge	r6, r2
 	bf/s	.aligned4_4
 	add	#4, r4
 	/* Finish the remaining bytes one at a time */
 	bra	_naive_memcpy
 	nop
 .unaligned4:
 	/* Copy 4 bytes but read with movua.l since source is unaligned.
 	   r2 = 4 here, so the loop stops once at most 4 bytes are left. */
 	movua.l	@r5+, r0
 	mov.l	r0, @r4
 	add	#-4, r6
 	cmp/ge	r6, r2
 	bf/s	.unaligned4
 	add	#4, r4
 	/* Finish the remaining bytes one at a time */
 	bra	_naive_memcpy
 	nop
 .aligned2:
 	/* Source is 2-aligned: read two 16-bit halves and store them at offsets
 	   0 and 2 of the 4-aligned destination. r2 = 4 here, so the loop stops
 	   once at most 4 bytes are left. */
 	mov.w	@r5+, r0
 	mov.w	r0, @r4
 	mov.w	@r5+, r0
 	mov.w	r0, @(2,r4)
 	add	#-4, r6
 	cmp/ge	r6, r2
 	bf/s	.aligned2
 	add	#4, r4
 	/* Finish the remaining bytes one at a time */
 	bra	_naive_memcpy
 	nop
 _naive_memcpy:
 	/* Byte loop. The count is tested after the copy, so r6 must be at least
 	   1 on entry; every path above guarantees this. dt sets T when r6
 	   reaches zero, and the destination is advanced in the delay slot. */
 	mov.b	@r5+, r0
 	dt	r6
 	mov.b	r0, @r4
 	bf/s	_naive_memcpy
 	add	#1, r4
 	/* Return the original destination (loaded in the rts delay slot) */
 	rts
 	mov	r3, r0
 .zero:
 	/* Zero-length copy: r4 has not been modified, return it as is */
 	rts
 	mov	r4, r0
 /* Literal pool: address of ___cpucap, loaded PC-relative by the mov.l above */
 .align 4
 .___cpucap:
 	.long ___cpucap