string: optimized memcpy, memcmp, memset; decent memmove
This change adds optimized versions of the core memory functions, relying on 4-alignment, 2-alignment, and the SH4's unaligned move instruction to (hopefully) attain good performance in all situations.
commit 9d1187b5b4 (parent 7d63a1b536)
7 changed files with 332 additions and 120 deletions
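
All four routines follow the same plan, summarized in the commit message above. As a reading aid, here is a minimal C model of the dispatch logic (illustrative only, not gint code: memcpy_model() and is_sh4() are hypothetical names, is_sh4() standing in for the runtime MPU check the assembly performs by testing a bit of the _gint word, and the movua.l path is modeled bytewise because C has no direct equivalent).

	#include <stddef.h>
	#include <stdint.h>

	/* Hypothetical stand-in for the assembly's runtime SH4 check */
	static int is_sh4(void) { return 1; }

	void *memcpy_model(void *dst, void const *src, size_t n)
	{
		uint8_t *d = dst;
		uint8_t const *s = src;

		/* Small areas: a byte loop beats the setup cost */
		if(n < 64) goto tail;

		/* 4-align the destination first */
		while((uintptr_t)d & 3) *d++ = *s++, n--;

		if(!((uintptr_t)s & 3))
		{
			/* Source also 4-aligned: longword copies (mov.l) */
			for(; n >= 4; n -= 4, d += 4, s += 4)
				*(uint32_t *)d = *(uint32_t const *)s;
		}
		else if(is_sh4())
		{
			/* SH4: unaligned longword reads (movua.l) */
			for(; n >= 4; n -= 4, d += 4, s += 4)
				{ d[0]=s[0]; d[1]=s[1]; d[2]=s[2]; d[3]=s[3]; }
		}
		else if(!((uintptr_t)s & 1))
		{
			/* 2-aligned source: pairs of word copies (mov.w) */
			for(; n >= 4; n -= 4, d += 4, s += 4)
			{
				*(uint16_t *)d = *(uint16_t const *)s;
				*(uint16_t *)(d + 2) = *(uint16_t const *)(s + 2);
			}
		}

	tail:
		/* Whatever alignment left us with, finish bytewise */
		while(n--) *d++ = *s++;
		return dst;
	}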
TODO (3 lines changed)
@@ -1,11 +1,8 @@
For the 2.1.0 release:
* core: the four basic memory functions (with automated tests)
* bopti: remove the deprecated image_t definition
* project: remove the compat branch
* core: remove the boot log

Issues:
* #8 support fx-CG Manager
* #10 support fx-CG 20

Extensions on existing code:
@@ -13,6 +13,12 @@ void *memcpy(void * restrict dest, void const * restrict src, size_t n);

/* memset(): Fill a chunk of memory with a single byte */
void *memset(void *dest, int byte, size_t n);

/* memcmp(): Compare two chunks of memory */
int memcmp(void const *s1, void const *s2, size_t n);

/* memmove(): Copy a chunk of memory to a possibly overlapping destination */
void *memmove(void *dest, void const *src, size_t n);

/* strlen(): Length of a NUL-terminated string */
size_t strlen(char const *str);
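Example usage of the one declaration whose contract differs from the others (not from the commit): memmove() is the only routine of the four that is defined when the two regions overlap; memcpy()'s restrict-qualified prototype above rules that out.

	#include <string.h>

	void shift_example(void)
	{
		char buf[8] = "abcdef";

		/* Source and destination overlap, so memmove() is required;
		   here it copies backwards and buf becomes "aabcdef" */
		memmove(buf + 1, buf, 6);
	}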
src/std/memcmp.s (new file, 114 lines)
@@ -0,0 +1,114 @@
.global _memcmp
.text

_memcmp:
	tst	r6, r6
	bt	.zero

	/* When comparing 64 bytes or less, use the naive method */
	mov	#64, r0
	cmp/ge	r6, r0
	bt	_naive_memcmp

	mov	#4, r2
	mov	#3, r3

_memcmp_align_rhs:
	/* 4-align the right-hand side */
	mov.b	@r4+, r0
	mov.b	@r5+, r1
	cmp/eq	r0, r1
	bf/s	.end
	dt	r6
	tst	r3, r5
	bf	_memcmp_align_rhs

	/* If left-hand side is 4-aligned, use mov.l */
	tst	r3, r4
	bt	.aligned4

	/* If unaligned but SH4, use movua.l */
	mov.l	.gint, r0
	mov.l	@r0, r0
	tst	#1, r0
	bt	.unaligned4

	/* If left-hand side is 2-aligned, use mov.w and mov.l */
	mov	r4, r0
	tst	#1, r0
	bt	.aligned2

	/* Otherwise use a naive comparison */
	bra	_naive_memcmp
	nop

.aligned4:
	/* Compare 4 bytes at a time until at most 4 bytes are left */
	mov.l	@r4+, r0
	mov.l	@r5+, r1
	cmp/eq	r0, r1
	bf/s	_fail
	add	#-4, r6
	cmp/ge	r6, r2
	bf	.aligned4

	bra	_naive_memcmp
	nop

.unaligned4:
	/* Compare 4 bytes at a time until at most 4 bytes are left. Since
	   the left-hand side is unaligned, read it with movua.l */
	movua.l	@r4+, r0
	mov.l	@r5+, r1
	cmp/eq	r0, r1
	bf/s	_fail
	add	#-4, r6
	cmp/ge	r6, r2
	bf	.unaligned4

	bra	_naive_memcmp
	nop

.aligned2:
	/* Read 4 bytes from r4 in two steps; use r7 as scratch so the loop
	   bound in r2 is preserved, and zero-extend the second word because
	   mov.w sign-extends */
	mov.w	@r4+, r0
	mov.l	@r5+, r1
	mov.w	@r4+, r7
	shll16	r0
	extu.w	r7, r7
	or	r7, r0
	cmp/eq	r0, r1
	bf/s	_fail
	add	#-4, r6
	cmp/ge	r6, r2
	bf	.aligned2

	bra	_naive_memcmp
	nop

_fail:
	/* Rewind 4 bytes to compare manually */
	add	#-4, r4
	add	#-4, r5
	add	#4, r6

_naive_memcmp:
	mov.b	@r4+, r0
	mov.b	@r5+, r1
	cmp/eq	r0, r1
	bf/s	.end
	dt	r6
	bf	_naive_memcmp

.end:
	extu.b	r0, r0
	extu.b	r1, r1
	rts
	sub	r1, r0

.zero:
	rts
	mov	#0, r0

.align 4
.gint:
	.long	_gint
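The _fail path is the subtle part of this file: the wide compares only detect that a difference exists somewhere in the current 4 bytes, then rewind so the naive loop can locate the first differing byte and produce a correctly signed result. A C model of that structure (illustrative only; memcmp_model() is a hypothetical name, and the model breaks before advancing where the assembly instead advances and rewinds):

	#include <stddef.h>
	#include <stdint.h>

	int memcmp_model(void const *s1, void const *s2, size_t n)
	{
		uint8_t const *p1 = s1, *p2 = s2;

		/* Fast path: longword compares only answer "equal or not" */
		while(n > 4 && !(((uintptr_t)p1 | (uintptr_t)p2) & 3))
		{
			if(*(uint32_t const *)p1 != *(uint32_t const *)p2)
				break;
			p1 += 4, p2 += 4, n -= 4;
		}

		/* Naive loop: finds the first differing byte for the sign */
		while(n--)
		{
			if(*p1 != *p2) return *p1 - *p2;
			p1++, p2++;
		}
		return 0;
	}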
src/std/memcpy.s (new file, 98 lines)
@@ -0,0 +1,98 @@
.global _memcpy
.text

_memcpy:
	tst	r6, r6
	bt	.zero

	mov	r4, r3
	mov	#3, r2

	/* When copying 64 bytes or less, use the naive method */
	mov	#64, r0
	cmp/ge	r6, r0
	bt	_naive_memcpy

_memcpy_align_dst:
	/* 4-align the destination */
	mov.b	@r5+, r0
	mov.b	r0, @r4
	add	#1, r4
	tst	r2, r4
	bf/s	_memcpy_align_dst
	dt	r6

	/* If source is 4-aligned, use mov.l */
	tst	r2, r5
	bt/s	.aligned4
	mov	#4, r2

	/* If unaligned but SH4, use movua.l */
	mov.l	.gint, r0
	mov.l	@r0, r0
	tst	#1, r0
	bt	.unaligned4

	/* If source is 2-aligned, use mov.w */
	mov	r5, r0
	tst	#1, r0
	bt	.aligned2

	/* Otherwise use a naive copy */
	bra	_naive_memcpy
	nop

.aligned4:
	/* Copy 4 bytes at a time until at most 4 bytes are left */
	mov.l	@r5+, r0
	mov.l	r0, @r4
	add	#-4, r6
	cmp/ge	r6, r2
	bf/s	.aligned4
	add	#4, r4

	bra	_naive_memcpy
	nop

.unaligned4:
	/* Copy 4 bytes but read with movua.l since the source is unaligned */
	movua.l	@r5+, r0
	mov.l	r0, @r4
	add	#-4, r6
	cmp/ge	r6, r2
	bf/s	.unaligned4
	add	#4, r4

	bra	_naive_memcpy
	nop

.aligned2:
	mov.w	@r5+, r0
	mov.w	r0, @r4
	mov.w	@r5+, r0
	mov.w	r0, @(2,r4)
	add	#-4, r6
	cmp/ge	r6, r2
	bf/s	.aligned2
	add	#4, r4

	bra	_naive_memcpy
	nop

_naive_memcpy:
	mov.b	@r5+, r0
	dt	r6
	mov.b	r0, @r4
	bf/s	_naive_memcpy
	add	#1, r4

	rts
	mov	r3, r0

.zero:
	rts
	mov	r4, r0

.align 4
.gint:
	.long	_gint
src/std/memmove.s (new file, 60 lines)
@@ -0,0 +1,60 @@
.global _memmove
.text

_memmove:
	tst	r6, r6
	bt	.zero

	/* Simple optimization: if regions do not overlap, use memcpy() */
	mov	r4, r0
	add	r6, r0
	cmp/ge	r0, r5
	bt	_memmove_memcpy
	mov	r5, r0
	add	r6, r0
	cmp/ge	r0, r4
	bt	_memmove_memcpy

	mov	r4, r3

	cmp/ge	r4, r5
	bf	.backwards

.forwards:
	/* If the destination starts before the source, copy forwards */
	mov.b	@r5+, r0
	mov.b	r0, @r4
	dt	r6
	bf/s	.forwards
	add	#1, r4

	rts
	mov	r3, r0

.backwards:
	/* Otherwise, copy backwards */
	add	r6, r4
	add	r6, r5

.backwards_loop:
	add	#-1, r5
	mov.b	@r5, r0
	dt	r6
	bf/s	.backwards_loop
	mov.b	r0, @-r4

	rts
	mov	r3, r0

_memmove_memcpy:
	mov.l	.memcpy, r1
	jmp	@r1
	nop

.zero:
	rts
	mov	r4, r0

.align 4
.memcpy:
	.long	_memcpy
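In C terms, this file reduces to one comparison chain: hand disjoint regions to memcpy(), otherwise pick the copy direction from the relative order of the pointers. A model (illustrative only; memmove_model() is a hypothetical name, and note the assembly uses signed cmp/ge for these pointer comparisons):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	void *memmove_model(void *dst, void const *src, size_t n)
	{
		uint8_t *d = dst;
		uint8_t const *s = src;

		/* Disjoint regions: memcpy()'s faster paths are safe */
		if(s >= d + n || d >= s + n)
			return memcpy(dst, src, n);

		if(d <= s)
		{
			/* Destination starts first: copy forwards */
			while(n--) *d++ = *s++;
		}
		else
		{
			/* Destination starts after the source: copy backwards */
			d += n, s += n;
			while(n--) *--d = *--s;
		}
		return dst;
	}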
src/std/memory.c (deleted, 117 lines)
@@ -1,117 +0,0 @@
#include <gint/defs/attributes.h>
#include <gint/hardware.h>
#include <stddef.h>
#include <stdint.h>

static void memcpy4(uint32_t * restrict d, const void * restrict src, size_t n)
{
	int modulo = (uintptr_t)src & 3;

	/* Best case: perform 32-bit accesses only */
	if(!modulo)
	{
		const uint32_t *s = src;
		for(; n; n -= 4) *d++ = *s++;
	}

	#if 0
	/* Here's where SH-3 and SH-4A start working differently. SH-4A has a
	   2-cycle 'movua' instruction to perform unaligned reads */
	else if(isSH4())
	{
		uint32_t longword;
		const uint32_t *s = src;

		while(n--)
		{
			__asm__(
				"movua.l %1, %0"
				: "=z"(longword)
				: "m>"(*s)
			);
			s++;
			*d++ = longword;
		}
	}
	#endif

	/* On SH-3, we can only hope that there is 2-alignment */
	else if(!(modulo & 1))
	{
		const uint16_t *s = src;
		uint16_t * restrict dst = (void *)d;

		for(; n; n -= 2)
		{
			*dst++ = *s++;
			*dst++ = *s++;
		}
	}

	/* Or just perform the raw copy */
	else
	{
		const uint8_t *s = src;
		uint8_t * restrict dst = (void *)d;

		while(n--) *dst++ = *s++;
	}
}

void *memcpy(void * restrict dst, const void * restrict src, size_t n)
{
	uint8_t *d = dst;
	const uint8_t *s = src;

	/* Small areas: don't bother with complex methods */
	if(n < 32)
	{
		while(n--) *d++ = *s++;
		return dst;
	}

	/* Find a longword offset to perform word or longword operations */
	while((uintptr_t)d & 3) *d++ = *s++, n--;

	/* Perform the big, efficient copy */
	memcpy4((void *)d, s, n & ~3);

	size_t m = n & 3;
	d += (n - m);
	s += (n - m);
	n = m;

	/* Copy around the last bytes */
	while(n--) *d++ = *s++;
	return dst;
}

void *_memmove(GUNUSED void *dst, GUNUSED const void *src, GUNUSED size_t n)
{
	// (same as memcpy, but heed for direction if areas overlap)

	// copy by increasing addresses if dst < src
	// copy by decreasing addresses if dst > src
	return dst;
}

int memcmp(GUNUSED const void *s1, GUNUSED const void *s2, GUNUSED size_t n)
{
	uint8_t const *p1 = s1;
	uint8_t const *p2 = s2;

	for(size_t i = 0; i < n; i++)
	{
		if(p1[i] != p2[i]) return (p1[i] - p2[i]);
	}

	return 0;
}

void *memset(void *s, int byte, size_t n)
{
	/* TODO: Do it efficiently */
	char *dst = s;
	while(n--) *dst++ = byte;
	return s;
}
src/std/memset.s (new file, 54 lines)
@@ -0,0 +1,54 @@
.global _memset
.text

_memset:
	tst	r6, r6
	bt	.zero

	/* We'll fill from the end */
	mov	r4, r3
	add	r6, r4

	/* When setting 64 bytes or less, use the naive method */
	mov	#64, r0
	cmp/ge	r6, r0
	bt	_naive_memset

	mov	#3, r2

	/* Make a 4-byte filler in r0, keeping the fill byte intact in r5
	   for the trailing naive loop; zero-extend it first since memset()
	   takes the byte as an int */
	extu.b	r5, r5
	mov	r5, r0
	shll8	r0
	or	r5, r0
	mov	r0, r1
	shll16	r1
	or	r1, r0

_memset_align:
	/* 4-align the destination */
	mov.b	r0, @-r4
	tst	r2, r4
	bf/s	_memset_align
	dt	r6

	mov	#8, r2

.aligned4:
	mov.l	r0, @-r4
	cmp/ge	r6, r2
	bf/s	.aligned4
	add	#-4, r6

_naive_memset:
	/* Tight loop that sets one byte per iteration */
	dt	r6
	bf/s	_naive_memset
	mov.b	r5, @-r4

.end:
	rts
	mov	r3, r0

.zero:
	rts
	mov	r4, r0
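The filler construction is the classic byte-smearing idiom, easier to read in C (model only; fill_word() is a hypothetical name, matching the shll8/shll16/or sequence above):

	#include <stdint.h>

	/* Replicate one byte into all four lanes of a longword:
	   0x000000AB -> 0x0000ABAB -> 0xABABABAB */
	static uint32_t fill_word(int byte)
	{
		uint32_t x = (uint8_t)byte;	/* memset() uses the value as unsigned char */
		x |= x << 8;
		x |= x << 16;
		return x;
	}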