diff --git a/CMakeLists.txt b/CMakeLists.txt index 1db42ea..301be41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,7 @@ set(SOURCES src/render-cg/dsubimage.c src/render-cg/dupdate.c src/render-cg/dvram.c + src/render-cg/dvram.S src/render-cg/gint_dline.c src/render-cg/topti-asm.S src/render-cg/topti.c diff --git a/fxcp_hh2.ld.c b/fxcp_hh2.ld.c index 25a9edd..9e9c2ba 100644 --- a/fxcp_hh2.ld.c +++ b/fxcp_hh2.ld.c @@ -36,7 +36,7 @@ SECTIONS - All text from .text and .text.* (including user code) */ .text : { *(.text.header) - *(.hh2info) + KEEP(*(.hh2info)) *(.text.entry) _bctors = . ; diff --git a/include/gint/display-cg.h b/include/gint/display-cg.h index 48f4412..05572e0 100644 --- a/include/gint/display-cg.h +++ b/include/gint/display-cg.h @@ -109,6 +109,35 @@ void dsetvram(uint16_t *main, uint16_t *secondary); Returns the VRAM buffer addresses used to render on fx-CG 50. */ void dgetvram(uint16_t **main, uint16_t **secondary); + +//--- +// VRAM backup +// On the fx-CP gint backs up the VRAM when loading and restores it when +// leaving. While this is a transparent mechanism, the following parts of the +// implementation are exposed at the internal API level. +//--- + +/* Encode the VRAM contents between [in_start] and [in_end] at [output]. While + [*in_end] is excluded from the encoding, it will be modified temporarily to + serve as bounds check, so it must not be accessed asynchronously during the + encoding. The size of the output is not known but is at most the distance + between in_end and in_start in bytes. Setting output = in_start to compress + in-place is supported. Returns the end pointer after encoding. */ +uint8_t *gint_vrambackup_encode( + uint8_t *output, uint16_t *in_start, uint16_t *in_end); + +/* Predefined palette based on the GUI at the Hollyhock loading screen. Contains + 109 entries plus a 110th dummy entry used internally as a bounds check. 
*/ +extern uint16_t gint_vrambackup_palette[110]; + +/* Get the pointer to the encoded VRAM backup created at load time. If [size] + is not NULL, sets the size in [*size]. The pointer is heap allocated and + remains owned by gint. */ +void *gint_vrambackup_get(int *size); + +/* Decode the load-time VRAM backup back to VRAM. */ +void gint_vrambackup_show(void); + #ifdef __cplusplus } #endif diff --git a/src/render-cg/dvram.S b/src/render-cg/dvram.S new file mode 100644 index 0000000..2d16830 --- /dev/null +++ b/src/render-cg/dvram.S @@ -0,0 +1,297 @@ +#include +#if GINT_OS_CP + +/* +Compressed VRAM backup for the fx-CP, with RLE encoding and palette indexing. + +I went a bit overboard with this--I think it was just fun to optimize the heck +out of it. Essentially when we start an fx-CP add-in we save the VRAM and +restore it when leaving, which is what the loader expects us to do. +Normally we'd use the OS's LCD_VRAMBackup() and LCD_VRAMRestore() functions +which just memcpy() to another VRAM-sized buffer. + +However, we want to use that other buffer, which is at a fixed address, to load +code. So we have to back up elsewhere. The standard idea, which CPDoom does, is +just to allocate a third VRAM-sized buffer in the heap. But that's 330 kB, and +the copy still takes some 30 ms due to the sheer amount of data. I wanted to +optimize that. + +This file implements a compressed encoding of the VRAM data. It's slightly +lossy but the difference isn't noticeable unless you know what to look for or +flip between the original and compressed frames. Conceptually, the encoding has +three phases: + +1. Reduce the number of colors by or-ing all pixels with a mask (namely 0x0821) +2. Index the entire picture through a palette +3. 
RLE-encode the indexed data + +Step 2 uses a predefined palette that I extracted from the HollyHock loading +GUI and sorted by decreasing frequency (to optimize linear search, which, yes, +appears to be faster than binary search in this case, due to the imbalance I +guess). The HHK loading screen never really changes but if a color appears +that's not in the palette it will use the last slot as a safety default. The +total size of the palette is 109 colors + the last slot. + +Step 3 uses a simple RLE encoding. For most runs, the encoding is 2 bytes: the +color index in the palette (< 110), then the length of the run (≤ 255). For +length-1 runs, the encoding is the color index in the palette + 110, which +takes a single byte. + +This results in typical frames of 20-30 kB and a save time of ~22 ms, which is +a 10-15x space improvement and ~25% time improvement over memcpy (29.5 ms) and +even slightly more over LCD_VRAMBackup() (35.5 ms for some reason). + +For the sake of future readability, below is the encoding function that, up to +typical assembly optimizations, the implementation follows. 
+ +static int cvt_encode(u16 const *colors, u16 mask, u8 *output, int count) +{ + u16 *backup = CVT_SOURCE; + u16 *next = backup + (DWIDTH * DHEIGHT); + u8 *output_init = output; + + u16 next_backup = next[0]; + next[0] = ~next[-1]; + + while(backup < next) { + u16 run_color = *backup++ | mask; + + int index = 0; + while(index < 0xff && colors[index] != run_color) + index++; + + if((*backup | mask) != run_color) { + *output++ = count + index; + continue; + } + + int run_length = 1; + do run_length++, backup++; + while((*backup | mask) == run_color); + + while(run_length > 0xff) { + *output++ = index; + *output++ = 0xff; + run_length -= 0xff; + } + + *output++ = index; + *output++ = run_length; + } + + next[0] = next_backup; + return output - output_init; +} +*/ + +.global _gint_vrambackup_encode +.global _gint_vrambackup_palette +.balign 4 + +#define _index r2 /* Index of current run's color in palette */ +#define _mask r3 /* Color reduction mask = 0x0821 */ +#define _output r4 /* Output buffer (advances every write) */ +#define _input r5 /* Input pointer (advances every read) */ +#define _input_end r6 /* End of input pointer */ +#define _palette r7 /* Color palette */ +#define _palette_end r8 /* End of palette (_palette + 2 * 109) */ + +#define _next_pixel r9 /* Color of second pixel of each run */ +#define _run_length r9 /* Length of any given run (shares r9) */ + +/* u8 *gint_vrambackup_encode(u8 *output, u16 *in_start, u16 *in_end) */ +_gint_vrambackup_encode: + mov.l r8, @-r15 + add #-2, _input_end + + sts.l pr, @-r15 + mov #0x08, _mask + + # _input_end now points to the last pixel. Overwrite _input_end[1], the + # past-the-end slot, with a value that differs from the last pixel + # (~_input_end[0] in the C version above) so that the last run ends at + # the right time without bounds checks in the run-length inner loop. 
+ mov.w @_input_end, r1 + shll8 _mask + + mov.w @(2, _input_end), r0 + add #0x21, _mask + + mov.l r0, @-r15 + not r1, r0 /* r0 = ~last pixel (sentinel) */ + + mov.l .palette, _palette + mov _palette, _palette_end + + mov.w r0, @(2, _input_end) + add #109, _palette_end + + mov.l r9, @-r15 + add #109, _palette_end + + mov.l r10, @-r15 + nop + +.loop_run: + # Determine current run color (r0) and its index in the palette + # (_index). In order to get a faster lookup we also write the searched + # value in the past-the-end slot, this way we don't have to check + # bounds in the palette-search inner loop. + mov.w @_input+, r0 + mov _palette, _index + + mov.w @_input, _next_pixel + nop + + # Precharge the palette-search loop + mov.w @_index+, r1 + or _mask, r0 + + mov.w r0, @_palette_end + or _mask, _next_pixel + + #=== Palette search ===# + +.ps: cmp/eq r1, r0 + mov.w @_index+, r1 + + bf .ps +1: nop + + # Compute index from pointer difference (loads run 2 entries ahead) + # _index = (_index - _palette - 4) / 2 + sub _palette, _index + nop + + add #-4, _index + nop + + #=== Length-1-run fast path ===# + + # Avoid the run-length loop if the run is of length 1. We have a + # special encoding for that anyway. + cmp/eq _next_pixel, r0 + nop + + bf.s .length_1_run + shlr _index + +.length_n_run: + # Compute the run length. Here we use the _run_length register to save + # up the value of _input and we'll compute the difference after the + # loop. 
+ mov.w @_input+, r1 /* LS-based increment */ + mov _input, _run_length + + mov.w @_input+, r1 + nop + + # (bubble) + + #=== Run length computation ===# + +.rl: or _mask, r1 + nop + + cmp/eq r1, r0 + mov.w @_input+, r1 + + bt.s .rl +1: nop + + #=== Run generation ===# + + mov _input, r1 + sub _run_length, r1 + + mov r1, _run_length + shlr _run_length + + add #-4, _input + nop + + mov #-1, r1 + shll8 r1 + + # While _run_length > 0xff, generate 0xff-length runs + tst r1, _run_length + mov #-1, r0 + + bt .sre + extu.b r0, r0 + +.sr: # (bubble after jump) + + mov.b _index, @_output + sub r0, _run_length + + mov.b r0, @(1, _output) + tst r1, _run_length + + bf.s .sr + add #2, _output + + # Generate the last, short run +.sre: + mov.b _index, @_output + mov _run_length, r0 + + cmp/hi _input_end, _input /* _input_end = last pixel */ + mov.b r0, @(1, _output) + + bf.s .loop_run + add #2, _output + +.length_1_run: /* NOTE: also reached by fall-through after a final long run */ + # Encode byte as palette size + color index + 1. + add #110, _index + mov.b _index, @_output + + cmp/hi _input_end, _input /* loop while _input <= last pixel */ + nop + + bf.s .loop_run + add #1, _output + +.end: + # Restore _input_end[1] and leave + mov.l @r15+, r10 + nop + + mov.l @r15+, r9 + nop + + mov.l @r15+, r0 + nop + + mov.w r0, @(2, _input_end) + nop + + lds.l @r15+, pr + mov _output, r0 + + rts + mov.l @r15+, r8 + +.palette: + .long _gint_vrambackup_palette + +/* Palette of colors used by the launch GUI, sorted by decreasing frequency */ +_gint_vrambackup_palette: + .word 0xffff, 0x5aeb, 0xffbf, 0xef3d, 0x8c31, 0xef7d, 0xdefb, 0xff7f + .word 0x7baf, 0xbdb7, 0x9cb3, 0x4aab, 0xcebb, 0xbdf7, 0xad75, 0xdebb + .word 0xce79, 0x9db9, 0xce39, 0x8cf5, 0x9cf5, 0xefbd, 0x6eff, 0x0821 + .word 0x7c31, 0xad35, 0x7bf1, 0xeefd, 0x7bef, 0x9cf3, 0x8bf1, 0xdf3b + .word 0xbe39, 0x8c2f, 0x6b6d, 0xadf9, 0xadb9, 0x9d77, 0x9d37, 0x5e7d + .word 0x6baf, 0x6ebf, 0x9d35, 0x6b6f, 0x8c71, 0x7c73, 0x8cb1, 0x9c73 + .word 0x5b2d, 0x4aeb, 0x5dfb, 0x8cb3, 0xefbf, 0xff7d, 0xdefd, 0xce7b + .word 0x5dfd, 0xbe3b, 0x7c71, 0x5aed, 0x8bef, 0x4dbb, 0x9efd, 
0x4cf9 + .word 0x4c75, 0x6ebd, 0x4d39, 0xbe37, 0xacf5, 0xde7b, 0x5d37, 0x5e3d + .word 0x5ebf, 0x5d79, 0x4d79, 0x6f3f, 0x6c73, 0x6e7d, 0x6e3d, 0x8dfb + .word 0x7dbb, 0x6c33, 0x9d33, 0xcdf9, 0x4dfb, 0x6cb5, 0x6c75, 0x5cf7 + .word 0x4d7b, 0x4cb7, 0x9eff, 0x5e7f, 0xaebd, 0xdf7f, 0xbefd, 0x6c31 + .word 0x4c35, 0x4dfd, 0x5d39, 0x5ebd, 0x9c71, 0x7e3d, 0x7e3b, 0xcf3f + .word 0x6e3b, 0xcf3d, 0x7dfb, 0x5c33, 0x8df9 + .word 0x0000 /* Extra slot: the encoder writes each searched color here */ + +#endif /* GINT_OS_CP */ diff --git a/src/render-cg/dvram.c b/src/render-cg/dvram.c index c4307ff..aac777a 100644 --- a/src/render-cg/dvram.c +++ b/src/render-cg/dvram.c @@ -3,32 +3,88 @@ #include #include #include +#include +#include #if GINT_RENDER_RGB #if GINT_OS_CP extern void *__GetVRAMAddress(void); -extern void __VRAMBackup(void); -extern void __VRAMRestore(void); uint16_t *gint_vram = NULL; +static uint8_t *gint_vrambackup = NULL; +static int gint_vrambackup_size = -1; + bool dvram_init(void) { - __VRAMBackup(); + /* Back the VRAM up, but not to the normal backup area--we use that + to load code. Instead, save over VRAM itself then copy to heap. 
*/ + void *VRAM = (void *)0x8c000000; + void *VRAM_END = (void *)0x8c052800; + void *SCRATCH = VRAM; + + // prof_enter(*p1); + void *SCRATCH_END = gint_vrambackup_encode(SCRATCH, VRAM, VRAM_END); + // prof_leave(*p1); + + // prof_enter(*p2); + gint_vrambackup_size = (u8 *)SCRATCH_END - (u8 *)SCRATCH; + gint_vrambackup = malloc(gint_vrambackup_size); + if(gint_vrambackup) + memcpy(gint_vrambackup, SCRATCH, gint_vrambackup_size); + // prof_leave(*p2); + gint_vram = __GetVRAMAddress(); return true; } void dvram_quit(void) { - __VRAMRestore(); // TODO: CP dvram_quit: use global framebuffer image image_t *img = image_create_vram(); + gint_vrambackup_show(); + free(gint_vrambackup); + gint_vrambackup = NULL; video_update(0, 0, img, VIDEO_UPDATE_FOREIGN_WORLD); image_free(img); } +void dgetvram(uint16_t **ptr_vram_1, uint16_t **ptr_vram_2) +{ + *ptr_vram_1 = *ptr_vram_2 = gint_vram; +} + +void gint_vrambackup_show(void) +{ + uint8_t *rle = gint_vrambackup; /* NULL if malloc() failed or after quit */ + int i = 0; + while(rle && i < DWIDTH * DHEIGHT) { /* no-op without a backup */ + int index = *rle++; + int run_length, run_color; + + if(index >= 110) { /* single-byte code: run of length 1 */ + run_length = 1; + run_color = gint_vrambackup_palette[index - 110]; + } + else { /* two-byte code: palette index, then run length */ + run_length = *rle++; + run_color = gint_vrambackup_palette[index]; + } + + for(int j = 0; j < run_length && i+j < DWIDTH * DHEIGHT; j++) + gint_vram[i+j] = run_color; /* never write past the end of VRAM */ + i += run_length; + } +} + +void *gint_vrambackup_get(int *size) +{ + if(size) /* report -1 (the initial value) when there is no backup */ + *size = gint_vrambackup ? gint_vrambackup_size : -1; + return gint_vrambackup; +} + #elif GINT_OS_CG // TODO[3]: CG: Remove triple buffering