/* NOTE(review): the include target was lost when this file was extracted;
   <gint/config.h> is presumed to be the header that defines GINT_OS_CP.
   Confirm against the build tree. */
#include <gint/config.h>
#ifdef GINT_OS_CP

/* Compressed VRAM backup for the fx-CP, with RLE encoding and palette
   indexing.

   I went a bit overboard with this--I think it was just fun to optimize the
   heck out of it. Essentially when we start an fx-CP add-in we save the VRAM
   and restore it when leaving, which is essentially what the loader expects us
   to do.

   Normally we'd use the OS's LCD_VRAMBackup() and LCD_VRAMRestore() functions
   which just memcpy() to another VRAM-sized buffer. However, we want to use
   that other buffer, which is at a fixed address, to load code. So we have to
   backup elsewhere.

   The standard idea, which CPDoom does, is just to allocate a third VRAM-sized
   buffer in the heap. But that's 330 kB, and the copy still takes some 30 ms
   due to the sheer amount of data. I wanted to optimize that.

   This file implements a compressed encoding of the VRAM data. It's slightly
   lossy but the difference isn't noticeable unless you know what to look for
   or flip between the original and compressed frames. Conceptually, the
   encoding has three phases:

   1. Reduce the number of colors by or-ing all pixels with a mask (namely
      0x0821)
   2. Index the entire picture through a palette
   3. RLE-encode the indexed data

   Step 2 uses a predefined palette that I extracted from the HollyHock loading
   GUI and sorted by decreasing frequency (to optimize linear search, which,
   yes, appears to be faster than binary search in this case, due to the
   imbalance I guess). The HHK loading screen never really changes but if a
   color appears that's not in the palette it will use the last slot as a
   safety default. The total size of the palette is 109 colors + the last slot.

   Step 3 uses a simple RLE encoding. For most runs, the encoding is 2 bytes:
   the color index in the palette (< 110), then the length of the run (<= 255).
   For length-1 runs, the encoding is the color index in the palette + 110,
   which takes a single byte.

   This results in typical frames of 20-30 kB and a save time of ~22 ms, which
   is a 10-15x space improvement and ~25% time improvement over memcpy
   (29.5 ms) and even slightly more over LCD_VRAMBackup() (35.5 ms for some
   reason).

   For the sake of future readability, below is the encoding function that, up
   to typical assembly optimizations, the implementation follows.

   static int cvt_encode(u16 const *colors, u16 mask, u8 *output, int count)
   {
       u16 *backup = CVT_SOURCE;
       u16 *next = backup + (DWIDTH * DHEIGHT);
       u8 *output_init = output;

       u16 next_backup = next[0];
       next[0] = ~next[-1];

       while(backup < next) {
           u16 run_color = *backup++ | mask;
           int index = 0;
           while(index < 0xff && colors[index] != run_color)
               index++;

           if((*backup | mask) != run_color) {
               *output++ = count + index;
               continue;
           }

           int run_length = 1;
           do run_length++, backup++;
           while((*backup | mask) == run_color);

           while(run_length > 0xff) {
               *output++ = index;
               *output++ = 0xff;
               run_length -= 0xff;
           }
           *output++ = index;
           *output++ = run_length;
       }

       next[0] = next_backup;
       return output - output_init;
   }
*/

.global _gint_vrambackup_encode
.global _gint_vrambackup_palette
.balign 4

#define _index       r2 /* Index of current run's color in palette */
#define _mask        r3 /* Color reduction mask = 0x0821 */
#define _output      r4 /* Output buffer (advances every write) */
#define _input       r5 /* Input pointer (advances every read) */
#define _input_end   r6 /* Address of the LAST pixel (in_end - 1 pixel) */
#define _palette     r7 /* Color palette */
#define _palette_end r8 /* End of palette (_palette + 2 * 109) */
/* The next two aliases share r9 on purpose: _next_pixel is dead once the
   length-1 test has been made, before _run_length is first written. */
#define _next_pixel  r9 /* Color of second pixel of each run */
#define _run_length  r9 /* Length of any given run */

/* u8 *gint_vrambackup_encode(u8 *output, u16 *in_start, u16 *in_end)

   Encodes the pixels in [in_start, in_end) into output and returns (in r0)
   the pointer just past the last byte written.

   Memory requirements visible in the code below:
   - in_end[0] is temporarily overwritten with a sentinel and restored before
     returning, so it must be writable;
   - the run-length loop preloads one pixel ahead, so in_end[1] is read (never
     written);
   - the slot after the 109 palette colors is overwritten on every run, so the
     palette must live in writable memory.
   The input is assumed to hold at least one pixel; there is no empty-input
   check. */
_gint_vrambackup_encode:
	mov.l	r8, @-r15
	/* From here on _input_end points AT the last pixel, not past it */
	add	#-2, _input_end

	/* pr and r10 are saved/restored although nothing below calls a
	   subroutine or uses r10; kept as-is to leave the frame untouched. */
	sts.l	pr, @-r15
	mov	#0x08, _mask

	/* Set _input_end[1] = ~_input_end[0], which makes the past-the-end
	   pixel value different from the last pixel value, ensuring that the
	   last run ends at the right time without us having to check bounds
	   in the run-length inner loop. */
	mov.w	@_input_end, r1
	shll8	_mask

	mov.w	@(2, _input_end), r0
	add	#0x21, _mask

	/* Original past-the-end pixel, restored at .end */
	mov.l	r0, @-r15
	/* This must be a bitwise NOT. An arithmetic negation maps 0x0000 and
	   0x8000 to themselves, which would defeat the sentinel whenever the
	   last pixel has one of these values. */
	not	r1, r0

	mov.l	.palette, _palette
	mov	_palette, _palette_end

	mov.w	r0, @(2, _input_end)
	add	#109, _palette_end

	mov.l	r9, @-r15
	/* Two steps of 109 bytes: _palette_end = _palette + 2 * 109 */
	add	#109, _palette_end

	mov.l	r10, @-r15
	nop

.loop_run:
	/* Determine current run color (r0) and its index in the palette
	   (_index). In order to get a faster lookup we also write the
	   searched value in the past-the-end slot, this way we don't have to
	   check bounds in the palette-search inner loop. */
	mov.w	@_input+, r0
	mov	_palette, _index

	mov.w	@_input, _next_pixel
	nop

	/* Precharge the palette-search loop */
	mov.w	@_index+, r1
	or	_mask, r0

	mov.w	r0, @_palette_end
	or	_mask, _next_pixel

	/*=== Palette search ===*/

.ps:	cmp/eq	r1, r0
	mov.w	@_index+, r1

	bf	.ps
1:	nop

	/* Compute index from pointer difference. The loop is one load ahead
	   of the comparison, so on a match at entry k the pointer has already
	   advanced to _palette + 2 * k + 4:
	     _index = (_index - _palette - 4) / 2
	   (the final shift sits in the delay slot of the branch below). */
	sub	_palette, _index
	nop

	add	#-4, _index
	nop

	/*=== Length-1-run fast path ===*/

	/* Avoid the run-length loop if the run is of length 1. We have a
	   special encoding for that anyway. */
	cmp/eq	_next_pixel, r0
	nop

	bf.s	.length_1_run
	shlr	_index

.length_n_run:
	/* Compute the run length. Here we use the _run_length register to
	   save up the value of _input and we'll compute the difference after
	   the loop. */
	mov.w	@_input+, r1	/* LS-based increment */
	mov	_input, _run_length

	mov.w	@_input+, r1
	nop			/* (bubble) */

	/*=== Run length computation ===*/

.rl:	or	_mask, r1
	nop

	cmp/eq	r1, r0
	mov.w	@_input+, r1

	bt.s	.rl
1:	nop

	/*=== Run generation ===*/

	/* _input overshot the first differing pixel by two pixels; the byte
	   distance to the saved pointer is exactly 2 * run length. */
	mov	_input, r1
	sub	_run_length, r1

	mov	r1, _run_length
	shlr	_run_length

	/* Rewind _input to the first pixel of the next run */
	add	#-4, _input
	nop

	/* r1 = 0xffffff00, used to test whether _run_length > 0xff */
	mov	#-1, r1
	shll8	r1

	/* While _run_length > 0xff, generate 0xff-length runs */
	tst	r1, _run_length
	mov	#-1, r0

	bt	.sre
	extu.b	r0, r0

.sr:	/* (bubble after jump) */
	mov.b	_index, @_output
	sub	r0, _run_length

	mov.b	r0, @(1, _output)
	tst	r1, _run_length

	bf.s	.sr
	add	#2, _output

	/* Generate the last, short run */
.sre:	mov.b	_index, @_output
	mov	_run_length, r0

	/* _input_end is the address of the last pixel, so the input is
	   exhausted only when _input is strictly above it. */
	cmp/hi	_input_end, _input
	mov.b	r0, @(1, _output)

	bf.s	.loop_run
	add	#2, _output

	/* Input exhausted: skip the length-1 encoder, which would otherwise
	   append a spurious byte to the output. */
	bra	.end
	nop

.length_1_run:
	/* Encode byte as palette size + color index + 1. */
	add	#110, _index
	mov.b	_index, @_output

	/* Same strict end-of-input test as in .sre */
	cmp/hi	_input_end, _input
	nop

	bf.s	.loop_run
	add	#1, _output

.end:
	/* Restore _input_end[1] and leave */
	mov.l	@r15+, r10
	nop

	mov.l	@r15+, r9
	nop

	mov.l	@r15+, r0
	nop

	mov.w	r0, @(2, _input_end)
	nop

	lds.l	@r15+, pr
	/* Return value: pointer past the last byte written */
	mov	_output, r0

	rts
	mov.l	@r15+, r8

/* The PC-relative mov.l above needs a 4-byte-aligned literal */
.balign 4
.palette:
	.long	_gint_vrambackup_palette

/* Palette of colors used by the launch GUI, sorted by decreasing frequency.
   Every entry already has the 0x0821 mask bits set, since lookups are made
   with masked pixel values. */
_gint_vrambackup_palette:
	.word	0xffff, 0x5aeb, 0xffbf, 0xef3d, 0x8c31, 0xef7d, 0xdefb, 0xff7f
	.word	0x7baf, 0xbdb7, 0x9cb3, 0x4aab, 0xcebb, 0xbdf7, 0xad75, 0xdebb
	.word	0xce79, 0x9db9, 0xce39, 0x8cf5, 0x9cf5, 0xefbd, 0x6eff, 0x0821
	.word	0x7c31, 0xad35, 0x7bf1, 0xeefd, 0x7bef, 0x9cf3, 0x8bf1, 0xdf3b
	.word	0xbe39, 0x8c2f, 0x6b6d, 0xadf9, 0xadb9, 0x9d77, 0x9d37, 0x5e7d
	.word	0x6baf, 0x6ebf, 0x9d35, 0x6b6f, 0x8c71, 0x7c73, 0x8cb1, 0x9c73
	.word	0x5b2d, 0x4aeb, 0x5dfb, 0x8cb3, 0xefbf, 0xff7d, 0xdefd, 0xce7b
	.word	0x5dfd, 0xbe3b, 0x7c71, 0x5aed, 0x8bef, 0x4dbb, 0x9efd, 0x4cf9
	.word	0x4c75, 0x6ebd, 0x4d39, 0xbe37, 0xacf5, 0xde7b, 0x5d37, 0x5e3d
	.word	0x5ebf, 0x5d79, 0x4d79, 0x6f3f, 0x6c73, 0x6e7d, 0x6e3d, 0x8dfb
	.word	0x7dbb, 0x6c33, 0x9d33, 0xcdf9, 0x4dfb, 0x6cb5, 0x6c75, 0x5cf7
	.word	0x4d7b, 0x4cb7, 0x9eff, 0x5e7f, 0xaebd, 0xdf7f, 0xbefd, 0x6c31
	.word	0x4c35, 0x4dfd, 0x5d39, 0x5ebd, 0x9c71, 0x7e3d, 0x7e3b, 0xcf3f
	.word	0x6e3b, 0xcf3d, 0x7dfb, 0x5c33, 0x8df9
	.word	0x0000 /* Extra entry for bounds non-check safety net */

#endif /* GINT_OS_CP */