gint: compressed VRAM save on fx-CP

This frees the OS' backup VRAM, which is a 337920-byte buffer sitting
at a fixed address, for loading code.
This commit is contained in:
Lephe 2024-06-01 14:41:09 +02:00
parent 335326692f
commit 84a4dd7ca9
No known key found for this signature in database
GPG key ID: 1BBA026E13FC0495
5 changed files with 388 additions and 5 deletions

View file

@ -171,6 +171,7 @@ set(SOURCES
src/render-cg/dsubimage.c
src/render-cg/dupdate.c
src/render-cg/dvram.c
src/render-cg/dvram.S
src/render-cg/gint_dline.c
src/render-cg/topti-asm.S
src/render-cg/topti.c

View file

@ -36,7 +36,7 @@ SECTIONS
- All text from .text and .text.* (including user code) */
.text : {
*(.text.header)
*(.hh2info)
KEEP(*(.hh2info))
*(.text.entry)
_bctors = . ;

View file

@ -109,6 +109,35 @@ void dsetvram(uint16_t *main, uint16_t *secondary);
Returns the VRAM buffer addresses used to render on fx-CG 50. */
void dgetvram(uint16_t **main, uint16_t **secondary);
//---
// VRAM backup
// On the fx-CP gint backs up the VRAM when loading and restores it when
// leaving. While this is a transparent mechanism, the following parts of the
// implementation are exposed at the internal API level.
//---
/* Encode the VRAM contents between [in_start] and [in_end] at [output]. While
[*in_end] is excluded from the encoding, it will be modified temporarily to
serve as bounds check, so it must not be accessed asynchronously during the
encoding. The size of the output is not known but is at most the distance
between in_end and in_start in bytes. Setting output = in_start to compress
in-place is supported. Returns the end pointer after encoding. */
uint8_t *gint_vrambackup_encode(
uint8_t *output, uint16_t *in_start, uint16_t *in_end);
/* Predefined palette based on the GUI at the Hollyhock loading screen. Contains
109 entries plus a 110th dummy entry used internally as bounds check. */
extern uint16_t gint_vrambackup_palette[110];
/* Get the pointer to the encoded VRAM backup created at load time. If [size]
is not NULL, sets the size in [*size]. The pointer is heap allocated and
remains owned by gint. */
void *gint_vrambackup_get(int *size);
/* Decode the load-time VRAM backup back to VRAM. */
void gint_vrambackup_show(void);
#ifdef __cplusplus
}
#endif

297
src/render-cg/dvram.S Normal file
View file

@ -0,0 +1,297 @@
#include <gint/config.h>
#ifdef GINT_OS_CP
/*
Compressed VRAM backup for the fx-CP, with RLE encoding and palette indexing.
I went a bit overboard with this--I think it was just fun to optimize the heck
out of it. When we start an fx-CP add-in we save the VRAM and restore it when
leaving, which is what the loader expects us to do.
Normally we'd use the OS's LCD_VRAMBackup() and LCD_VRAMRestore() functions
which just memcpy() to another VRAM-sized buffer.
However, we want to use that other buffer, which is at a fixed address, to load
code. So we have to backup elsewhere. The standard idea, which CPDoom does, is
just to allocate a third VRAM-sized buffer in the heap. But that's 330 kB, and
the copy still takes some 30 ms due to the sheer amount of data. I wanted to
optimize that.
This file implements a compressed encoding of the VRAM data. It's slightly
lossy but the difference isn't noticeable unless you know what to look for or
flip between the original and compressed frames. Conceptually, the encoding has
three phases:
1. Reduce the number of colors by or-ing all pixels with a mask (namely 0x0821)
2. Index the entire picture through a palette
3. RLE-encode the indexed data
Step 2 uses a predefined palette that I extracted from the HollyHock loading
GUI and sorted by decreasing frequency (to optimize linear search, which, yes,
appears to be faster than binary search in this case, due to the imbalance I
guess). The HHK loading screen never really changes but if a color appears
that's not in the palette it will use the last slot as a safety default. The
total size of the palette is 109 colors + the last slot.
Step 3 uses a simple RLE encoding. For most runs, the encoding is 2 bytes: the
color index in the palette (< 110), then the length of the run (<= 255). For
length-1 runs, the encoding is the color index in the palette + 110, which
takes a single byte.
This results in typical frames of 20-30 kB and a save time of ~22 ms, which is
a 10-15x space improvement and ~25% time improvement over memcpy (29.5 ms) and
even slightly more over LCD_VRAMBackup() (35.5 ms for some reason).
For the sake of future readability, below is the encoding function that, up to
typical assembly optimizations, the implementation follows.
static int cvt_encode(u16 const *colors, u16 mask, u8 *output, int count)
{
u16 *backup = CVT_SOURCE;
u16 *next = backup + (DWIDTH * DHEIGHT);
u8 *output_init = output;
u16 next_backup = next[0];
next[0] = ~next[-1];
while(backup < next) {
u16 run_color = *backup++ | mask;
int index = 0;
while(index < 0xff && colors[index] != run_color)
index++;
if((*backup | mask) != run_color) {
*output++ = count + index;
continue;
}
int run_length = 1;
do run_length++, backup++;
while((*backup | mask) == run_color);
while(run_length > 0xff) {
*output++ = index;
*output++ = 0xff;
run_length -= 0xff;
}
*output++ = index;
*output++ = run_length;
}
next[0] = next_backup;
return output - output_init;
}
*/
/* Exported symbols: the encoder entry point and the palette table. */
.global _gint_vrambackup_encode
.global _gint_vrambackup_palette
.balign 4
/* Register aliases used by _gint_vrambackup_encode. r4/r5/r6 are the three
   function arguments (output, in_start, in_end). _next_pixel and _run_length
   both map to r9: the second pixel of a run is only needed to choose between
   the length-1 and length-n paths, after which r9 is reused for the run
   length. r0 and r1 are used as unnamed scratch registers. */
#define _index r2 /* Index of current run's color in palette */
#define _mask r3 /* Color reduction mask = 0x0821 */
#define _output r4 /* Output buffer (advances every write) */
#define _input r5 /* Input pointer (advances every read) */
#define _input_end r6 /* End of input pointer */
#define _palette r7 /* Color palette */
#define _palette_end r8 /* End of palette (_palette + 2 * 109) */
#define _next_pixel r9 /* Color of second pixel of each run */
#define _run_length r9 /* Length of any given run */
/* u8 *gint_vrambackup_encode(u8 *output, u16 *in_start, u16 *in_end)

   Encodes the pixels in [in_start, in_end) to [output] and returns (in r0)
   the pointer just past the last byte written.

   Arguments: r4 = output, r5 = in_start, r6 = in_end (see register aliases).
   Each pixel is or-ed with the 0x0821 mask, looked up in
   _gint_vrambackup_palette by linear search, and emitted as either
     - [index, length]  for a run of 2..255 identical (masked) pixels;
     - [index + 110]    for a single pixel.
   Runs longer than 255 are split into several [index, 0xff] pairs followed
   by a final [index, rest] pair.

   Two sentinels avoid bounds checks in the inner loops:
     - *in_end is temporarily replaced with the bitwise complement of the
       last pixel, so the last run always stops at in_end. The original value
       is restored before returning.
     - the searched color is written to the 110th palette slot before each
       search, so the palette search always terminates; colors that are not
       in the palette therefore get index 109.

   Inside the function _input_end holds in_end - 2 bytes, i.e. the address of
   the LAST pixel, so that both the last pixel and the sentinel can be
   addressed as @(0, _input_end) and @(2, _input_end). */
_gint_vrambackup_encode:
	mov.l	r8, @-r15
	add	#-2, _input_end

	sts.l	pr, @-r15
	mov	#0x08, _mask

	# Set _input_end[1] = ~_input_end[0], which makes the past-the-end
	# pixel value different from the last pixel value (also after masking,
	# since every bit is flipped), ensuring that the last run ends at the
	# right time without having to check bounds in the run-length inner
	# loop. This must be a bitwise NOT: an arithmetic negation would leave
	# 0x0000 and 0x8000 unchanged and the sentinel would not stop the run.
	mov.w	@_input_end, r1
	shll8	_mask

	mov.w	@(2, _input_end), r0
	add	#0x21, _mask

	/* Save the original past-the-end value; it is restored at .end */
	mov.l	r0, @-r15
	not	r1, r0

	mov.l	.palette, _palette
	mov	_palette, _palette_end

	mov.w	r0, @(2, _input_end)
	add	#109, _palette_end

	/* Second half of _palette_end = _palette + 2 * 109 (the immediate of
	   add is limited to 8 signed bits) */
	mov.l	r9, @-r15
	add	#109, _palette_end

	mov.l	r10, @-r15
	nop

.loop_run:
	# Determine current run color (r0) and its index in the palette
	# (_index). In order to get a faster lookup the searched value is also
	# written in the past-the-end palette slot, this way there is no need
	# to check bounds in the palette-search inner loop.
	mov.w	@_input+, r0
	mov	_palette, _index

	mov.w	@_input, _next_pixel
	nop

	# Precharge the palette-search loop
	mov.w	@_index+, r1
	or	_mask, r0

	mov.w	r0, @_palette_end
	or	_mask, _next_pixel

	#=== Palette search ===#

.ps:	cmp/eq	r1, r0
	mov.w	@_index+, r1

	bf	.ps
1:	nop

	# Compute index from pointer difference. The pointer has been
	# post-incremented twice past the matching entry (once by the load of
	# the entry itself, once by the load paired with the compare), hence
	# _index = (_index - _palette - 4) / 2
	sub	_palette, _index
	nop

	add	#-4, _index
	nop

	#=== Length-1-run fast path ===#

	# Avoid the run-length loop if the run is of length 1. There is a
	# special encoding for that anyway. The division by 2 of _index is
	# done in the delay slot so it applies to both paths.
	cmp/eq	_next_pixel, r0
	nop

	bf.s	.length_1_run
	shlr	_index

.length_n_run:
	# Compute the run length. The _run_length register first saves the
	# value of _input (pointing at the third pixel of the run); the
	# difference is computed after the loop.
	mov.w	@_input+, r1		/* LS-based increment */
	mov	_input, _run_length

	mov.w	@_input+, r1
	nop

	# (bubble)

	#=== Run length computation ===#

.rl:	or	_mask, r1
	nop

	cmp/eq	r1, r0
	mov.w	@_input+, r1

	bt.s	.rl
1:	nop

	#=== Run generation ===#

	# On exit _input is 2 pixels past the first pixel that differs, and
	# _run_length holds the address of the third pixel of the run, so
	# (_input - _run_length) / 2 is exactly the number of pixels in the
	# run. Then rewind _input to the first pixel of the next run.
	mov	_input, r1
	sub	_run_length, r1

	mov	r1, _run_length
	shlr	_run_length

	add	#-4, _input
	nop

	/* r1 = 0xffffff00: (r1 & _run_length) != 0 iff _run_length > 0xff */
	mov	#-1, r1
	shll8	r1

	# While _run_length > 0xff, generate 0xff-length runs
	tst	r1, _run_length
	mov	#-1, r0

	bt	.sre
	extu.b	r0, r0

.sr:	# (bubble after jump)
	mov.b	_index, @_output
	sub	r0, _run_length

	mov.b	r0, @(1, _output)
	tst	r1, _run_length

	bf.s	.sr
	add	#2, _output

	# Generate the last, short run
.sre:
	mov.b	_index, @_output
	mov	_run_length, r0

	# Continue while _input <= address of the last pixel. _input_end is
	# the last pixel, not the end pointer, so the comparison must be a
	# strict "higher than" or a trailing single pixel would be dropped.
	cmp/hi	_input_end, _input
	mov.b	r0, @(1, _output)

	bf.s	.loop_run
	add	#2, _output

	# All input consumed: leave without falling through into the length-1
	# encoding below, which would append a spurious extra byte.
	bra	.end
	nop

.length_1_run:
	# Encode byte as palette size + color index + 1, i.e. index + 110.
	add	#110, _index
	mov.b	_index, @_output

	/* Same loop condition as at .sre */
	cmp/hi	_input_end, _input
	nop

	bf.s	.loop_run
	add	#1, _output

.end:
	# Restore _input_end[1] and leave
	mov.l	@r15+, r10
	nop

	mov.l	@r15+, r9
	nop

	mov.l	@r15+, r0
	nop

	mov.w	r0, @(2, _input_end)
	nop

	lds.l	@r15+, pr
	mov	_output, r0

	rts
	mov.l	@r15+, r8

/* Literal pool: must be 4-aligned for the PC-relative mov.l above */
.balign 4
.palette:
	.long	_gint_vrambackup_palette
/* Palette of colors used by the launch GUI, sorted by decreasing frequency.
   109 color entries followed by one extra slot (110 halfwords in total). The
   encoder stores the color being searched into the extra slot before every
   palette search so that the search always terminates; the decoder indexes
   this table with the values found in the encoded stream. */
_gint_vrambackup_palette:
.word 0xffff, 0x5aeb, 0xffbf, 0xef3d, 0x8c31, 0xef7d, 0xdefb, 0xff7f
.word 0x7baf, 0xbdb7, 0x9cb3, 0x4aab, 0xcebb, 0xbdf7, 0xad75, 0xdebb
.word 0xce79, 0x9db9, 0xce39, 0x8cf5, 0x9cf5, 0xefbd, 0x6eff, 0x0821
.word 0x7c31, 0xad35, 0x7bf1, 0xeefd, 0x7bef, 0x9cf3, 0x8bf1, 0xdf3b
.word 0xbe39, 0x8c2f, 0x6b6d, 0xadf9, 0xadb9, 0x9d77, 0x9d37, 0x5e7d
.word 0x6baf, 0x6ebf, 0x9d35, 0x6b6f, 0x8c71, 0x7c73, 0x8cb1, 0x9c73
.word 0x5b2d, 0x4aeb, 0x5dfb, 0x8cb3, 0xefbf, 0xff7d, 0xdefd, 0xce7b
.word 0x5dfd, 0xbe3b, 0x7c71, 0x5aed, 0x8bef, 0x4dbb, 0x9efd, 0x4cf9
.word 0x4c75, 0x6ebd, 0x4d39, 0xbe37, 0xacf5, 0xde7b, 0x5d37, 0x5e3d
.word 0x5ebf, 0x5d79, 0x4d79, 0x6f3f, 0x6c73, 0x6e7d, 0x6e3d, 0x8dfb
.word 0x7dbb, 0x6c33, 0x9d33, 0xcdf9, 0x4dfb, 0x6cb5, 0x6c75, 0x5cf7
.word 0x4d7b, 0x4cb7, 0x9eff, 0x5e7f, 0xaebd, 0xdf7f, 0xbefd, 0x6c31
.word 0x4c35, 0x4dfd, 0x5d39, 0x5ebd, 0x9c71, 0x7e3d, 0x7e3b, 0xcf3f
.word 0x6e3b, 0xcf3d, 0x7dfb, 0x5c33, 0x8df9
.word 0x0000 /* Extra (110th) slot: search sentinel, overwritten by the encoder */
#endif /* GINT_OS_CP */

View file

@ -3,32 +3,88 @@
#include <gint/video.h>
#include <gint/image.h>
#include <gint/config.h>
#include <string.h>
#include <stdlib.h>
#if GINT_RENDER_RGB
#if GINT_OS_CP
/* Routines provided outside this file (presumably by the OS -- confirm
   against the fx-CP syscall bindings). */
extern void *__GetVRAMAddress(void);
extern void __VRAMBackup(void);
extern void __VRAMRestore(void);
/* VRAM pointer used for rendering; set by dvram_init() from
   __GetVRAMAddress(). */
uint16_t *gint_vram = NULL;
/* Heap copy of the encoded load-time VRAM backup. Allocated by dvram_init(),
   freed and reset to NULL by dvram_quit(). */
static uint8_t *gint_vrambackup = NULL;
/* Size in bytes of the encoded backup as computed by dvram_init(); -1 until
   then. */
static int gint_vrambackup_size = -1;
/* dvram_init(): Save the current screen contents and set up gint's VRAM

   Encodes the VRAM at load time with gint_vrambackup_encode() and keeps a
   heap copy of the encoded data so that dvram_quit() can restore the screen.
   If the heap allocation fails no backup is kept (gint_vrambackup stays NULL
   and gint_vrambackup_size is reset to -1).

   Always returns true. */
bool dvram_init(void)
{
	/* Back the VRAM up, but not to the OS's normal backup area--we use
	   that to load code, so __VRAMBackup() must not be called here.
	   Instead, encode over the VRAM itself (in-place encoding is
	   supported by gint_vrambackup_encode()), then copy to the heap. */
	uint16_t *vram_start = (void *)0x8c000000;
	/* 0x52800 = 337920 bytes of pixel data */
	uint16_t *vram_end = (void *)0x8c052800;
	uint8_t *scratch = (void *)vram_start;

	uint8_t *scratch_end =
		gint_vrambackup_encode(scratch, vram_start, vram_end);

	gint_vrambackup_size = scratch_end - scratch;
	gint_vrambackup = malloc(gint_vrambackup_size);

	if(gint_vrambackup)
		memcpy(gint_vrambackup, scratch, gint_vrambackup_size);
	else
		/* No backup: do not advertise a size for a NULL buffer */
		gint_vrambackup_size = -1;

	gint_vram = __GetVRAMAddress();
	return true;
}
void dvram_quit(void)
{
__VRAMRestore();
// TODO: CP dvram_quit: use global framebuffer image
image_t *img = image_create_vram();
gint_vrambackup_show();
free(gint_vrambackup);
gint_vrambackup = NULL;
video_update(0, 0, img, VIDEO_UPDATE_FOREIGN_WORLD);
image_free(img);
}
/* dgetvram(): Report the VRAM address(es) used for rendering
   On this platform there is a single VRAM, so both outputs receive the same
   pointer. */
void dgetvram(uint16_t **ptr_vram_1, uint16_t **ptr_vram_2)
{
	uint16_t *vram = gint_vram;

	*ptr_vram_2 = vram;
	*ptr_vram_1 = vram;
}
void gint_vrambackup_show(void)
{
uint8_t *rle = gint_vrambackup;
int i = 0;
while(i < DWIDTH * DHEIGHT) {
int index = *rle++;
int run_length, run_color;
if(index >= 110) {
run_length = 1;
run_color = gint_vrambackup_palette[index - 110];
}
else {
run_length = *rle++;
run_color = gint_vrambackup_palette[index];
}
for(int j = 0; j < run_length; j++)
gint_vram[i+j] = run_color;
i += run_length;
}
}
/* gint_vrambackup_get(): Access the encoded load-time VRAM backup
   Returns the backup buffer, which remains owned by gint. When [size] is
   non-NULL the current backup size is also stored in [*size]. */
void *gint_vrambackup_get(int *size)
{
	void *backup = gint_vrambackup;

	if(size != NULL) {
		*size = gint_vrambackup_size;
	}

	return backup;
}
#elif GINT_OS_CG
// TODO[3]: CG: Remove triple buffering