bopti: more performance improvement for SCSP cases

* Turn on GCC's -O3 for bopti files
* Remove the bopti_render_noclip() step
* Use rbox as early as possible to avoid moving memory around
* A lot of local grinding
This commit is contained in:
Lephe 2020-07-23 14:03:45 +02:00
parent 11dd04243f
commit d887423bbb
No known key found for this signature in database
GPG key ID: 1BBA026E13FC0495
5 changed files with 147 additions and 98 deletions

View file

@ -2,21 +2,38 @@
#include "../render-fx/render-fx.h"
#include "../render-fx/bopti-asm.h"
#pragma GCC optimize("O3")
/* gsubimage(): Render a section of an image */
void gsubimage(int x, int y, bopti_image_t const *img, int left, int top,
int width, int height, int flags)
void gsubimage(bopti_image_t const *img, struct rbox *r, GUNUSED int flags)
{
uint32_t *light, *dark;
dgray_getvram(&light, &dark);
if(flags & DIMAGE_NOCLIP)
/* Intersect the bounding box with both the source image and the VRAM,
except if DIMAGE_NOCLIP is provided */
if(!(flags & DIMAGE_NOCLIP))
{
bopti_render_noclip(x, y, img, left, top, width, height,
light, dark);
/* Early finish for empty intersections */
if(bopti_clip(img, r)) return;
}
int left = r->left;
int width = r->width;
int visual_x = r->visual_x;
r->left = left >> 5;
r->columns = ((left + width - 1) >> 5) - r->left + 1;
if(r->columns == 1 && (visual_x & 31) + width <= 32)
{
r->x = (left & 31) - (visual_x & 31);
bopti_render_scsp(img, r, light, dark);
}
else
{
bopti_render_clip(x, y, img, left, top, width, height,
light, dark);
/* x-coordinate of the first pixel of the first column */
r->x = visual_x - (left & 31);
bopti_render(img, r, light, dark);
}
}

View file

@ -36,6 +36,7 @@ typedef void asm_gray_scsp_t(uint32_t *v1, uint32_t const *layer,
/* Type of any rendering function */
typedef union {
void *gen;
asm_mono_t *asm_mono;
asm_gray_t *asm_gray;
asm_mono_scsp_t *asm_mono_scsp;
@ -84,4 +85,34 @@ extern asm_gray_scsp_t bopti_gasm_gray_scsp;
/* bopti_gasm_gray_alpha_scsp(): SCSP "gray_alpha" profile */
extern asm_gray_scsp_t bopti_gasm_gray_alpha_scsp;
//---
// Renderer's data structures
//---
/* struct rbox: A rendering box (target coordinates and source rectangle)
The meaning of some fields varies during the rendering process: [x], [left]
and [columns] are only filled in (or converted) by the caller right before
bopti_render() or bopti_render_scsp() is invoked.
Field order matters: dsubimage() initializes this structure positionally. */
struct rbox
{
/* General renderer:
On-screen location of the leftmost pixel of the leftmost rendered
column (this particular pixel might not be drawn but is of
importance in the positioning process); callers compute it as
visual_x - (left & 31), with [left] still in pixels
SCSP renderer:
Shift value used to align columns with positions; callers compute
it as (left & 31) - (visual_x & 31) and it is passed as the last
argument of the assembler routine */
int x;
/* On-screen location of top-left corner; the (x,y) of dsubimage() */
int visual_x, y;
/* Width of rendered sub-image (in pixels) */
int width;
/* Before bopti_render{_scsp}():
Left-coordinate of the source box (included, in pixels)
In bopti_render{_scsp}():
Left-coordinate of the source box (included, in columns); the
caller converts pixels to columns with a right shift by 5, i.e.
one column is 32 pixels wide */
int left;
/* Number of columns used in the source box, computed by callers as
((left + width - 1) >> 5) - (left >> 5) + 1 with [left] in pixels */
int columns;
/* Vertical bounds of the box in the image (inc-excluded, in pixels):
[top] is the first row, [height] is the number of rows rendered */
int top, height;
};
#endif /* GINT_RENDERFX_BOPTIASM */

View file

@ -3,22 +3,7 @@
#include "render-fx.h"
#include "bopti-asm.h"
/* struct rbox: A rendering box (target coordinates and source rectangle)
Some of the data here is redundant, but makes things easier. */
struct rbox
{
/* Left pixel of the first column to be drawn, even if this column is
not drawn entirely */
int x;
/* On-screen location of top-left corner */
int visual_x, y;
/* Width of rendered sub-image */
int width;
/* Horizontal bounds of the box in the image (included, in columns) */
int left, right;
/* Vertical bounds of the box in the image (inc-excluded, in pixels) */
int top, height;
};
#pragma GCC optimize("O3")
/* struct command: A rendering command
Includes many computed parameters and handy information. Read-only. */
@ -176,7 +161,7 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
masks(rbox->visual_x, rbox->visual_x + rbox->width - 1, vm);
/* Number of layers per profile */
int layer_count[] = { 1, 2, 2, 3 };
static const int layer_count[] = { 1, 2, 2, 3 };
int layers = layer_count[img->profile];
/* For each pair of consecutive VRAM elements involved, create a mask
@ -207,20 +192,17 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
const uint32_t *layer = (void *)img->data;
layer += (rbox->top * img_columns + rbox->left) * layers;
/* Number of grid columns */
int columns = rbox->right - rbox->left + 1;
/* Compute and execute the command for these parameters */
struct command c = {
.x = rbox->x & 31,
.v1 = v1,
.v2 = v2 ? v2 : v1,
.offset = (rbox->y << 2) + (rbox->x >> 5),
.columns = columns,
.columns = rbox->columns,
.masks = masks + 2 * left_origin,
.real_start = (left_origin > 0),
.vram_stride = 4 - columns,
.data_stride = ((img_columns - columns) << 2) * layers,
.vram_stride = 4 - rbox->columns,
.data_stride = ((img_columns - rbox->columns) << 2) * layers,
.gray = (v2 != NULL),
.f = f,
};
@ -231,61 +213,62 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
void bopti_render_scsp(bopti_image_t const *img, struct rbox *rbox,
uint32_t *v1, uint32_t *v2)
{
/* Rendering function */
bopti_asm_t f;
if(v2) f.asm_gray_scsp = asm_gray_scsp[img->profile];
else f.asm_mono_scsp = asm_mono_scsp[img->profile];
/* Compute the only rendering mask. Avoid UB if width = 32 */
uint32_t mask = 0xffffffff;
if(rbox->width < 32)
{
int right = 32 - ((rbox->visual_x & 31) + rbox->width);
mask = ((1 << rbox->width) - 1) << right;
}
/* Compute the only rendering mask */
uint32_t mask =
(0xffffffff << (32 - rbox->width)) >> (rbox->visual_x & 31);
/* Number of layers */
int layer_count[] = { 1, 2, 2, 3 };
int layers = layer_count[img->profile];
int layers = img->profile - (img->profile >> 1) + 1;
/* Number of columns in [img] */
int img_columns = (img->width + 31) >> 5;
/* Number of longwords to skip between rows of [img] */
int img_stride = ((img->width + 31) >> 5) * layers;
/* Interwoven layer data. Skip left columns that are not rendered */
const uint32_t *layer = (void *)img->data;
layer += (rbox->top * img_columns + rbox->left) * layers;
layer += (rbox->top * img_stride) + (rbox->left * layers);
/* Starting value of VRAM pointers */
int offset = (rbox->y << 2) + (rbox->visual_x >> 5);
v1 += offset;
if(v2) v2 += offset;
/* Number of rows */
int rows = rbox->height;
/* Mask shift */
int shift = -(rbox->x & 31);
if(rbox->x < 0) shift += 32;
/* Render the grid immediately; mono version */
if(!v2) while(rows--)
if(!v2)
{
f.asm_mono_scsp(v1, layer, mask, shift);
layer += img_columns * layers;
v1 += 4;
asm_mono_scsp_t *f = asm_mono_scsp[img->profile];
while(rows--)
{
f(v1, layer, mask, rbox->x);
layer += img_stride;
v1 += 4;
}
}
/* Gray version */
else while(rows--)
else
{
f.asm_gray_scsp(v1, layer, mask, v2, shift);
layer += img_columns * layers;
v1 += 4;
v2 += 4;
asm_gray_scsp_t *f = asm_gray_scsp[img->profile];
v2 += offset;
while(rows--)
{
f(v1, layer, mask, v2, rbox->x);
layer += img_stride;
v1 += 4;
v2 += 4;
}
}
}
void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
int top, int width, int height, uint32_t *v1, uint32_t *v2)
int bopti_clip(bopti_image_t const *img, struct rbox *r)
{
/* This load/save is not elegant but it makes GCC use register-only
operations, which is what we need for efficiency */
int x = r->visual_x, y = r->y;
int left = r->left, top = r->top;
int width = r->width, height = r->height;
/* Adjust the bounding box of the input image */
if(left < 0) width += left, x -= left, left = 0;
if(top < 0) height += top, y -= top, top = 0;
@ -298,32 +281,34 @@ void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
if(x + width > DWIDTH) width = DWIDTH - x;
if(y + height > DHEIGHT) height = DHEIGHT - y;
/* Early finish for empty intersections */
if(width <= 0 || height <= 0) return;
r->visual_x = x;
r->y = y;
r->left = left;
r->top = top;
r->width = width;
r->height = height;
/* Finish with the noclip variant */
bopti_render_noclip(x, y, img, left, top, width, height, v1, v2);
/* Return non-zero if the result is empty */
return (width <= 0 || height <= 0);
}
void bopti_render_noclip(int visual_x, int y, bopti_image_t const *img,
int left, int top, int width, int height, uint32_t *v1, uint32_t *v2)
void bopti_render_noclip(bopti_image_t const *img, struct rbox *r,
uint32_t *v1, uint32_t *v2)
{
int left = r->left;
/* Start column and end column (both included) */
int cl = (left) >> 5;
int cr = (left + width - 1) >> 5;
r->left >>= 5;
/* Finish with the standard bopti renderer */
struct rbox rbox = { 0, visual_x, y, width, cl, cr, top, height };
if(cl == cr && (visual_x & 31) + width <= 32)
if(r->columns == 1 && (r->visual_x & 31) + r->width <= 32)
{
rbox.x = (visual_x & 31) - (left & 31);
bopti_render_scsp(img, &rbox, v1, v2);
r->x = (left & 31) - (r->visual_x & 31);
bopti_render_scsp(img, r, v1, v2);
}
else
{
/* x-coordinate of the first pixel of the first column */
rbox.x = visual_x - (left & 31);
bopti_render(img, &rbox, v1, v2);
r->x = r->visual_x - (left & 31);
bopti_render(img, r, v1, v2);
}
}

View file

@ -2,22 +2,43 @@
#include "render-fx.h"
#include "bopti-asm.h"
#pragma GCC optimize("O3")
/* dsubimage(): Render a section of an image */
void dsubimage(int x, int y, bopti_image_t const *img, int left, int top,
int width, int height, int flags)
{
DMODE_OVERRIDE(dsubimage, x, y, img, left, top, width, height, flags);
struct rbox r = {
0, x, y, width, left, 0, top, height
};
DMODE_OVERRIDE(dsubimage, img, &r, flags);
if(img->gray) return;
if(flags & DIMAGE_NOCLIP)
/* Intersect the bounding box with both the source image and the VRAM,
except if DIMAGE_NOCLIP is provided */
if(!(flags & DIMAGE_NOCLIP))
{
bopti_render_noclip(x, y, img, left, top, width, height,
gint_vram, NULL);
/* Early finish for empty intersections */
if(bopti_clip(img, &r)) return;
}
left = r.left;
width = r.width;
int visual_x = r.visual_x;
r.left = left >> 5;
r.columns = ((left + width - 1) >> 5) - r.left + 1;
if(r.columns == 1 && (visual_x & 31) + width <= 32)
{
r.x = (left & 31) - (visual_x & 31);
bopti_render_scsp(img, &r, gint_vram, NULL);
}
else
{
bopti_render_clip(x, y, img, left, top, width, height,
gint_vram, NULL);
/* x-coordinate of the first pixel of the first column */
r.x = visual_x - (left & 31);
bopti_render(img, &r, gint_vram, NULL);
}
}

View file

@ -7,6 +7,7 @@
#include <gint/defs/types.h>
#include <gint/display.h>
#include "bopti-asm.h"
/* masks(): Compute the vram masks for a given rectangle
@ -24,13 +25,10 @@
@masks Stores the result of the function (four uint32_t values) */
void masks(int x1, int x2, uint32_t *masks);
/* bopti_render_clip(): Render a bopti image with clipping
@x @y Location of the top-left corner
@img Image encoded by [fxconv]
@left @top @w @h Bounding box to render
@v1 @v2 VRAMs (gray rendering is used if v2 != NULL) */
void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
int top, int w, int h, uint32_t *v1, uint32_t *v2);
/* bopti_clip(): Clip a bounding box to image and VRAM
@img Image encoded by [fxconv]
@rbox Rendering box */
int bopti_clip(bopti_image_t const *img, struct rbox *rbox);
/* bopti_render_noclip(): Render a bopti image without clipping
This function is only ever slightly faster than bopti_render_clip(),
@ -42,8 +40,8 @@ void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
@img Image encoded by [fxconv]
@rbox Rendering box (target coordinates and source rectangle)
@v1 @v2 VRAMs (gray rendering is used if v2 != NULL) */
void bopti_render_noclip(int x, int y, bopti_image_t const *img, int left,
int top, int w, int h, uint32_t *v1, uint32_t *v2);
void bopti_render_noclip(bopti_image_t const *img, struct rbox *rbox,
uint32_t *v1, uint32_t *v2);
//---
// Alternate rendering modes
@ -72,8 +70,7 @@ struct rendering_mode
(int x, int y, int fg, int bg, int halign, int valign,
char const *str);
void (*dsubimage)
(int x, int y, bopti_image_t const *image, int left, int top,
int width, int height, int flags);
(bopti_image_t const *image, struct rbox *r, int flags);
};
/* The alternate rendering mode pointer (initially NULL)*/
@ -89,9 +86,7 @@ void gint_gvline(int y1, int y2, int x, int color);
void gtext_opt
(int x, int y, int fg, int bg, int halign, int valign,
char const *str);
void gsubimage
(int x, int y, bopti_image_t const *image, int left, int top,
int width, int height, int flags);
void gsubimage(bopti_image_t const *image, struct rbox *r, int flags);
/* Short macro to call the alternate rendering function when available */
#define DMODE_OVERRIDE(func, ...) \