bopti: more performance improvement for SCSP cases

* Turn on GCC's -O3 for bopti files
* Remove the bopti_render_noclip() step
* Use rbox as early as possible to avoid moving memory around
* A lot of local grinding
This commit is contained in:
Lephe 2020-07-23 14:03:45 +02:00
parent 11dd04243f
commit d887423bbb
No known key found for this signature in database
GPG key ID: 1BBA026E13FC0495
5 changed files with 147 additions and 98 deletions

View file

@ -2,21 +2,38 @@
#include "../render-fx/render-fx.h" #include "../render-fx/render-fx.h"
#include "../render-fx/bopti-asm.h" #include "../render-fx/bopti-asm.h"
#pragma GCC optimize("O3")
/* gsubimage(): Render a section of an image */ /* gsubimage(): Render a section of an image */
void gsubimage(int x, int y, bopti_image_t const *img, int left, int top, void gsubimage(bopti_image_t const *img, struct rbox *r, GUNUSED int flags)
int width, int height, int flags)
{ {
uint32_t *light, *dark; uint32_t *light, *dark;
dgray_getvram(&light, &dark); dgray_getvram(&light, &dark);
if(flags & DIMAGE_NOCLIP) /* Intersect the bounding box with both the source image and the VRAM,
except if DIMAGE_NOCLIP is provided */
if(!(flags & DIMAGE_NOCLIP))
{ {
bopti_render_noclip(x, y, img, left, top, width, height, /* Early finish for empty intersections */
light, dark); if(bopti_clip(img, r)) return;
}
int left = r->left;
int width = r->width;
int visual_x = r->visual_x;
r->left = left >> 5;
r->columns = ((left + width - 1) >> 5) - r->left + 1;
if(r->columns == 1 && (visual_x & 31) + width <= 32)
{
r->x = (left & 31) - (visual_x & 31);
bopti_render_scsp(img, r, light, dark);
} }
else else
{ {
bopti_render_clip(x, y, img, left, top, width, height, /* x-coordinate of the first pixel of the first column */
light, dark); r->x = visual_x - (left & 31);
bopti_render(img, r, light, dark);
} }
} }

View file

@ -36,6 +36,7 @@ typedef void asm_gray_scsp_t(uint32_t *v1, uint32_t const *layer,
/* Type of any rendering function */ /* Type of any rendering function */
typedef union { typedef union {
void *gen;
asm_mono_t *asm_mono; asm_mono_t *asm_mono;
asm_gray_t *asm_gray; asm_gray_t *asm_gray;
asm_mono_scsp_t *asm_mono_scsp; asm_mono_scsp_t *asm_mono_scsp;
@ -84,4 +85,34 @@ extern asm_gray_scsp_t bopti_gasm_gray_scsp;
/* bopti_gasm_gray_alpha_scsp(): SCSP "gray_alpha" profile */ /* bopti_gasm_gray_alpha_scsp(): SCSP "gray_alpha" profile */
extern asm_gray_scsp_t bopti_gasm_gray_alpha_scsp; extern asm_gray_scsp_t bopti_gasm_gray_alpha_scsp;
//---
// Renderer's data structures
//---
/* struct rbox: A rendering box (target coordinates and source rectangle)
   The meaning of the x and left fields changes during the rendering
   process; see the comment on each field. The box is filled by the
   dsubimage()/gsubimage() entry points, optionally clipped by bopti_clip(),
   then consumed by bopti_render() or bopti_render_scsp(). */
struct rbox
{
/* General renderer:
     On-screen location of the leftmost pixel of the leftmost rendered
     column, set to visual_x - (left & 31) with left in pixels. This
     particular pixel might not be drawn but is of importance in the
     positioning process.
   SCSP renderer:
     Shift value used to align columns with positions, set to
     (left & 31) - (visual_x & 31) and passed as-is to the assembler
     routine. */
int x;
/* On-screen location of top-left corner; the (x,y) of dsubimage() */
int visual_x, y;
/* Width of rendered sub-image, in pixels */
int width;
/* Before bopti_render{_scsp}():
     Left-coordinate of the source box (included, in pixels)
   In bopti_render{_scsp}():
     Left-coordinate of the source box (included, in columns); this is
     the pixel value shifted right by 5, i.e. columns are 32 pixels
     wide */
int left;
/* Number of columns used in the source box; the callers compute it as
   ((left + width - 1) >> 5) - (left >> 5) + 1 with left in pixels */
int columns;
/* Vertical extent of the box in the image, in pixels: top is the first
   source row (included) and height is the number of rows rendered */
int top, height;
};
#endif /* GINT_RENDERFX_BOPTIASM */ #endif /* GINT_RENDERFX_BOPTIASM */

View file

@ -3,22 +3,7 @@
#include "render-fx.h" #include "render-fx.h"
#include "bopti-asm.h" #include "bopti-asm.h"
/* struct rbox: A rendering box (target coordinates and source rectangle) #pragma GCC optimize("O3")
Some of the data here is redundant, but makes things easier. */
struct rbox
{
/* Left pixel of the first column to be drawn, even if this column is
not drawn entirely */
int x;
/* On-screen location of top-left corner */
int visual_x, y;
/* Width of rendered sub-image */
int width;
/* Horizontal bounds of the box in the image (included, in columns) */
int left, right;
/* Vertical bounds of the box in the image (inc-excluded, in pixels) */
int top, height;
};
/* struct command: A rendering command /* struct command: A rendering command
Includes many computed parameters and handy information. Read-only. */ Includes many computed parameters and handy information. Read-only. */
@ -176,7 +161,7 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
masks(rbox->visual_x, rbox->visual_x + rbox->width - 1, vm); masks(rbox->visual_x, rbox->visual_x + rbox->width - 1, vm);
/* Number of layers per profile */ /* Number of layers per profile */
int layer_count[] = { 1, 2, 2, 3 }; static const int layer_count[] = { 1, 2, 2, 3 };
int layers = layer_count[img->profile]; int layers = layer_count[img->profile];
/* For each pair of consecutive VRAM elements involved, create a mask /* For each pair of consecutive VRAM elements involved, create a mask
@ -207,20 +192,17 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
const uint32_t *layer = (void *)img->data; const uint32_t *layer = (void *)img->data;
layer += (rbox->top * img_columns + rbox->left) * layers; layer += (rbox->top * img_columns + rbox->left) * layers;
/* Number of grid columns */
int columns = rbox->right - rbox->left + 1;
/* Compute and execute the command for these parameters */ /* Compute and execute the command for these parameters */
struct command c = { struct command c = {
.x = rbox->x & 31, .x = rbox->x & 31,
.v1 = v1, .v1 = v1,
.v2 = v2 ? v2 : v1, .v2 = v2 ? v2 : v1,
.offset = (rbox->y << 2) + (rbox->x >> 5), .offset = (rbox->y << 2) + (rbox->x >> 5),
.columns = columns, .columns = rbox->columns,
.masks = masks + 2 * left_origin, .masks = masks + 2 * left_origin,
.real_start = (left_origin > 0), .real_start = (left_origin > 0),
.vram_stride = 4 - columns, .vram_stride = 4 - rbox->columns,
.data_stride = ((img_columns - columns) << 2) * layers, .data_stride = ((img_columns - rbox->columns) << 2) * layers,
.gray = (v2 != NULL), .gray = (v2 != NULL),
.f = f, .f = f,
}; };
@ -231,61 +213,62 @@ void bopti_render(bopti_image_t const *img, struct rbox *rbox, uint32_t *v1,
void bopti_render_scsp(bopti_image_t const *img, struct rbox *rbox, void bopti_render_scsp(bopti_image_t const *img, struct rbox *rbox,
uint32_t *v1, uint32_t *v2) uint32_t *v1, uint32_t *v2)
{ {
/* Rendering function */ /* Compute the only rendering mask */
bopti_asm_t f; uint32_t mask =
if(v2) f.asm_gray_scsp = asm_gray_scsp[img->profile]; (0xffffffff << (32 - rbox->width)) >> (rbox->visual_x & 31);
else f.asm_mono_scsp = asm_mono_scsp[img->profile];
/* Compute the only rendering mask. Avoid UB if width = 32 */
uint32_t mask = 0xffffffff;
if(rbox->width < 32)
{
int right = 32 - ((rbox->visual_x & 31) + rbox->width);
mask = ((1 << rbox->width) - 1) << right;
}
/* Number of layers */ /* Number of layers */
int layer_count[] = { 1, 2, 2, 3 }; int layers = img->profile - (img->profile >> 1) + 1;
int layers = layer_count[img->profile];
/* Number of columns in [img] */ /* Number of longwords to skip between rows of [img] */
int img_columns = (img->width + 31) >> 5; int img_stride = ((img->width + 31) >> 5) * layers;
/* Interwoven layer data. Skip left columns that are not rendered */ /* Interwoven layer data. Skip left columns that are not rendered */
const uint32_t *layer = (void *)img->data; const uint32_t *layer = (void *)img->data;
layer += (rbox->top * img_columns + rbox->left) * layers; layer += (rbox->top * img_stride) + (rbox->left * layers);
/* Starting value of VRAM pointers */ /* Starting value of VRAM pointers */
int offset = (rbox->y << 2) + (rbox->visual_x >> 5); int offset = (rbox->y << 2) + (rbox->visual_x >> 5);
v1 += offset; v1 += offset;
if(v2) v2 += offset;
/* Number of rows */ /* Number of rows */
int rows = rbox->height; int rows = rbox->height;
/* Mask shift */
int shift = -(rbox->x & 31);
if(rbox->x < 0) shift += 32;
/* Render the grid immediately; mono version */ /* Render the grid immediately; mono version */
if(!v2) while(rows--) if(!v2)
{ {
f.asm_mono_scsp(v1, layer, mask, shift); asm_mono_scsp_t *f = asm_mono_scsp[img->profile];
layer += img_columns * layers; while(rows--)
{
f(v1, layer, mask, rbox->x);
layer += img_stride;
v1 += 4; v1 += 4;
} }
}
/* Gray version */ /* Gray version */
else while(rows--) else
{ {
f.asm_gray_scsp(v1, layer, mask, v2, shift); asm_gray_scsp_t *f = asm_gray_scsp[img->profile];
layer += img_columns * layers; v2 += offset;
while(rows--)
{
f(v1, layer, mask, v2, rbox->x);
layer += img_stride;
v1 += 4; v1 += 4;
v2 += 4; v2 += 4;
} }
}
} }
void bopti_render_clip(int x, int y, bopti_image_t const *img, int left, int bopti_clip(bopti_image_t const *img, struct rbox *r)
int top, int width, int height, uint32_t *v1, uint32_t *v2)
{ {
/* This load/save is not elegant but it makes GCC use register-only
operations, which is what we need for efficiency */
int x = r->visual_x, y = r->y;
int left = r->left, top = r->top;
int width = r->width, height = r->height;
/* Adjust the bounding box of the input image */ /* Adjust the bounding box of the input image */
if(left < 0) width += left, x -= left, left = 0; if(left < 0) width += left, x -= left, left = 0;
if(top < 0) height += top, y -= top, top = 0; if(top < 0) height += top, y -= top, top = 0;
@ -298,32 +281,34 @@ void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
if(x + width > DWIDTH) width = DWIDTH - x; if(x + width > DWIDTH) width = DWIDTH - x;
if(y + height > DHEIGHT) height = DHEIGHT - y; if(y + height > DHEIGHT) height = DHEIGHT - y;
/* Early finish for empty intersections */ r->visual_x = x;
if(width <= 0 || height <= 0) return; r->y = y;
r->left = left;
r->top = top;
r->width = width;
r->height = height;
/* Finish with the noclip variant */ /* Return non-zero if the result is empty */
bopti_render_noclip(x, y, img, left, top, width, height, v1, v2); return (width <= 0 || height <= 0);
} }
void bopti_render_noclip(int visual_x, int y, bopti_image_t const *img, void bopti_render_noclip(bopti_image_t const *img, struct rbox *r,
int left, int top, int width, int height, uint32_t *v1, uint32_t *v2) uint32_t *v1, uint32_t *v2)
{ {
int left = r->left;
/* Start column and end column (both included) */ /* Start column and end column (both included) */
int cl = (left) >> 5; r->left >>= 5;
int cr = (left + width - 1) >> 5;
/* Finish with the standard bopti renderer */ if(r->columns == 1 && (r->visual_x & 31) + r->width <= 32)
struct rbox rbox = { 0, visual_x, y, width, cl, cr, top, height };
if(cl == cr && (visual_x & 31) + width <= 32)
{ {
rbox.x = (visual_x & 31) - (left & 31); r->x = (left & 31) - (r->visual_x & 31);
bopti_render_scsp(img, &rbox, v1, v2); bopti_render_scsp(img, r, v1, v2);
} }
else else
{ {
/* x-coordinate of the first pixel of the first column */ /* x-coordinate of the first pixel of the first column */
rbox.x = visual_x - (left & 31); r->x = r->visual_x - (left & 31);
bopti_render(img, &rbox, v1, v2); bopti_render(img, r, v1, v2);
} }
} }

View file

@ -2,22 +2,43 @@
#include "render-fx.h" #include "render-fx.h"
#include "bopti-asm.h" #include "bopti-asm.h"
#pragma GCC optimize("O3")
/* dsubimage(): Render a section of an image */ /* dsubimage(): Render a section of an image */
void dsubimage(int x, int y, bopti_image_t const *img, int left, int top, void dsubimage(int x, int y, bopti_image_t const *img, int left, int top,
int width, int height, int flags) int width, int height, int flags)
{ {
DMODE_OVERRIDE(dsubimage, x, y, img, left, top, width, height, flags); struct rbox r = {
0, x, y, width, left, 0, top, height
};
DMODE_OVERRIDE(dsubimage, img, &r, flags);
if(img->gray) return; if(img->gray) return;
if(flags & DIMAGE_NOCLIP) /* Intersect the bounding box with both the source image and the VRAM,
except if DIMAGE_NOCLIP is provided */
if(!(flags & DIMAGE_NOCLIP))
{ {
bopti_render_noclip(x, y, img, left, top, width, height, /* Early finish for empty intersections */
gint_vram, NULL); if(bopti_clip(img, &r)) return;
}
left = r.left;
width = r.width;
int visual_x = r.visual_x;
r.left = left >> 5;
r.columns = ((left + width - 1) >> 5) - r.left + 1;
if(r.columns == 1 && (visual_x & 31) + width <= 32)
{
r.x = (left & 31) - (visual_x & 31);
bopti_render_scsp(img, &r, gint_vram, NULL);
} }
else else
{ {
bopti_render_clip(x, y, img, left, top, width, height, /* x-coordinate of the first pixel of the first column */
gint_vram, NULL); r.x = visual_x - (left & 31);
bopti_render(img, &r, gint_vram, NULL);
} }
} }

View file

@ -7,6 +7,7 @@
#include <gint/defs/types.h> #include <gint/defs/types.h>
#include <gint/display.h> #include <gint/display.h>
#include "bopti-asm.h"
/* masks(): Compute the vram masks for a given rectangle /* masks(): Compute the vram masks for a given rectangle
@ -24,13 +25,10 @@
@masks Stores the result of the function (four uint32_t values) */ @masks Stores the result of the function (four uint32_t values) */
void masks(int x1, int x2, uint32_t *masks); void masks(int x1, int x2, uint32_t *masks);
/* bopti_render_clip(): Render a bopti image with clipping /* bopti_clip(): Clip a bounding box to image and VRAM
@x @y Location of the top-left corner
@img Image encoded by [fxconv] @img Image encoded by [fxconv]
@left @top @w @h Bounding box to render @rbox Rendering box */
@v1 @v2 VRAMs (gray rendering is used if v2 != NULL) */ int bopti_clip(bopti_image_t const *img, struct rbox *rbox);
void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
int top, int w, int h, uint32_t *v1, uint32_t *v2);
/* bopti_render_noclip(): Render a bopti image without clipping /* bopti_render_noclip(): Render a bopti image without clipping
This function is only ever slightly faster than bopti_render_clip(), This function is only ever slightly faster than bopti_render_clip(),
@ -42,8 +40,8 @@ void bopti_render_clip(int x, int y, bopti_image_t const *img, int left,
@img Image encoded by [fxconv] @img Image encoded by [fxconv]
@left @top @w @h Bounding box to render @left @top @w @h Bounding box to render
@v1 @v2 VRAMs (gray rendering is used if v2 != NULL) */ @v1 @v2 VRAMs (gray rendering is used if v2 != NULL) */
void bopti_render_noclip(int x, int y, bopti_image_t const *img, int left, void bopti_render_noclip(bopti_image_t const *img, struct rbox *rbox,
int top, int w, int h, uint32_t *v1, uint32_t *v2); uint32_t *v1, uint32_t *v2);
//--- //---
// Alternate rendering modes // Alternate rendering modes
@ -72,8 +70,7 @@ struct rendering_mode
(int x, int y, int fg, int bg, int halign, int valign, (int x, int y, int fg, int bg, int halign, int valign,
char const *str); char const *str);
void (*dsubimage) void (*dsubimage)
(int x, int y, bopti_image_t const *image, int left, int top, (bopti_image_t const *image, struct rbox *r, int flags);
int width, int height, int flags);
}; };
/* The alternate rendering mode pointer (initially NULL) */ /* The alternate rendering mode pointer (initially NULL) */
@ -89,9 +86,7 @@ void gint_gvline(int y1, int y2, int x, int color);
void gtext_opt void gtext_opt
(int x, int y, int fg, int bg, int halign, int valign, (int x, int y, int fg, int bg, int halign, int valign,
char const *str); char const *str);
void gsubimage void gsubimage(bopti_image_t const *image, struct rbox *r, int flags);
(int x, int y, bopti_image_t const *image, int left, int top,
int width, int height, int flags);
/* Short macro to call the alternate rendering function when available */ /* Short macro to call the alternate rendering function when available */
#define DMODE_OVERRIDE(func, ...) \ #define DMODE_OVERRIDE(func, ...) \