mirror of
https://git.planet-casio.com/Lephenixnoir/gint.git
synced 2025-04-03 09:07:11 +02:00
render-cg: add new image rendering functions with dynamic effects
This commit is contained in:
parent
904ab74984
commit
f219e5c882
31 changed files with 2506 additions and 47 deletions
|
@ -169,6 +169,35 @@ set(SOURCES_CG
|
|||
src/render-cg/gint_dline.c
|
||||
src/render-cg/topti-asm.s
|
||||
src/render-cg/topti.c
|
||||
# Fast image renderer
|
||||
src/render-cg/image/image.c
|
||||
src/render-cg/image/image_rgb16.S
|
||||
src/render-cg/image/image_rgb16_normal.S
|
||||
src/render-cg/image/image_rgb16_clearbg_dye.S
|
||||
src/render-cg/image/image_rgb16_swapcolor.S
|
||||
src/render-cg/image/image_p8.S
|
||||
src/render-cg/image/image_p8_normal.S
|
||||
src/render-cg/image/image_p8_clearbg.S
|
||||
src/render-cg/image/image_p8_swapcolor.S
|
||||
src/render-cg/image/image_p8_dye.S
|
||||
src/render-cg/image/image_p4.S
|
||||
src/render-cg/image/image_p4_normal.S
|
||||
src/render-cg/image/image_p4_clearbg.S
|
||||
src/render-cg/image/image_p4_swapcolor.S
|
||||
src/render-cg/image/image_p4_dye.S
|
||||
# Interface to the fast image renderer
|
||||
src/render-cg/image/image_rgb16.c
|
||||
src/render-cg/image/image_rgb16_effect.c
|
||||
src/render-cg/image/image_rgb16_swapcolor.c
|
||||
src/render-cg/image/image_rgb16_dye.c
|
||||
src/render-cg/image/image_p8.c
|
||||
src/render-cg/image/image_p8_effect.c
|
||||
src/render-cg/image/image_p8_swapcolor.c
|
||||
src/render-cg/image/image_p8_dye.c
|
||||
src/render-cg/image/image_p4.c
|
||||
src/render-cg/image/image_p4_effect.c
|
||||
src/render-cg/image/image_p4_swapcolor.c
|
||||
src/render-cg/image/image_p4_dye.c
|
||||
)
|
||||
|
||||
set(ASSETS_FX src/font5x7.png)
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
//---
|
||||
// gint:display-cg - fxcg50 rendering functions
|
||||
// gint:display-cg - fx-CG 50 rendering functions
|
||||
//
|
||||
// This module covers all 16-bit opaque rendering functions. For
|
||||
// gamma-related functions, color composition, check out a color library.
|
||||
// This module covers rendering functions specific to the fx-CG 50. In addition
|
||||
// to triple-buffering management, this mainly includes image manipulation
|
||||
// tools as well as the very versatile dimage_effect() and dsubimage_effect()
|
||||
// functions that support high-performance image rendering with a number of
|
||||
// geometric and color effects.
|
||||
//
|
||||
// All the functions in this module work on a 396x224 resolution - gint
|
||||
// lets you use the full surface!
|
||||
// The fx-CG OS restricts the display to a 384x216 rectangle roughly around the
|
||||
// center, leaving margins on three sides. However, gint configures the display
|
||||
// to use the full 396x224 surface!
|
||||
//---
|
||||
|
||||
#ifndef GINT_DISPLAY_CG
|
||||
|
@ -18,6 +22,7 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
#include <gint/defs/types.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
/* Dimensions of the VRAM */
|
||||
#define DWIDTH 396
|
||||
|
@ -57,49 +62,9 @@ enum {
|
|||
green is not used). */
|
||||
#define C_RGB(r,g,b) (((r) << 11) | ((g) << 6) | (b))
|
||||
|
||||
//---
|
||||
// Image rendering (bopti)
|
||||
//---
|
||||
/* See <gint/image.h> for the details on image manipulation. */
|
||||
typedef image_t bopti_image_t;
|
||||
|
||||
/* bopti_image_t: Image files encoded for bopti
|
||||
This format is created by the fxSDK's [fxconv] tool from standard images. */
|
||||
typedef struct
|
||||
{
|
||||
/* Color profile (type of palette), could be extended into a bit field
|
||||
later on */
|
||||
uint16_t profile;
|
||||
|
||||
/* Color code assigned to transparent pixels (unused in 16-bit). In
|
||||
P8_RGB565A, the value assigned to alpha is always 0. */
|
||||
uint16_t alpha;
|
||||
|
||||
/* Full width and height, in pixels */
|
||||
uint16_t width;
|
||||
uint16_t height;
|
||||
|
||||
/* Here we lose structure because of the flexible array.
|
||||
|
||||
RGB565, RGB565A:
|
||||
* Pixels in row-major order, 16 bits per pixel
|
||||
P8:
|
||||
* Palette with 256 entries (512 bytes total)
|
||||
* Pixels in row-major order, 8 bits per pixel
|
||||
P8_RGB565A, P8_RGB565:
|
||||
* Number of entries in palette, N (2 bytes)
|
||||
* Palette with N entries (2N bytes)
|
||||
* Pixels in row-major order, 8 bits per pixel (signed indices in
|
||||
an uint16_t array starting at <palette>+<256 bytes>)
|
||||
P4/P4_RGB565A, P4_RGB565:
|
||||
* Palette with 16 entries (32 bytes total)
|
||||
* Pixels in row-major order, 4 bits per pixel, each row
|
||||
byte-padded */
|
||||
uint16_t data[];
|
||||
|
||||
} GPACKED(4) bopti_image_t;
|
||||
|
||||
/* Old alias to image_t, now deprecated because of libimg */
|
||||
typedef bopti_image_t image_t __attribute__((deprecated(
|
||||
"image_t has been renamed to bopti_image_t")));
|
||||
|
||||
//---
|
||||
// Video RAM management
|
||||
|
|
365
include/gint/image.h
Normal file
365
include/gint/image.h
Normal file
|
@ -0,0 +1,365 @@
|
|||
//---
|
||||
// gint:image - Image manipulation and rendering
|
||||
//
|
||||
// Note: this module is currently only available on fx-CG.
|
||||
//
|
||||
// This header provides image manipulation functions. This mainly consists of a
|
||||
// reference-based image format, various access and modification functions, and
|
||||
// a number of high-performance transformations and rendering effects. If you
|
||||
// find yourself limited by rendering time, note that RAM writing speed is
|
||||
// often the bottleneck, and image rendering is much faster in Azur (which is
|
||||
// what the renderer was initially designed for).
|
||||
//
|
||||
// We support 3 bit depths: full-color 16-bit (RGB565), indexed 8-bit (P8) and
|
||||
// indexed 4-bit (P4). All three have an "alpha" variation where one color is
|
||||
// treated as transparent, leading to 6 total formats.
|
||||
//
|
||||
// The image renderers support so-called *dynamic effects*, which are image
|
||||
// transformations performed on-the-fly while rendering, without generating an
|
||||
// intermediate image. They comprise straightforward transformations that
|
||||
// achieve similar performance to straight rendering and can be combined to
|
||||
// some extent, which makes them reliable whenever applicable.
|
||||
//
|
||||
// TODO: Switch to libimg-style image refs.
|
||||
//---
|
||||
|
||||
#ifndef GINT_IMAGE
|
||||
#define GINT_IMAGE
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef FXCG50
|
||||
#error <gint/image.h> is only supported on FXCG50
|
||||
#else
|
||||
|
||||
#include <gint/defs/attributes.h>
|
||||
#include <gint/defs/types.h>
|
||||
|
||||
//---
|
||||
// Image structures
|
||||
//---
|
||||
|
||||
/* Image formats. Note that transparency really only indicates the default
|
||||
rendering method, as a transparent background can always be added or removed
|
||||
by a dynamic effect on any image. */
|
||||
enum {
	/* 16-bit RGB565, every pixel opaque */
	IMAGE_RGB565 = 0,
	/* 16-bit RGB565 where one color value is transparent */
	IMAGE_RGB565A = 1,
	/* Former P8 profile, kept only so that its number stays reserved */
	IMAGE_DEPRECATED_P8 = 2,
	/* 4-bit indexed where one palette entry is transparent */
	IMAGE_P4_RGB565A = 3,
	/* 8-bit indexed, every palette entry opaque */
	IMAGE_P8_RGB565 = 4,
	/* 8-bit indexed where one palette entry is transparent */
	IMAGE_P8_RGB565A = 5,
	/* 4-bit indexed, every palette entry opaque */
	IMAGE_P4_RGB565 = 6,
};
|
||||
|
||||
/* image_t: gint's native bitmap image format
|
||||
Images of this format can be created through this header's API but also by
|
||||
using the fxSDK's built-in image converters with fxconv. */
|
||||
typedef struct
{
	/* Pixel format; holds one of the IMAGE_* profile constants */
	uint16_t profile;
	/* Transparent color value or palette index (alpha formats only) */
	uint16_t alpha;
	/* Width of the complete image, in pixels */
	uint16_t width;
	/* Height of the complete image, in pixels */
	uint16_t height;

	/* Format-dependent payload; its structure cannot be expressed in C
	   because it lives in a flexible array.

	   RGB565 and RGB565A:
	     - pixels, row-major, 16 bits each
	   P8 (deprecated):
	     - 256-entry palette (512 bytes)
	     - pixels, row-major, 8 bits each
	   P8_RGB565 and P8_RGB565A:
	     - palette entry count N (2 bytes)
	     - N palette entries (2N bytes)
	     - pixels, row-major, 8 bits each; they are signed indices into
	       a uint16_t array located at <palette> + <256 bytes>
	   P4_RGB565 and P4_RGB565A:
	     - 16-entry palette (32 bytes)
	     - pixels, row-major, 4 bits each, every row padded to a whole
	       number of bytes */
	uint16_t data[];

} GPACKED(4) image_t;
|
||||
|
||||
/* Dynamic effects: these transformations can be applied on images while
|
||||
rendering. Not all effects can be combined; unless specified otherwise:
|
||||
- HFLIP and VFLIP can both be added regardless of any other effect
|
||||
- At most one color effect can be applied */
|
||||
enum {
	/* Bit 0 (0x01) is deliberately left free: it is DIMAGE_NOCLIP from the
	   old API, which must keep working. */

	/* [Any]: do not clip the command against the source image */
	IMAGE_NOCLIP_INPUT = 1 << 2,
	/* [Any]: do not clip the command against the output VRAM */
	IMAGE_NOCLIP_OUTPUT = 1 << 3,
	/* [Any]: disable both clipping stages */
	IMAGE_NOCLIP = IMAGE_NOCLIP_INPUT | IMAGE_NOCLIP_OUTPUT,

	// Geometric effects. These must stay at bit 8 and above, otherwise
	// gint_image_mkcmd() has to be updated accordingly.

	/* [Any]: mirror the image vertically */
	IMAGE_VFLIP = 1 << 8,
	/* [Any]: mirror the image horizontally */
	IMAGE_HFLIP = 1 << 9,

	// Color effects

	/* [RGB565, P8_RGB565, P4_RGB565]: treat one color as transparent
	   Extra argument:
	   * Color to clear (RGB16: 16-bit value; P8/P4: palette index) */
	IMAGE_CLEARBG = 1 << 4,
	/* [RGB565, P8_RGB565, P4_RGB565]: substitute one color for another
	   Extra arguments:
	   * Color to replace (RGB16: 16-bit value; P8/P4: palette index)
	   * Replacement color (16-bit value) */
	IMAGE_SWAPCOLOR = 1 << 5,
	/* [RGB565A, P8_RGB565A, P4_RGB565A]: fill transparent pixels
	   Extra argument:
	   * Background color (16-bit value) */
	IMAGE_ADDBG = 1 << 6,
	/* [RGB565A, P8_RGB565A, P4_RGB565A]: recolor all opaque pixels
	   Extra argument:
	   * Dye color (16-bit value) */
	IMAGE_DYE = 1 << 7,
};
|
||||
|
||||
//---
|
||||
// Image access and information
|
||||
//---
|
||||
|
||||
/* TODO: Expand */
|
||||
|
||||
int image_get_pixel(image_t const *img, int x, int y);
|
||||
|
||||
int image_decode_pixel(image_t const *img, int pixel);
|
||||
|
||||
//---
|
||||
// Image rendering functions
|
||||
//
|
||||
// The following functions extend dimage() and dsubimage(). The [effects]
|
||||
// parameter takes a combination of IMAGE_* flags and effects, limited to the
|
||||
// combinations previously described, with additional arguments depending on
|
||||
// the color effect being applied.
|
||||
//
|
||||
// dimage_effect(x, y, img, effects, ...)
|
||||
// dsubimage_effect(x, y, img, left, top, w, h, effects, ...)
|
||||
//
|
||||
// However if you use these super-generic functions you will link the code for
|
||||
// all effects and all formats into your add-in, which takes a fair amount of
|
||||
// space. If that's a problem, you can use the more specific functions below:
|
||||
//
|
||||
// * dimage_<FORMAT>_<EFFECT>() for one particular format (rgb16, p8, p4) along
|
||||
// with one particular color effect (clearbg, swapcolor, addbg, dye).
|
||||
// * dimage_<FORMAT>() is like the above when no color effect is applied.
|
||||
//
|
||||
// All of them support the HFLIP and VFLIP flags. For effect-specific functions
|
||||
// the corresponding effect flag can be omitted (e.g. IMAGE_CLEARBG is implicit
|
||||
// when using dimage_p8_clearbg()).
|
||||
//---
|
||||
|
||||
/* dimage_effect(): Generalized dimage() supporting dynamic effects */
|
||||
#define dimage_effect(x, y, img, eff, ...) \
|
||||
dsubimage_effect(x, y, img, 0, 0, (img)->width, (img)->height, eff, \
|
||||
##__VA_ARGS__)
|
||||
/* dsubimage_effect(): Generalized dsubimage() supporting dynamic effects */
|
||||
void dsubimage_effect(int x, int y, image_t const *img,
|
||||
int left, int top, int w, int h, int effects, ...);
|
||||
|
||||
/* Specific versions for each format */
|
||||
#define DIMAGE_SIG1(NAME, ...) \
|
||||
void dimage_ ## NAME(int x, int y, image_t const *img,##__VA_ARGS__); \
|
||||
void dsubimage_ ## NAME(int x, int y, image_t const *img, \
|
||||
int left, int top, int w, int h, ##__VA_ARGS__);
|
||||
#define DIMAGE_SIG(NAME, ...) \
|
||||
DIMAGE_SIG1(rgb16 ## NAME, ##__VA_ARGS__) \
|
||||
DIMAGE_SIG1(p8 ## NAME, ##__VA_ARGS__) \
|
||||
DIMAGE_SIG1(p4 ## NAME, ##__VA_ARGS__)
|
||||
|
||||
/* d[sub]image_{rgb16,p8,p4}_effect(..., effects, <extra arguments>) */
|
||||
DIMAGE_SIG(_effect, int effects, ...)
|
||||
/* d[sub]image_{rgb16,p8,p4}(..., effects) (no color effect, like dimage()) */
|
||||
DIMAGE_SIG(, int effects)
|
||||
/* d[sub]image_{rgb16,p8,p4}_clearbg(..., effects, bg_color_or_index) */
|
||||
DIMAGE_SIG(_clearbg, int effects, int bg_color_or_index)
|
||||
/* d[sub]image_{rgb16,p8,p4}_swapcolor(..., effects, source, replacement) */
|
||||
DIMAGE_SIG(_swapcolor, int effects, int source, int replacement)
|
||||
/* d[sub]image_{rgb16,p8,p4}_addbg(..., effects, bg_color) */
|
||||
DIMAGE_SIG(_addbg, int effects, int bg_color)
|
||||
/* d[sub]image_{rgb16,p8,p4}_dye(..., effects, dye_color) */
|
||||
DIMAGE_SIG(_dye, int effects, int dye_color)
|
||||
|
||||
#define dimage_rgb16_effect(x, y, img, eff, ...) \
|
||||
dsubimage_rgb16_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
|
||||
eff, ##__VA_ARGS__)
|
||||
#define dimage_p8_effect(x, y, img, eff, ...) \
|
||||
dsubimage_p8_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
|
||||
eff, ##__VA_ARGS__)
|
||||
#define dimage_p4_effect(x, y, img, eff, ...) \
|
||||
dsubimage_p4_effect(x, y, img, 0, 0, (img)->width, (img)->height, \
|
||||
eff, ##__VA_ARGS__)
|
||||
|
||||
#undef DIMAGE_SIG
|
||||
#undef DIMAGE_SIG1
|
||||
|
||||
//---
|
||||
// Clipping utilities
|
||||
//---
|
||||
|
||||
/* Double box specifying both a source and target area */
|
||||
struct gint_image_box
{
	/* Where the top-left corner of the sub-image lands in the output */
	int x;
	int y;
	/* Size of the sub-image being rendered */
	int w;
	int h;
	/* Top-left corner of the sub-image inside the source image; the
	   source area spans [left, left+w) x [top, top+h) */
	int left;
	int top;
};
|
||||
|
||||
/* Clip the provided box against the input. If, after clipping, the box no
|
||||
longer intersects the output (whose size is specified as out_w/out_h),
|
||||
returns false. Otherwise, returns true. */
|
||||
bool gint_image_clip_input(image_t const *img, struct gint_image_box *box,
|
||||
int out_w, int out_h);
|
||||
|
||||
/* Clip the provided box against the output. */
|
||||
void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h);
|
||||
|
||||
//---
|
||||
// Internal image rendering routines
|
||||
//
|
||||
// The following functions (or non-functions) are implemented in assembler and
|
||||
// make up the internal interface of the image renderer. If you just want to
|
||||
// display images, use dimage() and variations; these are only useful if you
|
||||
// have a different rendering system and wish to use image rendering with
|
||||
// dynamic effects in it.
|
||||
//---
|
||||
|
||||
/* Renderer command. This structure includes most of the information used by
|
||||
the image renderer to perform blits. Some of the information on the target
|
||||
is also passed as direct arguments, which is more convenient and slightly
|
||||
faster.
|
||||
|
||||
Most of the values here can be set with gint_image_mkcmd(). The last two
|
||||
members, along with the return values of the gint_image_FORMAT_loop()
|
||||
functions, are used to update the command if one needs to draw *parts* of
|
||||
the image and resume the rendering later. This is used in Azur. */
|
||||
struct gint_image_cmd
{
	/* NOTE: the assembler renderers read these members sequentially by
	   position, so neither their order nor their types may change. */

	/* Shader ID; meaningful to Azur only, gint does not look at it */
	uint8_t shader_id;
	/* Dynamic effects:
	     Bit 0: VFLIP
	     Bit 1: HFLIP
	     Bits 2-7: 0=NONE, 1=CLEARBG, 2=SWAPCOLOR, 3=DYE */
	uint8_t effect;

	/* Pixels rendered on each line. When the format forces x or width
	   alignment (most do), this has already been rounded to a suitable
	   multiple (usually a multiple of 2). */
	int16_t columns;

	/* Distance between two consecutive input rows, in pixels; the number
	   of columns is not subtracted from it */
	int16_t input_stride;

	/* Lines rendered by this command. May be changed freely, which Azur
	   uses for fragmented rendering. */
	uint8_t lines;

	/* [Any effect]: offset of the first (left) edge */
	int8_t edge_1;

	/* Core loop; an internal label of the renderer */
	void const *loop;
	/* Output pixel array, already offset by the target x/y */
	void const *output;
	/* Input pixel array, already offset by the source x/y and adjusted
	   for formats that force x alignment */
	void const *input;
	/* Palette, for formats that have one */
	uint16_t const *palette;

	/* [Any effect]: offset of the second (right) edge */
	int16_t edge_2;
	/* [CLEARBG, SWAPCOLOR]: source color */
	uint16_t color_1;
	/* [SWAPCOLOR]: destination color */
	uint16_t color_2;

	/* Height left to render (updated between fragments) */
	int16_t height;
	/* Local x position (updated between fragments) */
	int16_t x;
};
|
||||
|
||||
/* gint_image_mkcmd(): Prepare a rendering command with dynamic effects
|
||||
|
||||
This function crafts an image renderer command. It loads all the settings
|
||||
except for effect-dependent parameters: the [.loop] label, the color section
|
||||
of [.effect], and color effect settings. See the effect-specific functions
|
||||
to see how they are defined.
|
||||
|
||||
The benefit of this approach is that the rendering code does not need to be
|
||||
linked in unless an effect is actually used, which avoids blowing up the
|
||||
size of the add-in as the number of supported dynamic effects increases.
|
||||
|
||||
@box Requested on-screen box (will be clipped depending on effects)
|
||||
@img Source image
|
||||
@effects Set of dynamic effects to be applied, as an [IMAGE_*] bitmask
|
||||
@left_edge Whether to force 2-alignment on the input (box->left)
|
||||
@right_edge Whether to force 2-alignment on the width
|
||||
@cmd Command to be filled
|
||||
@out_width Output width (usually DWIDTH)
|
||||
@out_height Output height (usually DHEIGHT)
|
||||
|
||||
Returns false if there is nothing to render because of clipping (in which
|
||||
case [cmd] is unchanged), true otherwise. [*box] is also updated to reflect
|
||||
the final box after clipping but not accounting for edges. */
|
||||
bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img,
|
||||
int effects, bool left_edge, bool right_edge,
|
||||
struct gint_image_cmd *cmd, int out_width, int out_height);
|
||||
|
||||
/* Entry point of the renderers. These functions can be called normally as long
|
||||
as you can build the commands (eg. by using gint_image_mkcmd() then filling
|
||||
the effect-specific information). */
|
||||
void *gint_image_rgb16_loop (int output_width, struct gint_image_cmd *cmd);
|
||||
void *gint_image_p8_loop (int output_width, struct gint_image_cmd *cmd);
|
||||
void *gint_image_p4_loop (int output_width, struct gint_image_cmd *cmd);
|
||||
|
||||
/* Renderer fragments. The following can absolutely not be called from C code
|
||||
as they aren't full functions (and this isn't their prototype). These are
|
||||
continuations to be specified in the [.loop] field of a command before using
|
||||
one of the functions above. */
|
||||
|
||||
void gint_image_rgb16_normal(void);
|
||||
void gint_image_rgb16_clearbg(void);
|
||||
void gint_image_rgb16_swapcolor(void);
|
||||
void gint_image_rgb16_dye(void);
|
||||
|
||||
void gint_image_p8_normal(void);
|
||||
void gint_image_p8_clearbg(void);
|
||||
void gint_image_p8_swapcolor(void);
|
||||
void gint_image_p8_dye(void);
|
||||
|
||||
void gint_image_p4_normal(void);
|
||||
void gint_image_p4_clearbg(void);
|
||||
void gint_image_p4_swapcolor(void);
|
||||
void gint_image_p4_dye(void);
|
||||
|
||||
#endif /* FXCG50 */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* GINT_IMAGE */
|
107
src/render-cg/image/image.c
Normal file
107
src/render-cg/image/image.c
Normal file
|
@ -0,0 +1,107 @@
|
|||
#include <gint/image.h>
|
||||
#include <gint/display.h>
|
||||
|
||||
/* gint_image_clip_input(): Clip a rendering box against the source image

   The source area [left, left+w) x [top, top+h) of the box is intersected
   with the bounds of the image. When the top or left side is trimmed, the
   target position (x, y) moves by the same amount so that the remaining
   pixels land where they would have without clipping.

   @img    Source image; only its width and height are read
   @b      Box to clip, updated in place
   @out_w  Width of the output, in pixels
   @out_h  Height of the output, in pixels

   Returns false if the clipped box is empty or lies entirely outside of the
   output, true otherwise. */
bool gint_image_clip_input(image_t const *img, struct gint_image_box *b,
	int out_w, int out_h)
{
	/* Trim the part of the box that starts before the image */
	if(b->left < 0) {
		b->w += b->left;
		b->x -= b->left;
		b->left = 0;
	}
	if(b->top < 0) {
		b->h += b->top;
		b->y -= b->top;
		b->top = 0;
	}

	/* Trim the part of the box that extends past the image */
	if(b->left + b->w > img->width)
		b->w = img->width - b->left;
	if(b->top + b->h > img->height)
		b->h = img->height - b->top;

	/* Nothing left to draw */
	if(b->w <= 0 || b->h <= 0)
		return false;

	/* Entirely left or right of the output */
	if(b->x + b->w <= 0 || b->x >= out_w)
		return false;

	/* Entirely above or below the output. The vertical extent is the
	   height of the box; testing the width here would let boxes that are
	   wider than tall through even when they end above the output. */
	if(b->y + b->h <= 0 || b->y >= out_h)
		return false;

	return true;
}
|
||||
|
||||
void gint_image_clip_output(struct gint_image_box *b, int out_w, int out_h)
|
||||
{
|
||||
/* Intersect with the bounding box on-screen */
|
||||
if(b->y < 0) b->top -= b->y, b->h += b->y, b->y = 0;
|
||||
if(b->y + b->h > out_h) b->h = (out_h - b->y);
|
||||
if(b->x < 0) b->left -= b->x, b->w += b->x, b->x = 0;
|
||||
if(b->x + b->w > out_w) b->w = (out_w - b->x);
|
||||
}
|
||||
|
||||
/* gint_image_mkcmd(): Fill a renderer command from a box and an image

   Clips the box (unless disabled through the effects), then sets every
   member of the command that does not depend on the color effect: flip
   bits, column count, input stride, input/output pointers, palette, edge
   offsets, line count, remaining height and local x. The [.loop] member,
   the color part of [.effect] and the color members are left to the caller.

   Returns false, without writing to the command, if input clipping finds
   nothing to render. */
bool gint_image_mkcmd(struct gint_image_box *box, image_t const *img,
	int effects, bool left_edge, bool right_edge,
	struct gint_image_cmd *cmd, int out_width, int out_height)
{
	/* The legacy DIMAGE_NOCLIP flag disables both clipping stages */
	if(effects & DIMAGE_NOCLIP)
		effects |= IMAGE_NOCLIP;

	bool const clip_in = !(effects & IMAGE_NOCLIP_INPUT);
	bool const clip_out = !(effects & IMAGE_NOCLIP_OUTPUT);

	if(clip_in && !gint_image_clip_input(img, box, out_width, out_height))
		return false;
	if(clip_out)
		gint_image_clip_output(box, out_width, out_height);

	bool const hflip = (effects & IMAGE_HFLIP) != 0;
	bool const vflip = (effects & IMAGE_VFLIP) != 0;

	/* VFLIP/HFLIP sit at bits 8-9 and become bits 0-1 of the command */
	cmd->effect = (effects & (IMAGE_VFLIP | IMAGE_HFLIP)) >> 8;
	cmd->columns = box->w;
	cmd->input_stride = img->width;
	cmd->x = box->x;
	cmd->edge_1 = -1;
	cmd->edge_2 = -1;

	/* Byte view of the payload; all offsets below are counted in bytes */
	uint8_t const *bytes = (uint8_t const *)img->data;

	/* With VFLIP the input is read starting from the last row */
	int first_row = box->top;
	if(vflip)
		first_row += box->h - 1;

	switch(img->profile) {
	case IMAGE_RGB565:
	case IMAGE_RGB565A:
		/* Round the stride up to an even number of pixels */
		cmd->input_stride += (cmd->input_stride & 1);
		cmd->input = bytes +
			(first_row * cmd->input_stride + box->left) * 2;
		break;

	case IMAGE_P8_RGB565:
	case IMAGE_P8_RGB565A:
		/* Pixels follow the 2-byte palette size and the palette */
		cmd->input = bytes + img->data[0] * 2 + 2 +
			(first_row * img->width + box->left);
		cmd->palette = (uint16_t const *)(bytes + 258);
		break;

	default:
		/* 4-bit formats: 32-byte palette, then byte-padded rows */
		cmd->input = bytes + 32 +
			first_row * ((img->width + 1) >> 1) + (box->left >> 1);
		cmd->palette = img->data;
		/* Without a forced left edge, edge_1 carries the parity of the
		   source x so that loops ignoring edges still work */
		if(!left_edge)
			cmd->edge_1 = (box->left & 1);
		break;
	}

	/* Odd source x: widen the command by one pixel on the input's left */
	if(left_edge && (box->left & 1)) {
		if(hflip) {
			cmd->edge_1 = cmd->columns;
		}
		else {
			cmd->x--;
			cmd->edge_1 = 0;
		}
		cmd->columns++;
	}
	/* Odd width: widen the command by one pixel on the input's right */
	if(right_edge && (cmd->columns & 1)) {
		if(hflip) {
			cmd->x--;
			cmd->edge_1++;
			cmd->edge_2 = 0;
		}
		else {
			cmd->edge_2 = cmd->columns;
		}
		cmd->columns++;
	}

	/* Settings for further updates */
	cmd->height = box->h;

	/* gint's defaults; Azur overwrites them */
	cmd->lines = box->h;
	cmd->output = (uint8_t *)gint_vram + (DWIDTH * box->y + cmd->x) * 2;
	return true;
}
|
25
src/render-cg/image/image_macros.S
Normal file
25
src/render-cg/image/image_macros.S
Normal file
|
@ -0,0 +1,25 @@
|
|||
/* START: Sets up the inner and outer loop. The outer loop is anything between
|
||||
the calls to macros START and END, while the inner loop is the code between
|
||||
labels 2: and 3: (both *INCLUDED*). */
|
||||
.macro START
|
||||
/* Repeat start address: label 2, the first instruction of the inner loop */
ldrs 2f
|
||||
/* Repeat end address: label 3, the last instruction of the inner loop */
ldre 3f
|
||||
/* Label 1 is the top of the outer loop (END branches back to it); the
   repeat counter is reloaded from r2 on every pass */
1: ldrc r2
|
||||
nop
|
||||
.endm
|
||||
|
||||
/* END: Finishes the outer loop and adds strides. */
|
||||
.macro END
|
||||
/* Decrement r1; T becomes 1 once it reaches zero */
dt r1
|
||||
/* r3 += r4 (in the P4 renderer: input pointer += input stride) */
add r4, r3
|
||||
/* Go back to label 1, defined by START, while T is 0 */
bf.s 1b
|
||||
/* Delay slot: r5 += r6 (in the P4 renderer: output pointer += stride) */
add r6, r5
|
||||
.endm
|
||||
|
||||
/* EPILOGUE: Finishes the call by reloading registers saved in the prologue. */
|
||||
.macro EPILOGUE
|
||||
/* Pop r9 here and r8 in the delay slot of rts below */
mov.l @r15+, r9
|
||||
/* Return value: current value of r3 (the input pointer in the P4 renderer) */
mov r3, r0
|
||||
rts
|
||||
/* Delay slot of rts */
mov.l @r15+, r8
|
||||
.endm
|
86
src/render-cg/image/image_p4.S
Normal file
86
src/render-cg/image/image_p4.S
Normal file
|
@ -0,0 +1,86 @@
|
|||
.global _gint_image_p4_loop
|
||||
|
||||
/* gint's image renderer: 4-bit indexed entry point
|
||||
|
||||
P4 compacts pixel data further than P8 by restricting values to a 16-color
|
||||
palette and packing 2 pixels in each byte. This severely restricts our
|
||||
ability to use sub-images because odd positions land within bytes.
|
||||
|
||||
Fortunately, we can solve this by using more edge pixels. The simplest way
|
||||
to write a P4 loop is to process 2 pixels from a 2-aligned source image
|
||||
position in a single iteration. Other structures don't even come close in
|
||||
terms of CPU performance (which, as a reminder, is the main bottleneck in
|
||||
Azur but not in gint): selecting nibbles individually is too long, while not
|
||||
unrolling is still clearly inefficient. So it becomes very important to
|
||||
forcibly align the sub-image on byte-aligned input boundaries and stick to
|
||||
that grid.
|
||||
|
||||
Obviously, this approach causes up to one extra pixel to be overwritten on
|
||||
each side of every line. We solve this problem by adding *another* edge
|
||||
pixel on the left side. In the renderer this is called the left edge or
|
||||
edge_1, while the standard one is called right edge or edge_2.
|
||||
|
||||
r0: - (initially: cmd.effect)
|
||||
r1: Number of lines remaining to draw
|
||||
r2: Number of columns per line
|
||||
r3: Input pointer
|
||||
r4: Input stride
|
||||
r5: Output pointer
|
||||
r6: Output stride
|
||||
r7: Right edge pointer
|
||||
r8: - (initially: cmd)
|
||||
r9: - (initially: cmd.loop)
|
||||
r10: Left edge pointer */
|
||||
|
||||
_gint_image_p4_loop:
|
||||
/* r4: int output_width (pixels)
|
||||
r5: struct gint_image_cmd *cmd */
|
||||
|
||||
mov.b @(1,r5), r0 /* cmd.effect */
|
||||
/* Skip shader_id and effect so that r5 points to cmd.columns */
add #2, r5
|
||||
|
||||
mov.w @r5+, r2 /* cmd.columns */
|
||||
mov r4, r6
|
||||
|
||||
/* r8, r9 and r10 are pushed in this routine; EPILOGUE pops r9 and r8, and
   the loop fragment is expected to pop r10 first (P4 CLEARBG does) */
mov.l r8, @-r15
|
||||
mov r5, r8
|
||||
|
||||
/* From here on the command is r8 */
|
||||
|
||||
mov.l r9, @-r15
|
||||
/* r6 = output_width - columns, in pixels */
sub r2, r6
|
||||
|
||||
mov.w @r8+, r4 /* cmd.input_stride */
|
||||
/* r6 *= 2: output stride in bytes */
add r6, r6
|
||||
|
||||
mov.b @r8+, r1 /* cmd.lines */
|
||||
/* Halve the stride; the bit shifted out stays in T until addc below */
shlr r4
|
||||
|
||||
mov.l r10, @-r15
|
||||
/* cmd.lines is unsigned but mov.b sign-extends, so zero-extend it */
extu.b r1, r1
|
||||
|
||||
mov.b @r8+, r10 /* cmd.edge_1 */
|
||||
nop
|
||||
|
||||
mov #0, r9
|
||||
addc r9, r4 /* r4 = (img.width + 1) >> 1 */
|
||||
|
||||
/* cmd.loop: continuation jumped to at the end of this routine */
mov.l @r8+, r9
|
||||
/* After this shift, bit 0 of r0 is HFLIP (tested by the loop fragment) */
shlr r0 /* T bit is now VFLIP */
|
||||
|
||||
mov.l @r8+, r5 /* cmd.output */
|
||||
nop
|
||||
|
||||
/* The delay slot below runs whether or not the branch is taken */
bf.s _NO_VFLIP
|
||||
mov.l @r8+, r3 /* cmd.input */
|
||||
|
||||
_VFLIP:
|
||||
/* Negative row stride: input rows are walked in reverse order */
neg r4, r4
|
||||
nop
|
||||
|
||||
_NO_VFLIP:
|
||||
/* r7 = columns >> 1, T = columns & 1 */
mov r2, r7
|
||||
shlr r7
|
||||
|
||||
/* Continue in the effect-specific loop; r8 now points to cmd.palette */
jmp @r9
|
||||
/* Delay slot: r4 -= r7 + T */
subc r7, r4
|
42
src/render-cg/image/image_p4.c
Normal file
42
src/render-cg/image/image_p4.c
Normal file
|
@ -0,0 +1,42 @@
|
|||
#include <gint/image.h>
|
||||
#include <gint/display.h>
|
||||
|
||||
/* dimage_p4(): Render a whole P4 image at (x, y), with optional flips */
void dimage_p4(int x, int y, image_t const *img, int eff)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4(x, y, img, 0, 0, full_w, full_h, eff);
}
|
||||
|
||||
/* dsubimage_p4(): Render a section of a P4 image without color effect

   Images in the P4_RGB565A format are forwarded to dsubimage_p4_clearbg()
   with the image's own alpha index, so that their transparent color is not
   drawn. Other images are rendered with the "normal" loop.

   @x @y      Target position of the top-left corner
   @img       Source image
   @left @top Top-left corner of the section within the image
   @w @h      Size of the section
   @eff       IMAGE_* flags (clipping control, HFLIP, VFLIP) */
void dsubimage_p4(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff)
{
	if(img->profile == IMAGE_P4_RGB565A) {
		/* Plain call followed by return: ISO C does not allow a return
		   statement with an expression in a void function */
		dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff,
			img->alpha);
		return;
	}

	struct gint_image_box box = { x, y, w, h, left, top };
	struct gint_image_cmd cmd;

	/* No forced edges: the normal loop relies on the default edge_1 */
	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
		DHEIGHT))
		return;

	cmd.loop = gint_image_p4_normal;
	gint_image_p4_loop(DWIDTH, &cmd);
}
|
||||
|
||||
/* dimage_p4_clearbg(): Render a whole P4 image, skipping palette index bg */
void dimage_p4_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
|
||||
|
||||
/* dsubimage_p4_clearbg(): Render a section of a P4 image with CLEARBG

   Pixels whose palette index equals bg_color are not drawn. Both edges are
   requested from gint_image_mkcmd(), which aligns the command on pairs of
   pixels as the P4 CLEARBG loop expects. */
void dsubimage_p4_clearbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_box box = {
		.x = x, .y = y,
		.w = w, .h = h,
		.left = left, .top = top,
	};
	struct gint_image_cmd cmd;

	bool const visible = gint_image_mkcmd(&box, img, eff, true, true,
		&cmd, DWIDTH, DHEIGHT);
	if(!visible)
		return;

	/* Color effect field (bits 2 and up of .effect): 1 = CLEARBG */
	cmd.effect += 4;
	cmd.color_1 = bg_color;
	cmd.loop = gint_image_p4_clearbg;
	gint_image_p4_loop(DWIDTH, &cmd);
}
|
153
src/render-cg/image/image_p4_clearbg.S
Normal file
153
src/render-cg/image/image_p4_clearbg.S
Normal file
|
@ -0,0 +1,153 @@
|
|||
.global _gint_image_p4_clearbg
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P4 CLEARBG, RAM version: by NULL canceling.
|
||||
|
||||
This function is similar to P8 CLEARBG. Transparent pixels are not limited
|
||||
by RAM writing speed, so a tight CPU loop is used. See P8 CLEARBG for an
|
||||
explanation of NULL canceling.
|
||||
|
||||
r0: [temporary]
|
||||
r7: Right edge pointer
|
||||
r8: Alpha value
|
||||
r9: Palette
|
||||
r10: Left edge pointer
|
||||
r11: Nullable output pointer
|
||||
r12: 0 (in outer loop: edge stride)
|
||||
r13: [temporary]
|
||||
r14: [temporary]
|
||||
|
||||
Spilled to stack:
|
||||
@(-12,r15): Right edge value
|
||||
@(-8,r15): Left edge value
|
||||
@(-4,r15): Edge stride */
|
||||
|
||||
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
/* Macro arguments, as used below:
   HFLIP    Non-zero assembles the extra setup for the mirrored variant
   OUT_DIR  Signed byte step added to r5 after each pair of pixels
   TMP1     Scratch register holding the current input byte
   TMP2     Scratch register for the shift amount and the nibble mask
   OFF1     Store displacement for the pixel taken from the low nibble
   OFF2     Store displacement for the pixel taken from the high nibble */
/* r2 = columns / 2: one inner iteration per input byte (two pixels) */
shlr r2
|
||||
nop
|
||||
|
||||
/* edge_1 as a byte offset (2 bytes per output pixel) */
add r10, r10
|
||||
nop
|
||||
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
mov r2, r0
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
/* r0 = 4 * r2: bytes written per output row */
shll2 r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
/* edge_2 as a byte offset */
shll r7
|
||||
|
||||
mov.l r11, @-r15
|
||||
/* r7 = right edge pointer on the first row */
add r5, r7
|
||||
|
||||
/* r12 = row bytes + output stride: distance between two output rows, used
   to move both edge pointers from one row to the next */
mov r0, r12
|
||||
add r6, r12
|
||||
|
||||
mov.l r13, @-r15
|
||||
/* r10 = left edge pointer on the first row */
add r5, r10
|
||||
|
||||
mov.l r14, @-r15
|
||||
/* Bias the output pointer by -4 bytes; stores add OFF1/OFF2 to it */
add #-4, r5
|
||||
|
||||
mov.w @r8, r8 /* cmd.color_1 */
|
||||
add #-1, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
.if \HFLIP
|
||||
/* Mirrored variant: start from the end of the row... */
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
|
||||
nop
|
||||
|
||||
/* ...and add twice the row size to the stride, as r5 moves backwards */
add r0, r6
|
||||
nop
|
||||
|
||||
.endif
|
||||
|
||||
shll r8 /* alpha*2 compares against palette offsets */
|
||||
nop
|
||||
|
||||
START
|
||||
|
||||
/* Preload the first input byte of the row */
mov.b @r3+, \TMP1
|
||||
nop
|
||||
|
||||
mov.w @r7, r0 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.l r0, @-r15
|
||||
/* Input byte * 2, so that masked nibbles are palette byte offsets */
shll \TMP1
|
||||
|
||||
mov.w @r10, r0 /* Save left edge */
|
||||
nop
|
||||
|
||||
mov.l r0, @-r15
|
||||
nop
|
||||
|
||||
/* Spill the edge stride; r12 must be 0 inside the inner loop for addc */
mov.l r12, @-r15
|
||||
mov #0, r12
|
||||
|
||||
2: mov \TMP1, r0
|
||||
/* r0 = 2 * low nibble */
and #0x1e, r0
|
||||
|
||||
/* T = 1 if this pixel has the transparent index */
cmp/eq r0, r8
|
||||
mov #-1, r11
|
||||
|
||||
/* r11 = -1 + T: all ones if opaque, 0 if transparent */
addc r12, r11
|
||||
mov #-4, \TMP2
|
||||
|
||||
/* r11 = output pointer if opaque, NULL if transparent */
and r5, r11
|
||||
/* Palette lookup for the low nibble */
mov.w @(r0,r9), r0
|
||||
|
||||
/* Shift right by 4 to bring the high nibble into the masked bits */
shld \TMP2, \TMP1
|
||||
mov #0x1e, \TMP2
|
||||
|
||||
/* TMP1 = 2 * high nibble */
and \TMP2, \TMP1
|
||||
/* Store through the nullable pointer (NULL canceling) */
mov.w r0, @(\OFF1,r11)
|
||||
|
||||
/* Same transparency test and pointer canceling for the high nibble */
cmp/eq \TMP1, r8
|
||||
mov #-1, r11
|
||||
|
||||
addc r12, r11
|
||||
mov \TMP1, r0
|
||||
|
||||
and r5, r11
|
||||
/* Load the next input byte ahead of time (see stride compensation) */
mov.b @r3+, \TMP1
|
||||
|
||||
/* r11 was derived from r5 before this step, so the store is unaffected */
add #\OUT_DIR, r5
|
||||
mov.w @(r0,r9), r0
|
||||
|
||||
mov.w r0, @(\OFF2,r11)
|
||||
3: shll \TMP1
|
||||
|
||||
/* End of row: recover the edge stride, then put back both edge pixels */
mov.l @r15+, r12
|
||||
nop
|
||||
|
||||
mov.l @r15+, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @r10 /* Restore left edge */
|
||||
add r12, r10
|
||||
|
||||
mov.l @r15+, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
/* Pop the registers pushed by this macro (r14, r13, r11, r12), then r10
   which was pushed by gint_image_p4_loop; EPILOGUE handles r9 and r8 */
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r10
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
_gint_image_p4_clearbg:
|
||||
/* r0 is cmd.effect shifted right once by gint_image_p4_loop, so bit 0 is
   HFLIP here; tst sets T when that bit is clear */
tst #1, r0
|
||||
/* HFLIP set: use the mirrored loop at label 9 */
bf 9f
|
||||
|
||||
/* Forward loop: output advances by 4 bytes per pair of pixels */
GEN_CLEARBG_LOOP 0, 4, r13, r14, 6, 4
|
||||
/* Mirrored loop: output moves back by 4 bytes per pair of pixels */
9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 0, 2
|
147
src/render-cg/image/image_p4_dye.S
Normal file
147
src/render-cg/image/image_p4_dye.S
Normal file
|
@ -0,0 +1,147 @@
|
|||
.global _gint_image_p4_dye
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P4 DYE, RAM version: by NULL canceling.
|
||||
|
||||
Like with P8, this effect removes most of the complexity because there is no
|
||||
longer any need to index the palette. However the decoding still takes a lot
|
||||
of EX work so the performance is not as good. Since there are transparent
|
||||
areas, Azur's CPU-bound version is at least to some extent faster than
|
||||
bopti, so that's what we're using.
|
||||
|
||||
See P8 CLEARBG for an explanation of NULL canceling.
|
||||
|
||||
r0: Dye value
|
||||
r7: Right edge pointer
|
||||
r8: Alpha value
|
||||
r9: 0 (to neutralize addc during NULL-cancelling)
|
||||
r10: Left edge pointer
|
||||
r11: Nullable output pointer
|
||||
r12: Edge stride
|
||||
r13: [temporary]
|
||||
r14: [temporary]
|
||||
|
||||
Spilled to stack:
|
||||
@(-8,r15): Right edge value
|
||||
@(-4,r15): Left edge value */
|
||||
|
||||
.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
shlr r2
|
||||
nop
|
||||
|
||||
add r10, r10
|
||||
nop
|
||||
|
||||
mov.l @r8+, r0 /* cmd.palette (don't care) */
|
||||
mov r2, r0
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
shll2 r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
shll r7
|
||||
|
||||
mov.l r11, @-r15
|
||||
add r5, r7
|
||||
|
||||
mov r0, r12
|
||||
add r6, r12
|
||||
|
||||
mov.l r13, @-r15
|
||||
add r5, r10
|
||||
|
||||
mov.l r14, @-r15
|
||||
add #-4, r5
|
||||
|
||||
.if \HFLIP
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
mov.w @(2,r8), r0 /* cmd.color_2 (dye value) */
|
||||
add #-1, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.w @r8, r8 /* cmd.color_1 (alpha value) */
|
||||
nop
|
||||
|
||||
START
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
nop
|
||||
|
||||
mov.w @r7, \TMP2 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.l \TMP2, @-r15
|
||||
mov #0x0f, \TMP2
|
||||
|
||||
mov.w @r10, r9 /* Save left edge */
|
||||
and \TMP1, \TMP2
|
||||
|
||||
mov.l r9, @-r15
|
||||
mov #0, r9
|
||||
|
||||
2: cmp/eq \TMP2, r8
|
||||
mov #-1, r11
|
||||
|
||||
addc r9, r11
|
||||
mov #-4, \TMP2
|
||||
|
||||
and r5, r11
|
||||
nop
|
||||
|
||||
shld \TMP2, \TMP1
|
||||
mov #0x0f, \TMP2
|
||||
|
||||
and \TMP2, \TMP1
|
||||
mov.w r0, @(\OFF1,r11)
|
||||
|
||||
cmp/eq \TMP1, r8
|
||||
mov #-1, r11
|
||||
|
||||
addc r9, r11
|
||||
mov.b @r3+, \TMP1
|
||||
|
||||
and r5, r11
|
||||
nop
|
||||
|
||||
mov #0x0f, \TMP2
|
||||
and \TMP1, \TMP2
|
||||
|
||||
add #\OUT_DIR, r5
|
||||
3: mov.w r0, @(\OFF2,r11)
|
||||
|
||||
mov.l @r15+, \TMP2
|
||||
nop
|
||||
|
||||
mov.w \TMP2, @r10 /* Restore left edge */
|
||||
add r12, r10
|
||||
|
||||
mov.l @r15+, \TMP2
|
||||
nop
|
||||
|
||||
mov.w \TMP2, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r10
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point: bit 0 of r0 selects the HFLIP variant of the dye loop */
_gint_image_p4_dye:
|
||||
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
|
||||
bf 9f /* Bit 0 set: take the HFLIP=1 expansion at 9: */
|
||||
|
||||
GEN_DYE_LOOP 0, 4, r13, r14, 6, 4 /* HFLIP=0, OUT_DIR=+4, OFF1=6, OFF2=4 */
|
||||
9: GEN_DYE_LOOP 1, -4, r13, r14, 0, 2 /* HFLIP=1, OUT_DIR=-4, OFF1=0, OFF2=2 */
|
23
src/render-cg/image/image_p4_dye.c
Normal file
23
src/render-cg/image/image_p4_dye.c
Normal file
|
@ -0,0 +1,23 @@
|
|||
#include <gint/display.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
/* dimage_p4_dye(): Render a whole P4 image with the dye effect
   Forwards to dsubimage_p4_dye() with a source box starting at (0,0) and
   spanning img->width by img->height. */
void dimage_p4_dye(int x, int y, image_t const *img, int eff, int dye_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
|
||||
|
||||
/* dsubimage_p4_dye(): Render a section of a P4 image with the dye effect
   Builds a render command for the box (left, top, w, h) drawn at (x, y). If
   gint_image_mkcmd() returns false nothing is rendered. Otherwise the
   command carries img->alpha as color_1 and [dye_color] as color_2, and is
   run through gint_image_p4_loop() with gint_image_p4_dye as inner loop. */
void dsubimage_p4_dye(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int dye_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, true, true, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_p4_dye;
		command.color_2 = dye_color;
		command.color_1 = img->alpha;
		/* NOTE(review): +12 presumably tags the DYE variant in the
		   effect field; confirm against gint_image_p4_loop */
		command.effect += 12;

		gint_image_p4_loop(DWIDTH, &command);
	}
}
|
32
src/render-cg/image/image_p4_effect.c
Normal file
32
src/render-cg/image/image_p4_effect.c
Normal file
|
@ -0,0 +1,32 @@
|
|||
#include <gint/image.h>
|
||||
|
||||
/* dsubimage_p4_effect(): Render a section of a P4 image with a dynamic effect
   Tests the effect bits of [eff] in a fixed order (CLEARBG, SWAPCOLOR,
   ADDBG, DYE) and forwards to the first matching renderer, reading the int
   arguments that effect needs from the variadic list. When no dynamic
   effect bit is set, the plain dsubimage_p4() is used. */
void dsubimage_p4_effect(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, ...)
{
	va_list extra;
	va_start(extra, eff);

	if(eff & IMAGE_CLEARBG) {
		dsubimage_p4_clearbg(x, y, img, left, top, w, h, eff,
			va_arg(extra, int));
	}
	else if(eff & IMAGE_SWAPCOLOR) {
		/* Two arguments, read in order: source color, then new color */
		int const swap_from = va_arg(extra, int);
		int const swap_to = va_arg(extra, int);
		dsubimage_p4_swapcolor(x, y, img, left, top, w, h, eff,
			swap_from, swap_to);
	}
	else if(eff & IMAGE_ADDBG) {
		dsubimage_p4_addbg(x, y, img, left, top, w, h, eff,
			va_arg(extra, int));
	}
	else if(eff & IMAGE_DYE) {
		dsubimage_p4_dye(x, y, img, left, top, w, h, eff,
			va_arg(extra, int));
	}
	else {
		dsubimage_p4(x, y, img, left, top, w, h, eff);
	}

	va_end(extra);
}
|
125
src/render-cg/image/image_p4_normal.S
Normal file
125
src/render-cg/image/image_p4_normal.S
Normal file
|
@ -0,0 +1,125 @@
|
|||
.global _gint_image_p4_normal
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P4 Opaque rendering, VRAM version: by unrolling without edge pixels.
|
||||
|
||||
This is the most unique function in the renderer, Azur included. A P4 image
|
||||
cannot reasonably be decoded on a per-pixel basis because extracting half-
|
||||
bytes is too slow. But using edge pixels results in extra write surface that
|
||||
makes us slower than bopti in gint 2.7.
|
||||
|
||||
This loop is thus the only one to implement 2-unrolling (no pipeline) while
|
||||
manually avoiding the writes that a pair of edge pixels usually fix. Subtle
|
||||
adjustments to strides are involved, making this function one of the most
|
||||
tricky.
|
||||
|
||||
A slight change is made to the command for the purpose of this function;
|
||||
cmd.edge_1 (which is r10) is set to indicate whether the [left] side of the
|
||||
box is even (r10=0) or odd (r10=1). This allows us to enter the loop at the
|
||||
correct position.
|
||||
|
||||
r0: [temporary]
|
||||
r7: [temporary]
|
||||
r8: Column counter
|
||||
r9: Palette
|
||||
r10: box->left & 1
|
||||
r11: [temporary] */
|
||||
|
||||
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
add #-4, r5 /* Better positioning for @(OFF[12], r5) */
|
||||
|
||||
/* The following arithmetic is to decrease r4 if the width is even
|
||||
(r2 & 1 = 0) and left is odd (r10 = 1), since that means both the first
|
||||
and last pixel load a full byte but use only half */
|
||||
|
||||
mov r2, r0
|
||||
xor #1, r0
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 (don't care) */
|
||||
and r10, r0
|
||||
|
||||
mov.l r11, @-r15
|
||||
sub r0, r4
|
||||
|
||||
.if \HFLIP
|
||||
mov r2, r0
|
||||
shll r0
|
||||
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
1: mov r2, r8
|
||||
tst r10, r10 /* Check whether to do an extra half iter. */
|
||||
|
||||
bt 2f
|
||||
nop
|
||||
|
||||
/* Additional half-iteration if box->left is odd (r10 = 1) */
|
||||
|
||||
mov.b @r3+, r0
|
||||
shll r0
|
||||
and #0x1e, r0
|
||||
mov.w @(r0, r9), r0
|
||||
dt r8
|
||||
mov.w r0, @(\OFF1, r5)
|
||||
bt.s 3f
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
/* The main loop needs to load pixels in output order. This is not
|
||||
ideal for CPU usage, but we have some margins */
|
||||
|
||||
2: mov.b @r3+, \TMP1
|
||||
mov #-4, \TMP2
|
||||
|
||||
/* Stall */
|
||||
|
||||
shll \TMP1
|
||||
mov \TMP1, r0
|
||||
|
||||
shld \TMP2, r0
|
||||
nop
|
||||
|
||||
and #0x1e, r0
|
||||
mov #0x1e, \TMP2
|
||||
|
||||
/* Stall */
|
||||
|
||||
mov.w @(r0,r9), r0
|
||||
and \TMP2, \TMP1
|
||||
|
||||
dt r8
|
||||
mov.w r0, @(\OFF1,r5)
|
||||
|
||||
bt.s 3f
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
mov \TMP1, r0
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
dt r8
|
||||
mov.w @(r0,r9), r0
|
||||
|
||||
bf.s 2b
|
||||
mov.w r0, @(\OFF2,r5)
|
||||
|
||||
3: END
|
||||
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r10
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point: bit 0 of r0 selects the HFLIP variant of the opaque loop */
_gint_image_p4_normal:
|
||||
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
|
||||
bf 9f /* Bit 0 set: take the HFLIP=1 expansion at 9: */
|
||||
|
||||
GEN_NORMAL_LOOP 0, 2, r7, r11, 4, 2 /* HFLIP=0, OUT_DIR=+2, OFF1=4, OFF2=2 */
|
||||
9: GEN_NORMAL_LOOP 1, -2, r7, r11, 2, 4 /* HFLIP=1, OUT_DIR=-2, OFF1=2, OFF2=4 */
|
175
src/render-cg/image/image_p4_swapcolor.S
Normal file
175
src/render-cg/image/image_p4_swapcolor.S
Normal file
|
@ -0,0 +1,175 @@
|
|||
.global _gint_image_p4_swapcolor
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P4 SWAPCOLOR, RAM version: by branchless xor selection.
|
||||
|
||||
I'm not sure whether this is the most optimized version for RAM. But it's
|
||||
about 7-8% slower than bopti, and the effort of writing yet another
|
||||
variation of P4's arduous loops doesn't seem worth it for a rare dynamic
|
||||
effect. This is Azur's version.
|
||||
|
||||
See P8 SWAPCOLOR for an explanation of branchless xor selection.
|
||||
|
||||
r0: [temporary]
|
||||
r7: Right edge pointer
|
||||
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
|
||||
r9: Palette
|
||||
r10: Left edge pointer
|
||||
r11: Holds (x ^ y) & -(c == x) during selection
|
||||
r12: cmd.color_1
|
||||
r13: [temporary]
|
||||
r14: [temporary] (in outer loop: edge stride)
|
||||
|
||||
Spilled to stack:
|
||||
@(-12,r15): Right edge value
|
||||
@(-8,r15): Left edge value
|
||||
@(-4,r15): Edge stride */
|
||||
|
||||
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
shlr r2
|
||||
nop
|
||||
|
||||
add r10, r10
|
||||
nop
|
||||
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
mov r2, r0
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
shll2 r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
shll r7
|
||||
|
||||
mov.l r13, @-r15
|
||||
add r5, r7
|
||||
|
||||
mov.w @r8+, r13 /* cmd.color_1 */
|
||||
add r5, r10
|
||||
|
||||
mov.l r11, @-r15
|
||||
add #-4, r5
|
||||
|
||||
mov r13, r12
|
||||
shll r13
|
||||
|
||||
mov.l r14, @-r15
|
||||
add r9, r13
|
||||
|
||||
mov.w @r8, r8 /* cmd.color_2 */
|
||||
add #-1, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.w @r13, r13
|
||||
mov r0, r14
|
||||
|
||||
add r6, r14
|
||||
nop
|
||||
|
||||
xor r13, r8
|
||||
nop
|
||||
|
||||
.if \HFLIP
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
shll r12 /* Compare color_1 * 2 with shifted values */
|
||||
nop
|
||||
|
||||
START
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
nop
|
||||
|
||||
mov.w @r7, r0 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.l r0, @-r15
|
||||
shll \TMP1
|
||||
|
||||
mov.w @r10, r0 /* Save left edge */
|
||||
nop
|
||||
|
||||
mov.l r0, @-r15
|
||||
nop
|
||||
|
||||
mov.l r14, @-r15
|
||||
nop
|
||||
|
||||
2: mov \TMP1, r0
|
||||
and #0x1e, r0
|
||||
|
||||
cmp/eq r0, r12
|
||||
mov #-4, \TMP2
|
||||
|
||||
subc r11, r11
|
||||
nop
|
||||
|
||||
mov.w @(r0,r9), r0
|
||||
and r8, r11
|
||||
|
||||
shld \TMP2, \TMP1
|
||||
mov #0x1e, \TMP2
|
||||
|
||||
xor r11, r0
|
||||
mov.w r0, @(\OFF1,r5)
|
||||
|
||||
and \TMP2, \TMP1
|
||||
nop
|
||||
|
||||
cmp/eq \TMP1, r12
|
||||
nop
|
||||
|
||||
subc r11, r11
|
||||
mov \TMP1, r0
|
||||
|
||||
add #\OUT_DIR, r5
|
||||
mov.b @r3+, \TMP1
|
||||
|
||||
and r8, r11
|
||||
mov.w @(r0,r9), r0
|
||||
|
||||
shll \TMP1
|
||||
nop
|
||||
|
||||
xor r11, r0
|
||||
3: mov.w r0, @(\OFF2,r5)
|
||||
|
||||
|
||||
mov.l @r15+, r14
|
||||
nop
|
||||
|
||||
mov.l @r15+, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @r10 /* Restore left edge */
|
||||
add r14, r10
|
||||
|
||||
mov.l @r15+, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @r7 /* Restore right edge */
|
||||
add r14, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r12
|
||||
mov.l @r15+, r10
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point: bit 0 of r0 selects the HFLIP variant of the swapcolor loop */
_gint_image_p4_swapcolor:
|
||||
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
|
||||
bf 9f /* Bit 0 set: take the HFLIP=1 expansion at 9: */
|
||||
|
||||
GEN_SWAPCOLOR_LOOP 0, 4, r13, r14, 6, 0 /* HFLIP=0; OFF2 applies after r5 += OUT_DIR */
|
||||
9: GEN_SWAPCOLOR_LOOP 1, -4, r13, r14, 0, 6 /* HFLIP=1; OFF2 applies after r5 += OUT_DIR */
|
46
src/render-cg/image/image_p4_swapcolor.c
Normal file
46
src/render-cg/image/image_p4_swapcolor.c
Normal file
|
@ -0,0 +1,46 @@
|
|||
#include <gint/display.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
/* dimage_p4_swapcolor(): Render a whole P4 image with one color swapped
   Forwards to dsubimage_p4_swapcolor() with a source box starting at (0,0)
   and spanning img->width by img->height. */
void dimage_p4_swapcolor(int x, int y, image_t const *img, int eff,
	int old_color, int new_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
		old_color, new_color);
}
|
||||
|
||||
/* dsubimage_p4_swapcolor(): Render a section of a P4 image with one color
   swapped
   Builds a render command for the box (left, top, w, h) drawn at (x, y). If
   gint_image_mkcmd() returns false nothing is rendered. Otherwise the
   command carries [old_index] as color_1 and [new_color] as color_2, and is
   run through gint_image_p4_loop() with gint_image_p4_swapcolor as inner
   loop. */
void dsubimage_p4_swapcolor(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int old_index, int new_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, true, true, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_p4_swapcolor;
		command.color_2 = new_color;
		command.color_1 = old_index;
		/* NOTE(review): +8 presumably tags the SWAPCOLOR variant in
		   the effect field; confirm against gint_image_p4_loop */
		command.effect += 8;

		gint_image_p4_loop(DWIDTH, &command);
	}
}
|
||||
|
||||
/* dimage_p4_addbg(): Render a whole P4 image with a background color added
   Forwards to dsubimage_p4_addbg() with a source box starting at (0,0) and
   spanning img->width by img->height. */
void dimage_p4_addbg(int x, int y, image_t const *img, int eff,
	int bg_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p4_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
|
||||
|
||||
/* dsubimage_p4_addbg(): Render a section of a P4 image with a background
   color added
   Same command setup as a color swap, with img->alpha as the color being
   replaced (color_1) and [bg_color] as the replacement (color_2); the
   inner loop is gint_image_p4_swapcolor. If gint_image_mkcmd() returns
   false nothing is rendered. */
void dsubimage_p4_addbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, true, true, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_p4_swapcolor;
		command.color_2 = bg_color;
		command.color_1 = img->alpha;
		/* NOTE(review): +8 presumably tags the SWAPCOLOR variant in
		   the effect field; confirm against gint_image_p4_loop */
		command.effect += 8;

		gint_image_p4_loop(DWIDTH, &command);
	}
}
|
103
src/render-cg/image/image_p8.S
Normal file
103
src/render-cg/image/image_p8.S
Normal file
|
@ -0,0 +1,103 @@
|
|||
.global _gint_image_p8_loop
|
||||
|
||||
/* gint's image renderer: 8-bit indexed entry point
|
||||
|
||||
P8 compacts images by indexing each pixel on a 256-color palette, thus
|
||||
halving the amount of data per pixel. This comes at the cost of an
|
||||
additional lookup during rendering. For this format, there is no way to
|
||||
bundle pixels together, and the more advanced loops handle pixels
|
||||
individually with a 2-unrolled 2-stage-pipeline structure to accelerate the
|
||||
CPU processing when that is the bottleneck (which often means where there
|
||||
are transparent pixels to skip).
|
||||
|
||||
For readers not familiar with loop optimization literature, the main idea is
|
||||
that a simple loop which loads a pixel, processes it, and writes it, is too
|
||||
inefficient because of RAW delays. To use the full speed of the CPU, one
|
||||
needs to do more work in parallel and spread out actions on a single pixel,
|
||||
which we do here with two loop transforms:
|
||||
|
||||
* _Pipelining_ the loop consists in handling a single pixel over several
|
||||
iterations by doing a little bit of work in each iteration. The data for
|
||||
the pixel would move from register to register at each iteration, with the
|
||||
loop code doing one stage's worth of computation on each register. This
|
||||
gives us more pixels to work on simultaneously, and more independent work
|
||||
means less RAW limitations. Loops in this renderer have 2 stages at most.
|
||||
|
||||
* _Unrolling_ iterations of the loop consists in loading two (or more) pixels
|
||||
at the start of each iteration so that we can work on one while waiting
|
||||
for stalls and dependencies on the other. Unlike pipelining, pixels are
|
||||
still confined within iterations. Non-trivial loops in this renderer
|
||||
process 2 pixels per iteration.
|
||||
|
||||
Unrolling has one major flaw: handling pairs of pixels only works if the
|
||||
total amount of pixels to draw is even. The usual way to handle this for n
|
||||
pixels is to do ⌊n/2⌋ iterations and handle the last pixel individually if n
|
||||
is odd. This is extremely annoying, since every row must check the value of
|
||||
n, and an extra copy of the loop code for a single pixel must be maintained
|
||||
on the side, which takes more space and more effort.
|
||||
|
||||
However, we have a specialized solution here with *edge pixels*. The idea of
|
||||
edge pixels is to round the number of pixels *up* and perform ⌊(n+1)/2⌋ runs
|
||||
of the inner loop. If n is odd, this will overwrite a single pixel at the
|
||||
end of the line. We can cancel this error after-the-fact by saving the value
|
||||
of the (n+1)-th pixel of the line before the loop, and restoring it
|
||||
afterwards. Note that if n is even then the save/restore is a no-op.
|
||||
|
||||
This takes some caution however, as the temporary overwrite could be seen by
|
||||
an interrupt. Some measures are put in place to reserve a couple of bytes on
|
||||
each side of gint's VRAM and Azur's target fragment to avoid any problems.
|
||||
|
||||
r0: - (initially: cmd.effect)
|
||||
r1: Number of lines remaining to draw
|
||||
r2: Number of columns per line
|
||||
r3: Input pointer
|
||||
r4: Input stride
|
||||
r5: Output pointer
|
||||
r6: Output stride
|
||||
r7: Right edge or [temporary]
|
||||
r8: - (initially: cmd)
|
||||
r9: - (initially: cmd.loop) */
|
||||
|
||||
/* Entry point: unpacks the command into registers, then jumps to cmd.loop */
_gint_image_p8_loop:
|
||||
/* r4: int output_width (pixels)
|
||||
r5: struct gint_image_cmd *cmd */
|
||||
|
||||
mov.b @(1,r5), r0 /* cmd.effect */
|
||||
add #2, r5
|
||||
|
||||
mov.l r8, @-r15
|
||||
mov r4, r6 /* r6 = output_width */
|
||||
|
||||
mov.w @r5+, r2 /* cmd.columns */
|
||||
mov r5, r8
|
||||
|
||||
/* From here on the command is r8 */
|
||||
|
||||
mov.l r9, @-r15
|
||||
shlr r0 /* T bit is now VFLIP */
|
||||
|
||||
mov.w @r8+, r4 /* cmd.input_stride */
|
||||
sub r2, r6 /* r6 = output_width - columns */
|
||||
|
||||
mov.b @r8+, r1 /* cmd.lines */
|
||||
add r6, r6 /* Doubled: output stride, 2 per pixel */
|
||||
|
||||
mov.b @r8+, r9 /* cmd.edge_1 - don't care */
|
||||
nop
|
||||
|
||||
mov.l @r8+, r9 /* cmd.loop, target of the final jmp */
|
||||
extu.b r1, r1 /* Undo the sign extension of mov.b */
|
||||
|
||||
mov.l @r8+, r5 /* cmd.output */
|
||||
nop
|
||||
|
||||
bf.s _NO_VFLIP /* T is still the VFLIP bit from shlr */
|
||||
mov.l @r8+, r3 /* cmd.input */
|
||||
|
||||
_VFLIP:
|
||||
neg r4, r4 /* VFLIP: negate the input stride */
|
||||
nop
|
||||
|
||||
_NO_VFLIP:
|
||||
jmp @r9
|
||||
sub r2, r4 /* Delay slot: r4 = input stride - columns */
|
42
src/render-cg/image/image_p8.c
Normal file
42
src/render-cg/image/image_p8.c
Normal file
|
@ -0,0 +1,42 @@
|
|||
#include <gint/image.h>
|
||||
#include <gint/display.h>
|
||||
|
||||
/* dimage_p8(): Render a whole P8 image
   Forwards to dsubimage_p8() with a source box starting at (0,0) and
   spanning img->width by img->height. */
void dimage_p8(int x, int y, image_t const *img, int eff)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8(x, y, img, 0, 0, full_w, full_h, eff);
}
|
||||
|
||||
/* dsubimage_p8(): Render a section of a P8 image
   Images with the IMAGE_P8_RGB565A profile are delegated to
   dsubimage_p8_clearbg() with img->alpha as background color. Other images
   are rendered by gint_image_p8_normal through gint_image_p8_loop(); if
   gint_image_mkcmd() returns false nothing is rendered. */
void dsubimage_p8(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff)
{
	if(img->profile == IMAGE_P8_RGB565A) {
		/* Call, then return: ISO C does not allow `return expr;` in a
		   function returning void, even if expr has type void */
		dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff,
			img->alpha);
		return;
	}

	struct gint_image_box box = { x, y, w, h, left, top };
	struct gint_image_cmd cmd;

	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
		DHEIGHT)) return;
	cmd.loop = gint_image_p8_normal;
	gint_image_p8_loop(DWIDTH, &cmd);
}
|
||||
|
||||
/* dimage_p8_clearbg(): Render a whole P8 image with a background color
   cleared
   Forwards to dsubimage_p8_clearbg() with a source box starting at (0,0)
   and spanning img->width by img->height. */
void dimage_p8_clearbg(int x, int y, image_t const *img, int eff, int bg)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
|
||||
|
||||
/* dsubimage_p8_clearbg(): Render a section of a P8 image with a background
   color cleared
   Builds a render command for the box (left, top, w, h) drawn at (x, y). If
   gint_image_mkcmd() returns false nothing is rendered. Otherwise the
   command carries [bg_color] as color_1 and is run through
   gint_image_p8_loop() with gint_image_p8_clearbg as inner loop. */
void dsubimage_p8_clearbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, true, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_p8_clearbg;
		command.color_1 = bg_color;
		/* NOTE(review): +4 presumably tags the CLEARBG variant in the
		   effect field; the visible loop entry only reads its low
		   bits - confirm against the consumers of cmd.effect */
		command.effect += 4;

		gint_image_p8_loop(DWIDTH, &command);
	}
}
|
147
src/render-cg/image/image_p8_clearbg.S
Normal file
147
src/render-cg/image/image_p8_clearbg.S
Normal file
|
@ -0,0 +1,147 @@
|
|||
.global _gint_image_p8_clearbg
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P8 CLEARBG, RAM version: by NULL canceling.
|
||||
|
||||
This function is one of the few that can still be bottlenecked by CPU in the
|
||||
RAM model. This is because transparent pixels can be skipped over as fast as
|
||||
the CPU allows without worrying about the writing speed of the RAM.
|
||||
|
||||
For some reason that I have yet to uncover, branches are way slower than the
|
||||
SH4AL-DSP manual suggests, and even slower while inside of DSP loops. This
|
||||
completely favors branchless methods, and the one used here is one I call
|
||||
"NULL canceling".
|
||||
|
||||
The idea is that a write can be turned into a no-op by either writing the
|
||||
value that is already in memory, or by writing somewhere else. The first
|
||||
option is pretty slow, especially because it requires a selection operation
|
||||
(rn = condition ? rn : rm) which is like the most general branchless trick.
|
||||
|
||||
NULL canceling abuses the fact that NULL is mapped read-only on the platform
|
||||
to turn the target pointer into NULL with the following identity:
|
||||
|
||||
target & -(condition) = (condition ? target : NULL)
|
||||
|
||||
The term -(condition) is materialized with an [addc #-1, #0] instruction
|
||||
after the test, then the result is applied onto the target pointer with
|
||||
[and], completing the trick in only 2 EX instructions. It does take more
|
||||
registers, and prevents from using pre-decrement on the target.
|
||||
|
||||
r0: [temporary]
|
||||
r7: Right edge pointer
|
||||
r8: Alpha value
|
||||
r9: Palette
|
||||
r10: Nullable output pointer
|
||||
r11: 0 (to neutralize addc during NULL-cancelling)
|
||||
r12: Right edge stride
|
||||
r13: [temporary]
|
||||
r14: [temporary]
|
||||
|
||||
Spilled to stack:
|
||||
@(-4,r15): Right edge value */
|
||||
|
||||
.macro GEN_CLEARBG_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
shlr r2
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
mov r2, r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
shll2 r0
|
||||
|
||||
mov.l r10, @-r15
|
||||
shll r7
|
||||
|
||||
mov.l r11, @-r15
|
||||
add r5, r7
|
||||
|
||||
mov r0, r12
|
||||
add r6, r12
|
||||
|
||||
mov.l r13, @-r15
|
||||
add #-4, r5
|
||||
|
||||
mov.l r14, @-r15
|
||||
add #-2, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
mov.w @r8, r8 /* cmd.color_1 ≤ 255, thus zero-extended */
|
||||
mov #0, r11
|
||||
|
||||
.if \HFLIP
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
START
|
||||
|
||||
mov.b @r3+, \TMP2
|
||||
nop
|
||||
|
||||
mov.w @r7, r0 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.l r0, @-r15
|
||||
cmp/eq \TMP2, r8
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
add \TMP2, \TMP2
|
||||
|
||||
2: mov #-1, r10
|
||||
addc r11, r10 /* r10 is now the mask */
|
||||
|
||||
and r5, r10
|
||||
mov \TMP2, r0
|
||||
|
||||
cmp/eq \TMP1, r8
|
||||
mov.w @(r0, r9), r0
|
||||
|
||||
mov.w r0, @(\OFF1, r10)
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
mov.b @r3+, \TMP2
|
||||
nop
|
||||
|
||||
mov #-1, r10
|
||||
addc r11, r10
|
||||
|
||||
add \TMP1, \TMP1
|
||||
mov \TMP1, r0
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
and r5, r10
|
||||
|
||||
mov.w @(r0, r9), r0
|
||||
cmp/eq \TMP2, r8
|
||||
|
||||
mov.w r0, @(\OFF2, r10)
|
||||
3: add \TMP2, \TMP2
|
||||
|
||||
mov.l @r15+, r0
|
||||
nop
|
||||
|
||||
mov.w r0, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r12
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point: bit 0 of r0 selects the HFLIP variant of the clearbg loop */
_gint_image_p8_clearbg:
|
||||
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
|
||||
bf 9f /* Bit 0 set: take the HFLIP=1 expansion at 9: */
|
||||
|
||||
GEN_CLEARBG_LOOP 0, 4, r13, r14, 4, 2 /* HFLIP=0, OUT_DIR=+4, OFF1=4, OFF2=2 */
|
||||
9: GEN_CLEARBG_LOOP 1, -4, r13, r14, 2, 4 /* HFLIP=1, OUT_DIR=-4, OFF1=2, OFF2=4 */
|
115
src/render-cg/image/image_p8_dye.S
Normal file
115
src/render-cg/image/image_p8_dye.S
Normal file
|
@ -0,0 +1,115 @@
|
|||
.global _gint_image_p8_dye
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P8 DYE, RAM version: by NULL canceling.
|
||||
|
||||
This effect basically removes all the complexity out of P8 because we no
|
||||
longer need to index the palette. We only keep the tight loop so that the
|
||||
CPU can speed in areas with many transparent pixels. This gives some
|
||||
acceleration over bopti.
|
||||
|
||||
See P8 CLEARBG for an explanation of NULL canceling.
|
||||
|
||||
r0: Dye value
|
||||
r7: Right edge pointer
|
||||
r8: Alpha value
|
||||
r9: Right edge value
|
||||
r10: Nullable output pointer
|
||||
r11: 0 (to neutralize addc during NULL-cancelling)
|
||||
r12: Right edge stride
|
||||
r13: [temporary]
|
||||
r14: [temporary] */
|
||||
|
||||
.macro GEN_DYE_LOOP HFLIP, OUT_DIR, TMP1, TMP2, OFF1, OFF2
|
||||
mov.l @r8+, r9 /* cmd.palette (don't care) */
|
||||
shlr r2
|
||||
|
||||
mov.w @r8+, r7 /* cmd.edge_2 */
|
||||
mov r2, r0
|
||||
|
||||
mov.l r12, @-r15
|
||||
shll2 r0
|
||||
|
||||
mov.l r10, @-r15
|
||||
shll r7
|
||||
|
||||
mov.l r11, @-r15
|
||||
add r5, r7
|
||||
|
||||
mov r0, r12
|
||||
add r6, r12
|
||||
|
||||
mov.l r13, @-r15
|
||||
add #-4, r5
|
||||
|
||||
mov.l r14, @-r15
|
||||
add #-2, r4 /* Input stride compensation for pipelining */
|
||||
|
||||
.if \HFLIP
|
||||
add r0, r5
|
||||
nop
|
||||
|
||||
shll r0
|
||||
nop
|
||||
|
||||
add r0, r6
|
||||
nop
|
||||
.endif
|
||||
|
||||
mov.w @(2,r8), r0 /* cmd.color_2 (dye value) */
|
||||
nop
|
||||
|
||||
mov.w @r8, r8 /* cmd.color_1 ≤ 255, thus zero-extended */
|
||||
mov #0, r11
|
||||
|
||||
START
|
||||
|
||||
mov.b @r3+, \TMP2
|
||||
nop
|
||||
|
||||
mov.w @r7, r9 /* Save right edge */
|
||||
nop
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
cmp/eq \TMP2, r8
|
||||
|
||||
2: mov #-1, r10
|
||||
addc r11, r10 /* r10 is now the mask */
|
||||
|
||||
and r5, r10
|
||||
nop
|
||||
|
||||
mov.b @r3+, \TMP2
|
||||
cmp/eq \TMP1, r8
|
||||
|
||||
mov.w r0, @(\OFF1, r10)
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
mov #-1, r10
|
||||
addc r11, r10
|
||||
|
||||
mov.b @r3+, \TMP1
|
||||
and r5, r10
|
||||
|
||||
cmp/eq \TMP2, r8
|
||||
3: mov.w r0, @(\OFF2, r10)
|
||||
|
||||
mov.w r9, @r7 /* Restore right edge */
|
||||
add r12, r7
|
||||
|
||||
END
|
||||
|
||||
mov.l @r15+, r14
|
||||
mov.l @r15+, r13
|
||||
mov.l @r15+, r11
|
||||
mov.l @r15+, r10
|
||||
mov.l @r15+, r12
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point: bit 0 of r0 selects the HFLIP variant of the dye loop */
_gint_image_p8_dye:
|
||||
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
|
||||
bf 9f /* Bit 0 set: take the HFLIP=1 expansion at 9: */
|
||||
|
||||
GEN_DYE_LOOP 0, 4, r13, r14, 4, 2 /* HFLIP=0, OUT_DIR=+4, OFF1=4, OFF2=2 */
|
||||
9: GEN_DYE_LOOP 1, -4, r13, r14, 2, 4 /* HFLIP=1, OUT_DIR=-4, OFF1=2, OFF2=4 */
|
23
src/render-cg/image/image_p8_dye.c
Normal file
23
src/render-cg/image/image_p8_dye.c
Normal file
|
@ -0,0 +1,23 @@
|
|||
#include <gint/display.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
/* dimage_p8_dye(): Render a whole P8 image with the dye effect
   Forwards to dsubimage_p8_dye() with a source box starting at (0,0) and
   spanning img->width by img->height. */
void dimage_p8_dye(int x, int y, image_t const *img, int eff, int dye_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
|
||||
|
||||
/* dsubimage_p8_dye(): Render a section of a P8 image with the dye effect
   Builds a render command for the box (left, top, w, h) drawn at (x, y). If
   gint_image_mkcmd() returns false nothing is rendered. Otherwise the
   command carries img->alpha as color_1 and [dye_color] as color_2, and is
   run through gint_image_p8_loop() with gint_image_p8_dye as inner loop. */
void dsubimage_p8_dye(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int dye_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, true, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_p8_dye;
		command.color_2 = dye_color;
		command.color_1 = img->alpha;
		/* NOTE(review): +12 presumably tags the DYE variant in the
		   effect field; the visible loop entry only reads its low
		   bits - confirm against the consumers of cmd.effect */
		command.effect += 12;

		gint_image_p8_loop(DWIDTH, &command);
	}
}
|
32
src/render-cg/image/image_p8_effect.c
Normal file
32
src/render-cg/image/image_p8_effect.c
Normal file
|
@ -0,0 +1,32 @@
|
|||
#include <gint/image.h>
|
||||
|
||||
/* dsubimage_p8_effect(): Render a section of a P8 image with a dynamic effect
   Tests the effect bits of [eff] in a fixed order (CLEARBG, SWAPCOLOR,
   ADDBG, DYE) and forwards to the first matching renderer, reading the int
   arguments that effect needs from the variadic list. When no dynamic
   effect bit is set, the plain dsubimage_p8() is used. */
void dsubimage_p8_effect(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, ...)
{
	va_list extra;
	va_start(extra, eff);

	if(eff & IMAGE_CLEARBG) {
		dsubimage_p8_clearbg(x, y, img, left, top, w, h, eff,
			va_arg(extra, int));
	}
	else if(eff & IMAGE_SWAPCOLOR) {
		/* Two arguments, read in order: source color, then new color */
		int const swap_from = va_arg(extra, int);
		int const swap_to = va_arg(extra, int);
		dsubimage_p8_swapcolor(x, y, img, left, top, w, h, eff,
			swap_from, swap_to);
	}
	else if(eff & IMAGE_ADDBG) {
		dsubimage_p8_addbg(x, y, img, left, top, w, h, eff,
			va_arg(extra, int));
	}
	else if(eff & IMAGE_DYE) {
		dsubimage_p8_dye(x, y, img, left, top, w, h, eff,
			va_arg(extra, int));
	}
	else {
		dsubimage_p8(x, y, img, left, top, w, h, eff);
	}

	va_end(extra);
}
|
42
src/render-cg/image/image_p8_normal.S
Normal file
42
src/render-cg/image/image_p8_normal.S
Normal file
|
@ -0,0 +1,42 @@
|
|||
.global _gint_image_p8_normal
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P8 Opaque rendering, RAM version: trivial.
|
||||
|
||||
As usual with RAM it is fairly easy to bottleneck writing speed, and so
|
||||
there is no need for complex methods. Building longwords could be an option,
|
||||
but it would require output alignment with edges, which is painful. */
|
||||
|
||||
/* Generates the opaque P8 loop: one palette lookup and one store per pixel */
.macro GEN_NORMAL_LOOP HFLIP, OUT_DIR
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
|
||||
.if \HFLIP
|
||||
add #-2, r5 /* With the add below: r5 += 2*columns - 2 */
|
||||
mov r2, r0
|
||||
shll r0 /* r0 = 2 * columns */
|
||||
add r0, r5
|
||||
shll r0 /* r0 = 4 * columns */
|
||||
add r0, r6 /* Output stride grows by 4 * columns */
|
||||
.endif
|
||||
|
||||
1: mov r2, r8 /* r8 = column counter for this row */
|
||||
|
||||
2: mov.b @r3+, r0 /* Load one index (sign-extended) */
|
||||
shll r0 /* Index * 2 = byte offset of a 16-bit entry */
|
||||
mov.w @(r0, r9), r0 /* Palette lookup */
|
||||
mov.w r0, @r5 /* Write the pixel */
|
||||
|
||||
3: dt r8 /* T = 1 when the row is finished */
|
||||
bf.s 2b
|
||||
add #\OUT_DIR, r5 /* Delay slot: step the output pointer */
|
||||
|
||||
END /* From image_macros.S; presumably advances rows, looping to 1: */
|
||||
EPILOGUE /* From image_macros.S; presumably restores registers, returns */
|
||||
.endm
|
||||
|
||||
/* Entry point: bit 0 of r0 selects the HFLIP variant of the opaque loop */
_gint_image_p8_normal:
|
||||
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
|
||||
bf 9f /* Bit 0 set: take the HFLIP=1 expansion at 9: */
|
||||
|
||||
GEN_NORMAL_LOOP 0, 2 /* HFLIP=0, OUT_DIR=+2 */
|
||||
9: GEN_NORMAL_LOOP 1, -2 /* HFLIP=1, OUT_DIR=-2 */
|
77
src/render-cg/image/image_p8_swapcolor.S
Normal file
77
src/render-cg/image/image_p8_swapcolor.S
Normal file
|
@ -0,0 +1,77 @@
|
|||
.global _gint_image_p8_swapcolor
|
||||
#include "image_macros.S"
|
||||
|
||||
/* P8 SWAPCOLOR, RAM version: by branchless xor selection.
|
||||
|
||||
The core action of this loop is to render full pixels while replacing any
|
||||
occurrence of cmd.color_1 (x) with the value cmd.color_2 (y). Branching is
|
||||
too slow as often, so instead we use the fact that both x and y are fixed to
|
||||
use the identity
|
||||
|
||||
c ^ ((x ^ y) & -(c == x)) = (c == x ? y : c)
|
||||
|
||||
We materialize -(c == x) by subtracting a register from itself with subc
|
||||
after the comparison (which is delightfully elegant), while (x ^ y) is pre-
|
||||
computed. This way, the selection is performed in one [subc], one [and] and
|
||||
one [xor] for a total of 3 EX slots. This is slower than NULL-cancelling
|
||||
(which only takes 2 EX slots) but still better than symmetric alternatives.
|
||||
|
||||
Since we have a palette, we further trick by comparing against the index but
|
||||
selecting against the palette entry, ie. we do
|
||||
|
||||
palette[c] ^ ((palette[x] ^ y) & -(c == x)) = (c == x ? y : palette[c])
|
||||
|
||||
which allows the computation to occur in parallel with the palette access
|
||||
and does not require the replacement value to be located at a valid index.
|
||||
|
||||
r0: [temporary]
|
||||
r7: cmd.color_1
|
||||
r8: palette[cmd.color_1] ^ cmd.color_2 (ie. x ^ y)
|
||||
r9: Palette
|
||||
r10: Holds (x ^ y) & -(c == x) during selection */
|
||||
|
||||
/* Generates the P8 swapcolor loop: palette lookup plus branchless selection */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
|
||||
mov.l @r8+, r9 /* cmd.palette */
|
||||
mov.w @r8+, r0 /* cmd.edge_2 (don't care) */
|
||||
mov.w @r8+, r7 /* cmd.color_1 */
|
||||
mov.l r10, @-r15
|
||||
exts.b r7, r7 /* Sign-extend like the mov.b loads it is compared with */
|
||||
mov r7, r0
|
||||
mov.w @r8, r8 /* cmd.color_2 */
|
||||
add r0, r0 /* color_1 * 2 = byte offset in the palette */
|
||||
mov.w @(r0, r9), r0 /* palette[color_1] */
|
||||
xor r0, r8 /* r8 = palette[color_1] ^ color_2 */
|
||||
|
||||
.if \HFLIP
|
||||
add #-2, r5 /* With the add below: r5 += 2*columns - 2 */
|
||||
mov r2, r0
|
||||
shll r0 /* r0 = 2 * columns */
|
||||
add r0, r5
|
||||
shll r0 /* r0 = 4 * columns */
|
||||
add r0, r6 /* Output stride grows by 4 * columns */
|
||||
.endif
|
||||
|
||||
START /* From image_macros.S; presumably sets up the loop over 2: to 3: */
|
||||
|
||||
2: mov.b @r3+, r0 /* Load one index (sign-extended) */
|
||||
cmp/eq r0, r7 /* T = (index == color_1) */
|
||||
add r0, r0 /* Index * 2; add leaves T untouched */
|
||||
subc r10, r10 /* r10 = -T, all ones when the index matches */
|
||||
mov.w @(r0, r9), r0 /* palette[index] */
|
||||
and r8, r10 /* Keep the xor term only on a match */
|
||||
xor r10, r0 /* Match: r0 becomes color_2; else unchanged */
|
||||
mov.w r0, @r5 /* Write the pixel */
|
||||
3: add #\OUT_DIR, r5 /* Step the output pointer */
|
||||
|
||||
END /* From image_macros.S; presumably closes the row loop */
|
||||
|
||||
mov.l @r15+, r10 /* Restore the r10 pushed at macro entry */
|
||||
EPILOGUE /* From image_macros.S; presumably restores registers, returns */
|
||||
.endm
|
||||
|
||||
/* Entry point: bit 0 of r0 selects the HFLIP variant of the swapcolor loop */
_gint_image_p8_swapcolor:
|
||||
tst #1, r0 /* T = 1 when bit 0 of r0 is clear */
|
||||
bf 9f /* Bit 0 set: take the HFLIP=1 expansion at 9: */
|
||||
|
||||
GEN_SWAPCOLOR_LOOP 0, 2 /* HFLIP=0, OUT_DIR=+2 */
|
||||
9: GEN_SWAPCOLOR_LOOP 1, -2 /* HFLIP=1, OUT_DIR=-2 */
|
46
src/render-cg/image/image_p8_swapcolor.c
Normal file
46
src/render-cg/image/image_p8_swapcolor.c
Normal file
|
@ -0,0 +1,46 @@
|
|||
#include <gint/display.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
/* dimage_p8_swapcolor(): Render a whole P8 image with one color replaced

   Thin wrapper around dsubimage_p8_swapcolor() that selects the full image:
   the source rectangle starts at (0,0) and spans img->width by img->height.
   old_color is forwarded as that function's old_index argument. */
void dimage_p8_swapcolor(int x, int y, image_t const *img, int eff,
	int old_color, int new_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
		old_color, new_color);
}
|
||||
|
||||
/* dsubimage_p8_swapcolor(): Render part of a P8 image with one color replaced

   Asks gint_image_mkcmd() to build a rendering command for the requested
   rectangle against a DWIDTH x DHEIGHT target. If that call reports failure
   nothing is drawn. Otherwise the command is completed with old_index in
   color_1 and new_color in color_2, and run through gint_image_p8_loop() with
   the gint_image_p8_swapcolor loop. */
void dsubimage_p8_swapcolor(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int old_index, int new_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_p8_swapcolor;
		command.color_2 = new_color;
		command.color_1 = old_index;
		command.effect += 8;

		gint_image_p8_loop(DWIDTH, &command);
	}
}
|
||||
|
||||
/* dimage_p8_addbg(): Render a whole P8 image with a background color

   Forwards to dsubimage_p8_addbg() using the full image as the source
   rectangle: origin (0,0), size img->width by img->height. */
void dimage_p8_addbg(int x, int y, image_t const *img, int eff,
	int bg_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_p8_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
|
||||
|
||||
/* dsubimage_p8_addbg(): Render part of a P8 image with a background color

   Implemented on top of the SWAPCOLOR loop: the color to replace (color_1)
   is img->alpha and the replacement (color_2) is bg_color. Nothing is drawn
   when gint_image_mkcmd() reports failure. */
void dsubimage_p8_addbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_box region = { x, y, w, h, left, top };
	struct gint_image_cmd command;

	bool const drawable = gint_image_mkcmd(&region, img, eff, false, false,
		&command, DWIDTH, DHEIGHT);
	if(!drawable)
		return;

	command.effect += 8;
	command.loop = gint_image_p8_swapcolor;
	command.color_1 = img->alpha;
	command.color_2 = bg_color;

	gint_image_p8_loop(DWIDTH, &command);
}
|
69
src/render-cg/image/image_rgb16.S
Normal file
69
src/render-cg/image/image_rgb16.S
Normal file
|
@ -0,0 +1,69 @@
|
|||
.global _gint_image_rgb16_loop
|
||||
|
||||
/* gint's image renderer: 16-bit RGB entry point
|
||||
|
||||
These formats are the simplest of the bunch. RGB565 can use longword access
|
||||
in cases when alignment is favorable and no geometric effect is applied. In
|
||||
other cases, pixels are handled individually; geometric effects affect the
|
||||
input/output logic while color effects change the computations themselves.
|
||||
|
||||
r0: - (initially: cmd.effect)
|
||||
r1: Number of lines remaining to draw
|
||||
r2: Number of columns per line
|
||||
r3: Input pointer
|
||||
r4: Input stride
|
||||
r5: Output pointer
|
||||
r6: Output stride
|
||||
r7: Right edge (only used in Azur) or [temporary]
|
||||
r8: - (initially: cmd)
|
||||
r9: - (initially: cmd.loop) */
|
||||
|
||||
/* Common entry for the RGB16 loops: unpacks the command structure into
   registers, derives the per-line input/output strides, then jumps to
   cmd.loop. r8 and r9 are pushed on the stack here and are not popped in this
   routine (presumably the EPILOGUE macro used by the loops restores them --
   confirm in image_macros.S). */
_gint_image_rgb16_loop:
|
||||
/* r4: int output_width (pixels)
|
||||
r5: struct gint_image_cmd *cmd */
|
||||
|
||||
mov.b @(1,r5), r0 /* cmd.effect */
|
||||
add #2, r5
|
||||
|
||||
mov.l r8, @-r15
|
||||
mov r4, r6
|
||||
|
||||
mov.w @r5+, r2 /* cmd.columns */
|
||||
mov r5, r8
|
||||
|
||||
/* From here on the command is r8 */
|
||||
|
||||
mov.l r9, @-r15
|
||||
shlr r0 /* T bit is now VFLIP */
|
||||
|
||||
mov.w @r8+, r4 /* cmd.input_stride */
|
||||
sub r2, r6
|
||||
|
||||
mov.b @r8+, r1 /* cmd.lines */
|
||||
/* r6 = 2 * (output_width - columns) */
add r6, r6
|
||||
|
||||
mov.b @r8+, r9 /* cmd.edge_1 (don't care) */
|
||||
nop
|
||||
|
||||
/* Loads the jump target used by the final jmp @r9 (cmd.loop according to
   the register table at the top of this file) */
mov.l @r8+, r9
|
||||
/* cmd.lines was loaded with a sign-extending mov.b; make it unsigned */
extu.b r1, r1
|
||||
|
||||
mov.l @r8+, r5 /* cmd.output */
|
||||
nop
|
||||
|
||||
mov.l @r8+, r3 /* cmd.input */
|
||||
nop
|
||||
|
||||
/* T still holds the bit shifted out by shlr above; the add below sits in
   the delay slot and runs on both paths */
bf.s _NO_VFLIP
|
||||
add #4, r8 /* cmd.palette (don't care) */
|
||||
|
||||
_VFLIP:
|
||||
neg r4, r4
|
||||
nop
|
||||
|
||||
_NO_VFLIP:
|
||||
sub r2, r4
|
||||
nop
|
||||
|
||||
/* Delay slot doubles r4: r4 = 2 * (+/-input_stride - columns) */
jmp @r9
|
||||
add r4, r4
|
43
src/render-cg/image/image_rgb16.c
Normal file
43
src/render-cg/image/image_rgb16.c
Normal file
|
@ -0,0 +1,43 @@
|
|||
#include <gint/image.h>
|
||||
#include <gint/display.h>
|
||||
|
||||
/* dimage_rgb16(): Render a whole RGB16 image

   Forwards to dsubimage_rgb16() with the full image as the source rectangle:
   origin (0,0), size img->width by img->height. */
void dimage_rgb16(int x, int y, image_t const *img, int eff)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16(x, y, img, 0, 0, full_w, full_h, eff);
}
|
||||
|
||||
/* dsubimage_rgb16(): Render part of an RGB16 image

   Images whose profile is IMAGE_RGB565A are delegated to
   dsubimage_rgb16_clearbg() with img->alpha as the background color. Every
   other image is rendered with the gint_image_rgb16_normal loop. Nothing is
   drawn when gint_image_mkcmd() reports failure.

   @x @y        Position passed to gint_image_mkcmd() through the box
   @img         Source image
   @left @top   Top-left corner of the source rectangle
   @w @h        Size of the source rectangle
   @eff         Effect flags, forwarded to gint_image_mkcmd() */
void dsubimage_rgb16(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff)
{
	/* Call first, then return: ISO C does not allow `return expr;` in a
	   function returning void, even when expr itself has type void */
	if(img->profile == IMAGE_RGB565A) {
		dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
			img->alpha);
		return;
	}

	struct gint_image_box box = { x, y, w, h, left, top };
	struct gint_image_cmd cmd;

	if(!gint_image_mkcmd(&box, img, eff, false, false, &cmd, DWIDTH,
		DHEIGHT)) return;

	cmd.loop = gint_image_rgb16_normal;
	gint_image_rgb16_loop(DWIDTH, &cmd);
}
|
||||
|
||||
/* dimage_rgb16_clearbg(): Render a whole RGB16 image, skipping one color

   Forwards to dsubimage_rgb16_clearbg() with the full image as the source
   rectangle: origin (0,0), size img->width by img->height. */
void dimage_rgb16_clearbg(int x, int y, image_t const *img, int eff,int bg)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_clearbg(x, y, img, 0, 0, full_w, full_h, eff, bg);
}
|
||||
|
||||
/* dsubimage_rgb16_clearbg(): Render part of an RGB16 image, skipping a color

   Builds a command with gint_image_mkcmd(), stores bg_color in color_1, and
   runs it through gint_image_rgb16_loop() with the gint_image_rgb16_clearbg
   loop. Nothing is drawn when the command cannot be built. */
void dsubimage_rgb16_clearbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_rgb16_clearbg;
		command.color_1 = bg_color;
		command.effect += 4;

		gint_image_rgb16_loop(DWIDTH, &command);
	}
}
|
53
src/render-cg/image/image_rgb16_clearbg_dye.S
Normal file
53
src/render-cg/image/image_rgb16_clearbg_dye.S
Normal file
|
@ -0,0 +1,53 @@
|
|||
.global _gint_image_rgb16_clearbg
|
||||
.global _gint_image_rgb16_dye
|
||||
#include "image_macros.S"
|
||||
|
||||
/* RGB16 CLEARBG and DYE, RAM version: trivial.
|
||||
|
||||
This function handles both CLEARBG and DYE; in RGB16 they are the same,
|
||||
except that DYE writes not the pixel value (TMP) but a fixed color (SRC). As
|
||||
is often the case, the RAM speed is limiting, so there is no point in
|
||||
improving speed of the code on the CPU side. */
|
||||
|
||||
/* GEN_CLEARBG_DYE_LOOP: expands to one RGB16 CLEARBG/DYE rendering loop.
   HFLIP:   when non-zero, the output pointer/stride adjustment block is
            assembled.
   OUT_DIR: twice the byte step applied to r5 per pixel (the loop adds
            OUT_DIR/2).
   TMP:     register receiving each source pixel.
   SRC:     register whose value is stored for pixels that differ from
            cmd.color_1.
   END and EPILOGUE are macros that are not defined in this section. */
.macro GEN_CLEARBG_DYE_LOOP HFLIP, OUT_DIR, TMP, SRC
|
||||
mov.w @r8+, r0 /* cmd.edge_2 (don't care) */
|
||||
mov.w @r8+, r9 /* cmd.color_1 (alpha color) */
|
||||
mov.w @r8+, r0 /* cmd.color_2 (dye color) */
|
||||
|
||||
.if \HFLIP
|
||||
/* Net effect of this block: r5 += 2*r2 - 2 and r6 += 4*r2; r8 is only a
   scratch register from here on */
add #-2, r5
|
||||
mov r2, r8
|
||||
shll r8
|
||||
add r8, r5
|
||||
shll r8
|
||||
add r8, r6
|
||||
.endif
|
||||
|
||||
/* r8 counts down the pixels remaining on the current line */
1: mov r2, r8
|
||||
|
||||
2: mov.w @r3+, \TMP
|
||||
cmp/eq \TMP, r9
|
||||
/* Pixels equal to cmd.color_1 are skipped: no store happens for them */
bt 3f
|
||||
mov.w \SRC, @r5
|
||||
|
||||
3: dt r8
|
||||
/* The add is in the delay slot, so r5 advances on every iteration */
bf.s 2b
|
||||
add #(\OUT_DIR/2), r5
|
||||
|
||||
END
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point of the RGB16 CLEARBG loop. TMP and SRC are both r0, so the
   value stored is the pixel that was just read. Bit 0 of r0 picks the
   expansion: clear -> HFLIP=0 loop, set -> HFLIP=1 loop at 9:. */
_gint_image_rgb16_clearbg:
|
||||
/* tst sets T when (r0 & 1) == 0, so bf is taken when the bit is set */
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
GEN_CLEARBG_DYE_LOOP 0, 4, r0, r0
|
||||
9: GEN_CLEARBG_DYE_LOOP 1, -4, r0, r0
|
||||
|
||||
/* Entry point of the RGB16 DYE loop. Pixels are read into r7 (TMP) so r0
   (SRC) keeps cmd.color_2, which is the value stored for every pixel that
   differs from cmd.color_1. Bit 0 of r0 picks the expansion: clear ->
   HFLIP=0 loop, set -> HFLIP=1 loop at 9:. */
_gint_image_rgb16_dye:
|
||||
/* tst sets T when (r0 & 1) == 0, so bf is taken when the bit is set */
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
GEN_CLEARBG_DYE_LOOP 0, 4, r7, r0
|
||||
9: GEN_CLEARBG_DYE_LOOP 1, -4, r7, r0
|
23
src/render-cg/image/image_rgb16_dye.c
Normal file
23
src/render-cg/image/image_rgb16_dye.c
Normal file
|
@ -0,0 +1,23 @@
|
|||
#include <gint/display.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
/* dimage_rgb16_dye(): Render a whole RGB16 image in a single color

   Forwards to dsubimage_rgb16_dye() with the full image as the source
   rectangle: origin (0,0), size img->width by img->height. */
void dimage_rgb16_dye(int x, int y, image_t const *img, int eff, int dye_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_dye(x, y, img, 0, 0, full_w, full_h, eff, dye_color);
}
|
||||
|
||||
/* dsubimage_rgb16_dye(): Render part of an RGB16 image in a single color

   Builds a command with gint_image_mkcmd(), sets color_1 to img->alpha and
   color_2 to dye_color, and runs it through gint_image_rgb16_loop() with the
   gint_image_rgb16_dye loop. Nothing is drawn when the command cannot be
   built. */
void dsubimage_rgb16_dye(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int dye_color)
{
	struct gint_image_box region = { x, y, w, h, left, top };
	struct gint_image_cmd command;

	bool const drawable = gint_image_mkcmd(&region, img, eff, false, false,
		&command, DWIDTH, DHEIGHT);
	if(!drawable)
		return;

	command.effect += 12;
	command.loop = gint_image_rgb16_dye;
	command.color_2 = dye_color;
	command.color_1 = img->alpha;

	gint_image_rgb16_loop(DWIDTH, &command);
}
|
32
src/render-cg/image/image_rgb16_effect.c
Normal file
32
src/render-cg/image/image_rgb16_effect.c
Normal file
|
@ -0,0 +1,32 @@
|
|||
#include <gint/image.h>
|
||||
|
||||
/* dsubimage_rgb16_effect(): Render part of an RGB16 image with a dynamic effect

   Dispatches on the color-effect flag found in eff and reads that effect's
   parameters from the variadic arguments, all as int:
     IMAGE_CLEARBG    one argument  (background color)
     IMAGE_SWAPCOLOR  two arguments (old color, then new color)
     IMAGE_ADDBG      one argument  (background color)
     IMAGE_DYE        one argument  (dye color)
   If several flags are set, the first match in the order above wins. With
   none of them set, no argument is read and dsubimage_rgb16() is called. */
void dsubimage_rgb16_effect(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, ...)
{
	va_list ap;
	va_start(ap, eff);

	if(!(eff & (IMAGE_CLEARBG | IMAGE_SWAPCOLOR | IMAGE_ADDBG | IMAGE_DYE))) {
		dsubimage_rgb16(x, y, img, left, top, w, h, eff);
	}
	else if(eff & IMAGE_CLEARBG) {
		int const bg_color = va_arg(ap, int);
		dsubimage_rgb16_clearbg(x, y, img, left, top, w, h, eff,
			bg_color);
	}
	else if(eff & IMAGE_SWAPCOLOR) {
		int const old_color = va_arg(ap, int);
		int const new_color = va_arg(ap, int);
		dsubimage_rgb16_swapcolor(x, y, img, left, top, w, h, eff,
			old_color, new_color);
	}
	else if(eff & IMAGE_ADDBG) {
		int const bg_color = va_arg(ap, int);
		dsubimage_rgb16_addbg(x, y, img, left, top, w, h, eff,
			bg_color);
	}
	else {
		/* Only IMAGE_DYE can be the flag that is set at this point */
		int const dye_color = va_arg(ap, int);
		dsubimage_rgb16_dye(x, y, img, left, top, w, h, eff,
			dye_color);
	}

	va_end(ap);
}
|
201
src/render-cg/image/image_rgb16_normal.S
Normal file
201
src/render-cg/image/image_rgb16_normal.S
Normal file
|
@ -0,0 +1,201 @@
|
|||
.global _gint_image_rgb16_normal
|
||||
#include "image_macros.S"
|
||||
|
||||
/* RGB16 Opaque rendering, RAM version: by longword access.
|
||||
|
||||
This function of the image renderer is designed for the RAM model only. At
|
||||
default overclock levels, the RAM can register a write every 13-14 cycles,
|
||||
regardless of size. Since this amount of time is more than enough to build a
|
||||
target longword regardless of alignment and geometry considerations, the
|
||||
main and only focus of this function is to only write longwords.
|
||||
|
||||
Since longwords can only be written at 4-aligned addresses and always make
|
||||
pairs of pixels, there are variations on the loop depending on the rendered
|
||||
width and destination. These are marked with the following convention:
|
||||
|
||||
* w1 / w2 denotes the parity of the command width;
|
||||
* o2 / o4 denotes the alignment of the output.
|
||||
|
||||
There is a forward and a backward variation for all four combinations of
|
||||
these parameters, noted F_ and B_ in label names. Some word-based variations
|
||||
are provided for width ≤ 8, which is just a way to ensure that the longword-
|
||||
based loops always have at least one iteration, since they're implemented as
|
||||
do/while.
|
||||
|
||||
The loops themselves are nowhere near tight on the CPU side and entirely
|
||||
bottlenecked by the RAM, hence the simplicity and complete disregard for
|
||||
superscalar parallelism. */
|
||||
|
||||
/* Entry point of the opaque RGB16 loop. Bit 0 of r0 selects the backward
   variants; the width in r2 selects word copy (r2 <= 8) or longword copy. */
_gint_image_rgb16_normal:
|
||||
/* We use word copy for width ≤ 8; this is to ensure that there is at
|
||||
least one longword in the non-trivial loop, simplifying checks */
|
||||
tst #1, r0
|
||||
mov #8, r0
|
||||
|
||||
/* The cmp/ge in the delay slot runs on both paths: T = (8 >= r2) */
bf.s .BACKWARD
|
||||
cmp/ge r2, r0
|
||||
|
||||
.FORWARD:
|
||||
bt _FORWARD_WORD_COPY
|
||||
nop
|
||||
|
||||
bra _FORWARD_LONG_COPY
|
||||
nop
|
||||
|
||||
.BACKWARD:
|
||||
/* r5 += 2*r2 and (in the delay slot below) r6 += 4*r2. None of these
   instructions modify T, which still holds the width test. */
mov r2, r0
|
||||
add r0, r0
|
||||
add r0, r5
|
||||
add r0, r0
|
||||
|
||||
bt.s _BACKWARD_WORD_COPY
|
||||
add r0, r6
|
||||
|
||||
bra _BACKWARD_LONG_COPY
|
||||
nop
|
||||
|
||||
/* Forward copy, one 16-bit pixel per iteration: each pixel is read from
   @r3+ into x0 with movs.w and stored at @r5+. START/END are macros not
   defined in this section. */
_FORWARD_WORD_COPY:
|
||||
START
|
||||
2: movs.w @r3+, x0
|
||||
3: movs.w x0, @r5+
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Backward copy, one 16-bit pixel per iteration: the input is still read
   forward from @r3+, while the output is written with pre-decrement (@-r5),
   which reverses the pixel order on the line. */
_BACKWARD_WORD_COPY:
|
||||
START
|
||||
2: movs.w @r3+, x0
|
||||
3: movs.w x0, @-r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Forward longword copy. r2 is halved by shlr (T receives the width parity)
   and bit 1 of the output pointer r5 is tested to choose one of four
   variants. In every variant the pair loop reads two pixels and builds
   r7 = (first << 16) | (second & 0xffff) with shll16/xtrct before one
   longword store. START/END/EPILOGUE are macros not defined in this
   section. */
_FORWARD_LONG_COPY:
|
||||
shlr r2 /* Test width parity */
|
||||
mov #2, r0
|
||||
|
||||
bt .F_w1
|
||||
nop
|
||||
|
||||
.F_w2: tst r0, r5 /* Test alignment of output */
|
||||
bf .F_w2o2
|
||||
|
||||
/* Even width, (r5 & 2) == 0: longword stores only */
.F_w2o4:
|
||||
START
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r7
|
||||
xtrct r0, r7
|
||||
mov.l r7, @r5
|
||||
3: add #4, r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Even width, (r5 & 2) != 0: one pair is replaced by a single word before
   and a single word after the longword loop, hence r2 - 1 */
.F_w2o2:
|
||||
add #-1, r2
|
||||
START
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @r5
|
||||
add #2, r5
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r7
|
||||
xtrct r0, r7
|
||||
mov.l r7, @r5
|
||||
3: add #4, r5
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @r5
|
||||
add #2, r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
.F_w1: tst r0, r5 /* Test alignment of output */
|
||||
bf .F_w1o2
|
||||
|
||||
/* Odd width, (r5 & 2) == 0: longword loop followed by one trailing word */
.F_w1o4:
|
||||
START
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r7
|
||||
xtrct r0, r7
|
||||
mov.l r7, @r5
|
||||
3: add #4, r5
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @r5
|
||||
add #2, r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Odd width, (r5 & 2) != 0: one leading word, then the longword loop */
.F_w1o2:
|
||||
START
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @r5
|
||||
add #2, r5
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r7
|
||||
xtrct r0, r7
|
||||
mov.l r7, @r5
|
||||
3: add #4, r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Backward longword copy. Same four-way split as the forward version, but
   every store uses pre-decrement (@-r5) and the pair is swapped: the loop
   builds r0 = (second << 16) | (first & 0xffff) with shll16/xtrct before one
   longword store. The alignment test applies to r5 as it stands on entry to
   this label. START/END/EPILOGUE are macros not defined in this section. */
_BACKWARD_LONG_COPY:
|
||||
shlr r2 /* Test width parity */
|
||||
mov #2, r0
|
||||
|
||||
bt .B_w1
|
||||
nop
|
||||
|
||||
.B_w2: tst r0, r5 /* Test alignment of output */
|
||||
bf .B_w2o2
|
||||
|
||||
/* Even width, (r5 & 2) == 0: longword stores only */
.B_w2o4:
|
||||
START
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r0
|
||||
xtrct r7, r0
|
||||
3: mov.l r0, @-r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Even width, (r5 & 2) != 0: one pair is replaced by a single word before
   and a single word after the longword loop, hence r2 - 1 */
.B_w2o2:
|
||||
add #-1, r2
|
||||
START
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @-r5
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r0
|
||||
xtrct r7, r0
|
||||
3: mov.l r0, @-r5
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @-r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
.B_w1: tst r0, r5 /* Test alignment of output */
|
||||
bf .B_w1o2
|
||||
|
||||
/* Odd width, (r5 & 2) == 0: longword loop followed by one trailing word */
.B_w1o4:
|
||||
START
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r0
|
||||
xtrct r7, r0
|
||||
3: mov.l r0, @-r5
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @-r5
|
||||
END
|
||||
EPILOGUE
|
||||
|
||||
/* Odd width, (r5 & 2) != 0: one leading word, then the longword loop */
.B_w1o2:
|
||||
START
|
||||
mov.w @r3+, r0
|
||||
mov.w r0, @-r5
|
||||
2: mov.w @r3+, r0
|
||||
mov.w @r3+, r7
|
||||
shll16 r0
|
||||
xtrct r7, r0
|
||||
3: mov.l r0, @-r5
|
||||
END
|
||||
EPILOGUE
|
45
src/render-cg/image/image_rgb16_swapcolor.S
Normal file
45
src/render-cg/image/image_rgb16_swapcolor.S
Normal file
|
@ -0,0 +1,45 @@
|
|||
.global _gint_image_rgb16_swapcolor
|
||||
#include "image_macros.S"
|
||||
|
||||
/* RGB16 SWAPCOLOR, RAM version: trivial.
|
||||
|
||||
This function is once again bottlenecked by RAM. Generating longwords would
|
||||
be tight and require significant adjustments, so we stick to words, and the
|
||||
trivial bopti-style version already maxes out the output rate. */
|
||||
|
||||
/* GEN_SWAPCOLOR_LOOP: expands to one RGB16 SWAPCOLOR rendering loop.
   HFLIP:   when non-zero, the output pointer/stride adjustment block is
            assembled.
   OUT_DIR: byte step added to the output pointer r5 after each pixel.
   END and EPILOGUE are macros that are not defined in this section. */
.macro GEN_SWAPCOLOR_LOOP HFLIP, OUT_DIR
|
||||
mov.w @r8+, r0 /* cmd.edge_2 (don't care) */
|
||||
mov.w @r8+, r9 /* cmd.color_1 */
|
||||
mov.w @r8+, r7 /* cmd.color_2 */
|
||||
|
||||
.if \HFLIP
|
||||
/* Net effect of this block: r5 += 2*r2 - 2 and r6 += 4*r2 */
add #-2, r5
|
||||
mov r2, r0
|
||||
shll r0
|
||||
add r0, r5
|
||||
shll r0
|
||||
add r0, r6
|
||||
.endif
|
||||
|
||||
/* r8 counts down the pixels remaining on the current line */
1: mov r2, r8
|
||||
|
||||
2: mov.w @r3+, r0
|
||||
cmp/eq r0, r9
|
||||
/* Keep the pixel unless it equals color_1, in which case use color_2 */
bf 4f
|
||||
mov r7, r0
|
||||
4: mov.w r0, @r5
|
||||
|
||||
3: dt r8
|
||||
/* The add is in the delay slot, so r5 advances on every iteration */
bf.s 2b
|
||||
add #\OUT_DIR, r5
|
||||
|
||||
END
|
||||
EPILOGUE
|
||||
.endm
|
||||
|
||||
/* Entry point of the RGB16 SWAPCOLOR loop. Bit 0 of r0 picks the expansion:
   clear -> falls into the HFLIP=0 loop (OUT_DIR = 2);
   set   -> branches to 9:, the HFLIP=1 loop (OUT_DIR = -2). */
_gint_image_rgb16_swapcolor:
|
||||
/* tst sets T when (r0 & 1) == 0, so bf is taken when the bit is set */
tst #1, r0
|
||||
bf 9f
|
||||
|
||||
GEN_SWAPCOLOR_LOOP 0, 2
|
||||
9: GEN_SWAPCOLOR_LOOP 1, -2
|
46
src/render-cg/image/image_rgb16_swapcolor.c
Normal file
46
src/render-cg/image/image_rgb16_swapcolor.c
Normal file
|
@ -0,0 +1,46 @@
|
|||
#include <gint/display.h>
|
||||
#include <gint/image.h>
|
||||
|
||||
/* dimage_rgb16_swapcolor(): Render a whole RGB16 image with one color replaced

   Forwards to dsubimage_rgb16_swapcolor() with the full image as the source
   rectangle: origin (0,0), size img->width by img->height. */
void dimage_rgb16_swapcolor(int x, int y, image_t const *img, int eff,
	int old_color, int new_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_swapcolor(x, y, img, 0, 0, full_w, full_h, eff,
		old_color, new_color);
}
|
||||
|
||||
/* dsubimage_rgb16_swapcolor(): Render part of an RGB16 image, replacing a color

   Builds a command with gint_image_mkcmd(), stores old_color in color_1 and
   new_color in color_2, and runs it through gint_image_rgb16_loop() with the
   gint_image_rgb16_swapcolor loop. Nothing is drawn when the command cannot
   be built. */
void dsubimage_rgb16_swapcolor(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int old_color, int new_color)
{
	struct gint_image_cmd command;
	struct gint_image_box region = { x, y, w, h, left, top };

	if(gint_image_mkcmd(&region, img, eff, false, false, &command, DWIDTH,
		DHEIGHT))
	{
		command.loop = gint_image_rgb16_swapcolor;
		command.color_2 = new_color;
		command.color_1 = old_color;
		command.effect += 8;

		gint_image_rgb16_loop(DWIDTH, &command);
	}
}
|
||||
|
||||
/* dimage_rgb16_addbg(): Render a whole RGB16 image with a background color

   Forwards to dsubimage_rgb16_addbg() with the full image as the source
   rectangle: origin (0,0), size img->width by img->height. */
void dimage_rgb16_addbg(int x, int y, image_t const *img, int eff,
	int bg_color)
{
	int const full_w = img->width;
	int const full_h = img->height;

	dsubimage_rgb16_addbg(x, y, img, 0, 0, full_w, full_h, eff, bg_color);
}
|
||||
|
||||
/* dsubimage_rgb16_addbg(): Render part of an RGB16 image with a background

   Implemented on top of the SWAPCOLOR loop: the color to replace (color_1)
   is img->alpha and the replacement (color_2) is bg_color. Nothing is drawn
   when gint_image_mkcmd() reports failure. */
void dsubimage_rgb16_addbg(int x, int y, image_t const *img,
	int left, int top, int w, int h, int eff, int bg_color)
{
	struct gint_image_box region = { x, y, w, h, left, top };
	struct gint_image_cmd command;

	bool const drawable = gint_image_mkcmd(&region, img, eff, false, false,
		&command, DWIDTH, DHEIGHT);
	if(!drawable)
		return;

	command.effect += 8;
	command.loop = gint_image_rgb16_swapcolor;
	command.color_1 = img->alpha;
	command.color_2 = bg_color;

	gint_image_rgb16_loop(DWIDTH, &command);
}
|
Loading…
Add table
Reference in a new issue