Skip to content

Commit

Permalink
ARM Ltd palette SIMD code
Browse files Browse the repository at this point in the history
This enables the ARM Ltd SIMD code to assist "png_expand_palette".  The
code has been refactored so that, while it does not change, the disruption
to the main libpng code, in particular pngrtran.c, is minimized and
generalized.

It should now be possible for any manufacturer to optimize the same
transform (expansion of a color-mapped PNG to RGB or RGBA) without new
manufacturer-favouring code in the main body of libpng.

Signed-off-by: John Bowler <[email protected]>
  • Loading branch information
jbowler committed Sep 15, 2024
1 parent 2a00b7e commit 7a2e614
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 161 deletions.
191 changes: 175 additions & 16 deletions arm/arm_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,182 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)

#define png_target_init_filter_functions_impl png_init_filter_functions_neon

#ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE /*TODO*/
#include "palette_neon_intrinsics.c"

/* TODO:
* png_target_free_data_impl
#ifdef PNG_TARGET_STORES_DATA
/* png_target_free_data_impl
* Must be defined if the implementation stores data in
* png_struct::target_data. Need not be defined otherwise.
*
* png_target_init_palette_support_impl
* Contains code to initialize a palette transformation. This returns
* true if something has been set up. Only called if the state contains
* png_target_palette, need not be defined, may cancel the state flag
* in the png_struct to prevent further calls.
*
* png_target_do_expand_palette
* Handles palette expansion. Need not be defined, only called if the
* state contains png_target_palette, may set this flag to zero, may
* return false to indicate that the expansion was not done.
*/
static void
png_target_free_data_arm(png_structrp pp)
{
   /* Release whatever block is stored in pp->target_data.  The member is
    * cleared before png_free is called so that the png_struct no longer
    * refers to the block by the time it is released.
    */
   png_voidp released = pp->target_data;

   pp->target_data = NULL;
   png_free(pp, released);
}
#define png_target_free_data_impl png_target_free_data_arm
#endif /* TARGET_STORES_DATA */

#ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE
/* png_target_do_expand_palette_impl [flag: png_target_expand_palette]
* static function
* OPTIONAL
* Handles the transform. Need not be defined, only called if the
* state contains png_target_<transform>, may set this flag to zero, may
* return false to indicate that the transform was not done (so the
* C implementation must then execute).
*/
#include "palette_neon_intrinsics.c"

static int
png_target_do_expand_palette_neon(png_structrp png_ptr, png_row_infop row_info,
    png_bytep row, png_const_colorp palette, png_const_bytep trans_alpha,
    int num_trans)
{
   /* NEON-assisted, in-place expansion of a row of 8-bit palette indices to
    * RGB (num_trans <= 0) or RGBA (num_trans > 0).
    *
    * Returns 1 when the whole row has been expanded and row_info has been
    * updated to describe the result.  Returns 0 when nothing was done, so
    * that the C implementation must run instead.  On the 0 return the
    * png_target_expand_palette bit is cleared from png_ptr->target_state
    * when the row format is not handled here or the riffled palette could
    * not be allocated; it is left set when the NEON helper merely declined
    * the row (interlaced images start out narrow).
    */

   /* Read the width exactly once.  row_info is not a 'restrict' pointer so
    * row_info->width is not a common sub-expression for the compiler; this
    * const copy is, and being const it cannot be changed by accident below.
    */
   const png_uint_32 width = row_info->width;

   /* The row buffer is aliased by these two pointers.  Expansion makes the
    * row longer, so the data has to be copied down from the end (otherwise
    * input would be overwritten before it was read); both therefore start
    * at the *last* byte of their region.
    */
   png_const_bytep src; /* current 8-bit palette index */
   png_bytep dst;       /* current output byte */
   png_uint_32 done;    /* count of pixels expanded so far */

   /* Only rows that are colour-mapped with 8 bits per index are handled;
    * lower bit depths would require a bigger "riffled" palette.
    */
   if (row_info->color_type != PNG_COLOR_TYPE_PALETTE ||
       row_info->bit_depth != 8)
      goto clear_flag;

   src = row + (width - 1);

   if (num_trans > 0)
   {
      /* RGBA output.  This needs the "riffled" (interleaved RGBA) palette,
       * which is built on demand the first time it is required and kept in
       * png_ptr->target_data.
       */
      if (png_ptr->target_data == NULL)
      {
         /* png_malloc_warn is used so that running out of memory does not
          * produce an error; the flag is cleared on failure because
          * otherwise a massive number of warnings would be output.
          */
         png_ptr->target_data = png_malloc_warn(png_ptr, 256 * 4);

         if (png_ptr->target_data == NULL)
            goto clear_flag;

         png_riffle_palette_neon(png_ptr->target_data, palette, trans_alpha,
             num_trans);
      }

      dst = row + (4 * width - 1); /* last byte of the RGBA row */

      /* The helper is given the addresses of both pointers; the loop below
       * carries on from wherever it leaves them.
       */
      done = png_target_do_expand_palette_rgba8_neon(png_ptr->target_data,
          width, &src, &dst);

      if (done == 0) /* nothing was done */
         return 0;   /* flag stays set: interlaced images start out narrow */

      /* 'done' may not have reached the row width.  Note that it is a count
       * of finished pixels, not an index into the row buffer; the pixel
       * being handled is at [width - done], exactly as in the original
       * png_do_expand_palette.
       */
      while (done < width)
      {
         const unsigned int index = *src--;

         *dst-- = ((int)index >= num_trans) ? 0xff : trans_alpha[index];
         *dst-- = palette[index].blue;
         *dst-- = palette[index].green;
         *dst-- = palette[index].red;
         ++done;
      }

      /* The row is now 8-bit RGBA; make row_info say so. */
      row_info->bit_depth = 8;
      row_info->pixel_depth = 32;
      row_info->rowbytes = width * 4;
      row_info->color_type = 6; /* colour + alpha */
      row_info->channels = 4;
      return 1;
   }

   /* No transparency entries (num_trans <= 0): RGB output, read directly
    * from the png_color palette.
    */
   dst = row + (3 * width - 1); /* last byte of the RGB row */

   done = png_target_do_expand_palette_rgb8_neon(palette, width, &src, &dst);

   if (done == 0)
      return 0; /* flag stays set: interlaced images start out narrow */

   /* Finish whatever pixels the helper left over. */
   while (done < width)
   {
      const unsigned int index = *src--;

      *dst-- = palette[index].blue;
      *dst-- = palette[index].green;
      *dst-- = palette[index].red;
      ++done;
   }

   /* The row is now 8-bit RGB; make row_info say so. */
   row_info->bit_depth = 8;
   row_info->pixel_depth = 24;
   row_info->rowbytes = width * 3;
   row_info->color_type = 2; /* colour, no alpha */
   row_info->channels = 3;
   return 1;

clear_flag:
   /* Here on malloc failure and on an inapplicable image: turn the
    * accelerated expansion off in the target state.
    */
   png_ptr->target_state &= ~png_target_expand_palette;
   return 0;
}

#define png_target_do_expand_palette_impl png_target_do_expand_palette_neon
/* EXPAND_PALETTE */

#endif /*TODO*/
4 changes: 2 additions & 2 deletions arm/check.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
# define PNG_TARGET_CODE_IMPLEMENTATION "arm/arm_init.c"
# define PNG_TARGET_IMPLEMENTS_FILTERS
# ifdef PNG_READ_EXPAND_SUPPORTED
/*TODO: # define PNG_TARGET_STORES_DATA */
/*TODO: # define PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE */
# define PNG_TARGET_STORES_DATA
# define PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE
# endif /* READ_EXPAND */
# define PNG_TARGET_ROW_ALIGNMENT 16
#endif /* ARM_NEON */
35 changes: 15 additions & 20 deletions arm/palette_neon_intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,9 @@

/* Build an RGBA8 palette from the separate RGB and alpha palettes. */
static void
png_riffle_palette_neon(png_structrp png_ptr)
png_riffle_palette_neon(png_bytep riffled_palette, png_const_colorp palette,
png_const_bytep trans_alpha, int num_trans)
{
png_const_colorp palette = png_ptr->palette;
png_bytep riffled_palette = png_ptr->riffled_palette;
png_const_bytep trans_alpha = png_ptr->trans_alpha;
int num_trans = png_ptr->num_trans;
int i;

/* Initially black, opaque. */
Expand Down Expand Up @@ -47,19 +44,15 @@ png_riffle_palette_neon(png_structrp png_ptr)
}

/* Expands a palettized row into RGBA8. */
static int
png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
static png_uint_32
png_target_do_expand_palette_rgba8_neon(const png_uint_32 *riffled_palette,
png_uint_32 row_width, png_const_bytep *ssp, png_bytep *ddp)
{
png_uint_32 row_width = row_info->width;
const png_uint_32 *riffled_palette =
(const png_uint_32 *)png_ptr->riffled_palette;
const png_uint_32 pixels_per_chunk = 4;
png_uint_32 i;

png_debug(1, "in png_do_expand_palette_rgba8_neon");

PNG_UNUSED(row)
if (row_width < pixels_per_chunk)
return 0;

Expand All @@ -72,7 +65,8 @@ png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
for (i = 0; i < row_width; i += pixels_per_chunk)
{
uint32x4_t cur;
png_bytep sp = *ssp - i, dp = *ddp - (i << 2);
png_const_bytep sp = *ssp - i;
png_bytep dp = *ddp - (i << 2);
cur = vld1q_dup_u32 (riffled_palette + *(sp - 3));
cur = vld1q_lane_u32(riffled_palette + *(sp - 2), cur, 1);
cur = vld1q_lane_u32(riffled_palette + *(sp - 1), cur, 2);
Expand All @@ -92,18 +86,18 @@ png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
}

/* Expands a palettized row into RGB8. */
static int
png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info,
png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
static png_uint_32
png_target_do_expand_palette_rgb8_neon(png_const_colorp paletteIn,
png_uint_32 row_width, png_const_bytep *ssp, png_bytep *ddp)
{
png_uint_32 row_width = row_info->width;
png_const_bytep palette = (png_const_bytep)png_ptr->palette;
/* TODO: This case is VERY dangerous: */
png_const_bytep palette = (png_const_bytep)paletteIn;

const png_uint_32 pixels_per_chunk = 8;
png_uint_32 i;

png_debug(1, "in png_do_expand_palette_rgb8_neon");

PNG_UNUSED(row)
if (row_width <= pixels_per_chunk)
return 0;

Expand All @@ -113,7 +107,8 @@ png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info,
for (i = 0; i < row_width; i += pixels_per_chunk)
{
uint8x8x3_t cur;
png_bytep sp = *ssp - i, dp = *ddp - ((i << 1) + i);
png_const_bytep sp = *ssp - i;
png_bytep dp = *ddp - ((i << 1) + i);
cur = vld3_dup_u8(palette + sizeof(png_color) * (*(sp - 7)));
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 6)), cur, 1);
cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 5)), cur, 2);
Expand Down
22 changes: 12 additions & 10 deletions pngpriv.h
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row,(png_structrp pp, png_row_infop
* available for this image.
*/
#define png_target_filters 1 /* MASK: hardware support for filters */
#define png_target_palette 2 /* MASK: hardware support for palettes */
#define png_target_expand_palette 2 /* MASK: hardware support for palettes */

PNG_INTERNAL_FUNCTION(void,png_target_init,(png_structrp),PNG_EMPTY);
/* Initialize png_struct::target_state if required. */
Expand All @@ -1115,15 +1115,17 @@ PNG_INTERNAL_FUNCTION(void, png_target_init_filter_functions,
* implementation. Called once before the first row needs to be defiltered.
*/

PNG_INTERNAL_FUNCTION(void, png_target_init_palette_support, (png_structrp),
PNG_EMPTY);
PNG_INTERNAL_FUNCTION(int, png_target_do_expand_palette, (png_structrp,
png_row_infop, png_const_bytep, const png_bytepp, const png_bytepp),
PNG_EMPTY);
/* Two functions to set up and execute palette expansion. The 'init'
* must succeed but then the 'do_expand' might, apparently, still fail.
*/
#endif /* HARDWARE */
/* Handlers for specific transforms (currently only 'expand_palette'). These
* are implemented in pngsimd.c to call the actual SIMD implementation if
* required.
*
* The handlers return "false" if nothing was done and the C code will then be
* called. The implementations must do everything or nothing.
*/
PNG_INTERNAL_FUNCTION(int, png_target_do_expand_palette,
(png_structrp, png_row_infop), PNG_EMPTY);
/* Expand the palette and return true or do nothing and return false. */
#endif /* TARGET_CODE */

/* Choose the best filter to use and filter the row data */
PNG_INTERNAL_FUNCTION(void,png_write_find_filter,(png_structrp png_ptr,
Expand Down
Loading

0 comments on commit 7a2e614

Please sign in to comment.