From e390d1dcd672bfed9fc9f72877bd32bf4bdc0411 Mon Sep 17 00:00:00 2001 From: John Bowler Date: Sat, 14 Sep 2024 17:04:22 -0700 Subject: [PATCH] ARM Ltd palette SIMD code This enables the ARM Ltd SIMD code to assist "png_expand_palette". The code has been refactored so that while it does not change the disruption to the main libpng code, in particular pngrtran.c, is minimized and generalized. It should now be possible for any manufacturer to optimize the same transform (expansion of a color-mapped PNG to RGB or RGB8) without new manufacturer favouring code in the main body of libpng. Signed-off-by: John Bowler --- arm/arm_init.c | 191 +++++++++++++++++++++++++++++++--- arm/check.h | 4 +- arm/palette_neon_intrinsics.c | 35 +++---- pngpriv.h | 22 ++-- pngrtran.c | 48 ++------- pngsimd.c | 104 ++++++------------ 6 files changed, 243 insertions(+), 161 deletions(-) diff --git a/arm/arm_init.c b/arm/arm_init.c index 44f92f7b2a..63281fc869 100644 --- a/arm/arm_init.c +++ b/arm/arm_init.c @@ -52,23 +52,182 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp) #define png_target_init_filter_functions_impl png_init_filter_functions_neon -#ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE /*TODO*/ -#include "palette_neon_intrinsics.c" - -/* TODO: - * png_target_free_data_impl +#ifdef PNG_TARGET_STORES_DATA +/* png_target_free_data_impl * Must be defined if the implementation stores data in * png_struct::target_data. Need not be defined otherwise. - * - * png_target_init_palette_support_impl - * Contains code to initialize a palette transformation. This returns - * true if something has been set up. Only called if the state contains - * png_target_palette, need not be defined, may cancel the state flag - * in the png_struct to prevent further calls. - * - * png_target_do_expand_palette - * Handles palette expansion. Need not be defined, only called if the - * state contains png_target_palette, may set this flag to zero, may - * return false to indicate that the expansion was not done. */ +static void +png_target_free_data_arm(png_structrp pp) +{ + png_voidp ptr = pp->target_data; + pp->target_data = NULL; + png_free(pp, ptr); +} +#define png_target_free_data_impl png_target_free_data_arm +#endif /* TARGET_STORES_DATA */ + +#ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE +/* png_target_do_expand_palette_impl [flag: png_target_expand_palette] + * static function + * OPTIONAL + * Handles the transform. Need not be defined, only called if the + * state contains png_target_, may set this flag to zero, may + * return false to indicate that the transform was not done (so the + * C implementation must then execute). + */ +#include "palette_neon_intrinsics.c" + +static int +png_target_do_expand_palette_neon(png_structrp png_ptr, png_row_infop row_info, + png_bytep row, png_const_colorp palette, png_const_bytep trans_alpha, + int num_trans) +{ + /* NOTE: it is important that this is done. row_info->width is not a CSE + * because the pointer is not declared with the 'restrict' parameter, this + * makes it a CSE but then it is very important that no one changes it in + * this function, hence the const. + */ + const png_uint_32 row_width = row_info->width; + + /* NOTE: this is pretty much the original code: + * + * 1) The original code only works when the original PNG has 8-bits per + * palette. This test was in pngrtran.c and is now here. + * + * 2) The original code starts at the end and works backward but then stops + * when it is within 16 bytes of the start. It then left the remainder to + * the original code in pngrtran.c That code is now here. + * + * 3) The original code takes pointers to the end of the input and the end of + * the output; this is the way png_do_expand_palette works becuase it + * has to copy down from the end (otherwise it would overwrite the input + * data before it read it). Note that the row buffer is aliased by + * these two pointers. + * + * A consequence of passing pointers is that the row pointers (input and + * output) are forced into memory (they can't be in registers). This + * could be fixed and some compilers may be able to handle this but + * no changes have been made to the original ARM code at this point. + */ + if (row_info->color_type == PNG_COLOR_TYPE_PALETTE && + row_info->bit_depth == 8 /* <8 requires a bigger "riffled" palette */) + { + png_const_bytep sp = row + (row_width - 1); /* 8 bit palette index */ + if (num_trans > 0) + { + /* This case needs a "riffled" palette. In this implementation the + * initialization is done here, on demand. + */ + if (png_ptr->target_data == NULL) + { + /* Initialize the accelerated palette expansion. + * + * The data is now allocated using png_malloc_warn so the code + * does not error out on OOM. + */ + png_ptr->target_data = png_malloc_warn(png_ptr, 256 * 4); + + /* On allocation error it is essential to clear the flag or a + * massive number of warnings will be output. + */ + if (png_ptr->target_data != NULL) + png_riffle_palette_neon(png_ptr->target_data, palette, + trans_alpha, num_trans); + else + goto clear_flag; + } + + /* This is the general convention in the core transform code; when + * expanding the number of bytes in the row copy down (necessary) and + * pass a pointer to the last byte, not the first. + * + * It does not have to be preserved here but maybe it is better this + * way despite the fact that the comments in the neon palette code + * obfuscate what is happening. + */ + png_bytep dp = row + (4/*RGBA*/*row_width - 1); + + /* Cosmin Truta: "Sometimes row_info->bit_depth has been changed to 8. + * In these cases, the palette hasn't been riffled." + * + * John Bowler: Explanation: The code in png_do_palette_expand + * *invariably* changes the bit depth to 8. So low palette bit depth + * gets expanded to 8 and png_row_info is adjusted to reflect this (see + * png_do_palette_expand), however the "riffle" initialization code + * checked the original png_ptr bit depth, so it didn't know this would + * happen... + * + * This could be changed; the original bit depth is irrelevant to the + * initialization code. + */ + png_uint_32 i = png_target_do_expand_palette_rgba8_neon( + png_ptr->target_data, row_info->width, &sp, &dp); + + if (i == 0) /* nothing was done */ + return 0; /* Return here: interlaced images start out narrow */ + + /* Now 'i' make not have reached row_width. + * NOTE: [i] is not the index into the row buffer, rather than is + * [row_width-i], this is the way it is done in the original + * png_do_expand_palette. + */ + for (; i < row_width; i++) + { + if ((int)(*sp) >= num_trans) + *dp-- = 0xff; + else + *dp-- = trans_alpha[*sp]; + *dp-- = palette[*sp].blue; + *dp-- = palette[*sp].green; + *dp-- = palette[*sp].red; + sp--; + } + + /* Finally update row_info to reflect the expanded output: */ + row_info->bit_depth = 8; + row_info->pixel_depth = 32; + row_info->rowbytes = row_width * 4; + row_info->color_type = 6; + row_info->channels = 4; + return 1; + } + else + { + /* No tRNS chunk (num_trans == 0), expand to RGB not RGBA. */ + png_bytep dp = row + (3/*RGB*/*row_width - 1); + + png_uint_32 i = png_target_do_expand_palette_rgb8_neon(palette, + row_info->width, &sp, &dp); + + if (i == 0) + return 0; /* Return here: interlaced images start out narrow */ + + /* Finish the last bytes: */ + for (; i < row_width; i++) + { + *dp-- = palette[*sp].blue; + *dp-- = palette[*sp].green; + *dp-- = palette[*sp].red; + sp--; + } + + row_info->bit_depth = 8; + row_info->pixel_depth = 24; + row_info->rowbytes = row_width * 3; + row_info->color_type = 2; + row_info->channels = 3; + return 1; + } + } + +clear_flag: + /* Here on malloc failure and on an inapplicable image. */ + png_ptr->target_state &= ~png_target_expand_palette; + return 0; +} + +#define png_target_do_expand_palette_impl png_target_do_expand_palette_neon +/* EXPAND_PALETTE */ + #endif /*TODO*/ diff --git a/arm/check.h b/arm/check.h index e9b0696cb1..8fb3521a82 100644 --- a/arm/check.h +++ b/arm/check.h @@ -12,8 +12,8 @@ # define PNG_TARGET_CODE_IMPLEMENTATION "arm/arm_init.c" # define PNG_TARGET_IMPLEMENTS_FILTERS # ifdef PNG_READ_EXPAND_SUPPORTED - /*TODO: # define PNG_TARGET_STORES_DATA */ - /*TODO: # define PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE */ +# define PNG_TARGET_STORES_DATA +# define PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE # endif /* READ_EXPAND */ # define PNG_TARGET_ROW_ALIGNMENT 16 #endif /* ARM_NEON */ diff --git a/arm/palette_neon_intrinsics.c b/arm/palette_neon_intrinsics.c index 706daef167..0c29fad0ea 100644 --- a/arm/palette_neon_intrinsics.c +++ b/arm/palette_neon_intrinsics.c @@ -11,12 +11,9 @@ /* Build an RGBA8 palette from the separate RGB and alpha palettes. */ static void -png_riffle_palette_neon(png_structrp png_ptr) +png_riffle_palette_neon(png_bytep riffled_palette, png_const_colorp palette, + png_const_bytep trans_alpha, int num_trans) { - png_const_colorp palette = png_ptr->palette; - png_bytep riffled_palette = png_ptr->riffled_palette; - png_const_bytep trans_alpha = png_ptr->trans_alpha; - int num_trans = png_ptr->num_trans; int i; /* Initially black, opaque. */ @@ -47,19 +44,15 @@ png_riffle_palette_neon(png_structrp png_ptr) } /* Expands a palettized row into RGBA8. */ -static int -png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info, - png_const_bytep row, png_bytepp ssp, png_bytepp ddp) +static png_uint_32 +png_target_do_expand_palette_rgba8_neon(const png_uint_32 *riffled_palette, + png_uint_32 row_width, png_const_bytep *ssp, png_bytep *ddp) { - png_uint_32 row_width = row_info->width; - const png_uint_32 *riffled_palette = - (const png_uint_32 *)png_ptr->riffled_palette; const png_uint_32 pixels_per_chunk = 4; png_uint_32 i; png_debug(1, "in png_do_expand_palette_rgba8_neon"); - PNG_UNUSED(row) if (row_width < pixels_per_chunk) return 0; @@ -72,7 +65,8 @@ png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info, for (i = 0; i < row_width; i += pixels_per_chunk) { uint32x4_t cur; - png_bytep sp = *ssp - i, dp = *ddp - (i << 2); + png_const_bytep sp = *ssp - i; + png_bytep dp = *ddp - (i << 2); cur = vld1q_dup_u32 (riffled_palette + *(sp - 3)); cur = vld1q_lane_u32(riffled_palette + *(sp - 2), cur, 1); cur = vld1q_lane_u32(riffled_palette + *(sp - 1), cur, 2); @@ -92,18 +86,18 @@ png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info, } /* Expands a palettized row into RGB8. */ -static int -png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info, - png_const_bytep row, png_bytepp ssp, png_bytepp ddp) +static png_uint_32 +png_target_do_expand_palette_rgb8_neon(png_const_colorp paletteIn, + png_uint_32 row_width, png_const_bytep *ssp, png_bytep *ddp) { - png_uint_32 row_width = row_info->width; - png_const_bytep palette = (png_const_bytep)png_ptr->palette; + /* TODO: This case is VERY dangerous: */ + png_const_bytep palette = (png_const_bytep)paletteIn; + const png_uint_32 pixels_per_chunk = 8; png_uint_32 i; png_debug(1, "in png_do_expand_palette_rgb8_neon"); - PNG_UNUSED(row) if (row_width <= pixels_per_chunk) return 0; @@ -113,7 +107,8 @@ png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info, for (i = 0; i < row_width; i += pixels_per_chunk) { uint8x8x3_t cur; - png_bytep sp = *ssp - i, dp = *ddp - ((i << 1) + i); + png_const_bytep sp = *ssp - i; + png_bytep dp = *ddp - ((i << 1) + i); cur = vld3_dup_u8(palette + sizeof(png_color) * (*(sp - 7))); cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 6)), cur, 1); cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 5)), cur, 2); diff --git a/pngpriv.h b/pngpriv.h index 9272455fbe..c3a543a461 100644 --- a/pngpriv.h +++ b/pngpriv.h @@ -1100,7 +1100,7 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row,(png_structrp pp, png_row_infop * available for this image. */ #define png_target_filters 1 /* MASK: hardware support for filters */ -#define png_target_palette 2 /* MASK: hardware support for palettes */ +#define png_target_expand_palette 2 /* MASK: hardware support for palettes */ PNG_INTERNAL_FUNCTION(void,png_target_init,(png_structrp),PNG_EMPTY); /* Initialize png_struct::target_state if required. */ @@ -1115,15 +1115,17 @@ PNG_INTERNAL_FUNCTION(void, png_target_init_filter_functions, * implementation. Called once before the first row needs to be defiltered. */ -PNG_INTERNAL_FUNCTION(void, png_target_init_palette_support, (png_structrp), - PNG_EMPTY); -PNG_INTERNAL_FUNCTION(int, png_target_do_expand_palette, (png_structrp, - png_row_infop, png_const_bytep, const png_bytepp, const png_bytepp), - PNG_EMPTY); - /* Two functions to set up and execute palette expansion. The 'init' - * must succeed but then the 'do_expand' might, apparently, still fail. - */ -#endif /* HARDWARE */ +/* Handlers for specific transforms (currently only 'expand_palette'). These + * are implemented in pngsimd.c to call the actual SIMD implementation if + * required. + * + * The handlers return "false" if nothing was done and the C code will then be + * called. The implementations must do everything or nothing. + */ +PNG_INTERNAL_FUNCTION(int, png_target_do_expand_palette, + (png_structrp, png_row_infop), PNG_EMPTY); + /* Expand the palette and return true or do nothing and return false. */ +#endif /* TARGET_CODE */ /* Choose the best filter to use and filter the row data */ PNG_INTERNAL_FUNCTION(void,png_write_find_filter,(png_structrp png_ptr, diff --git a/pngrtran.c b/pngrtran.c index 4af9fe04ef..545e97b644 100644 --- a/pngrtran.c +++ b/pngrtran.c @@ -4191,9 +4191,8 @@ png_do_encode_alpha(png_row_infop row_info, png_bytep row, png_structrp png_ptr) * upon whether you supply trans and num_trans. */ static void -png_do_expand_palette(png_structrp png_ptr, png_row_infop row_info, - png_bytep row, png_const_colorp palette, png_const_bytep trans_alpha, - int num_trans) +png_do_expand_palette(png_row_infop row_info, png_bytep row, png_const_colorp + palette, png_const_bytep trans_alpha, int num_trans) { int shift, value; png_bytep sp, dp; @@ -4298,21 +4297,7 @@ png_do_expand_palette(png_structrp png_ptr, png_row_infop row_info, dp = row + ((size_t)row_width << 2) - 1; i = 0; -#ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE - if ((png_ptr->target_state & png_target_palette) != 0) - { - /* The RGBA optimization works with png_ptr->bit_depth == 8 - * but sometimes row_info->bit_depth has been changed to 8. - * In these cases, the palette hasn't been riffled. - */ - i = png_do_expand_palette_rgba8_neon(png_ptr, row_info, row, - &sp, &dp); - } -#else - PNG_UNUSED(png_ptr) -#endif - - for (; i < row_width; i++) + for (i = 0; i < row_width; i++) { if ((int)(*sp) >= num_trans) *dp-- = 0xff; @@ -4334,15 +4319,7 @@ png_do_expand_palette(png_structrp png_ptr, png_row_infop row_info, { sp = row + (size_t)row_width - 1; dp = row + (size_t)(row_width * 3) - 1; - i = 0; -#ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE - i = png_do_expand_palette_rgb8_neon(png_ptr, row_info, row, - &sp, &dp); -#else - PNG_UNUSED(png_ptr) -#endif - - for (; i < row_width; i++) + for (i = 0; i < row_width; i++) { *dp-- = palette[*sp].blue; *dp-- = palette[*sp].green; @@ -4757,18 +4734,13 @@ png_do_read_transformations(png_structrp png_ptr, png_row_infop row_info) if (row_info->color_type == PNG_COLOR_TYPE_PALETTE) { #ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE - if ((png_ptr->num_trans > 0) && (png_ptr->bit_depth == 8)) - { - if (png_ptr->riffled_palette == NULL) - { - /* Initialize the accelerated palette expansion. */ - png_ptr->riffled_palette = - (png_bytep)png_malloc(png_ptr, 256 * 4); - png_riffle_palette_neon(png_ptr); - } - } + /* Do not call 'png_do_expand_palette' if the SIMD implementation + * does it (note that this accomodates SIMD implementations which might + * only handle specific cases). + */ + if (!png_target_do_expand_palette(png_ptr, row_info)) #endif - png_do_expand_palette(png_ptr, row_info, png_ptr->row_buf + 1, + png_do_expand_palette(row_info, png_ptr->row_buf + 1, png_ptr->palette, png_ptr->trans_alpha, png_ptr->num_trans); } diff --git a/pngsimd.c b/pngsimd.c index 7c1cb2f166..91c04fe65d 100644 --- a/pngsimd.c +++ b/pngsimd.c @@ -35,29 +35,22 @@ * UNDEFINED if PNG_TARGET_STORES_DATA is not defined * A function to free data stored in png_struct::target_data. * - * png_target_init_filter_functions_impl + * png_target_init_filter_functions_impl [flag: png_target_filters] * OPTIONAL * Contains code to overwrite the png_struct::read_filter array, see * the definition of png_init_filter_functions. Need not be defined, * only called if target_state contains png_target_filters. * - * png_target_init_palette_support_impl + * png_target_do_expand_palette_impl [flag: png_target_expand_palette] * static function * OPTIONAL - * Contains code to initialize a palette transformation. This returns - * true if something has been set up. Only called if the state contains - * png_target_palette, need not be defined, may cancel the state flag - * in the png_struct to prevent further calls. + * Handles the transform. Need not be defined, only called if the + * state contains png_target_, may set this flag to zero, may + * return false to indicate that the transform was not done (so the + * C implementation must then execute). * - * png_target_do_expand_palette_impl - * static function - * OPTIONAL - * Handles palette expansion. Need not be defined, only called if the - * state contains png_target_palette, may set this flag to zero, may - * return false to indicate that the expansion was not done. - * - * Either png_target_init_filter_functions_impl or - * png_target_do_expand_palette_impl must be defined. + * Note that pngtarget.h verifies that at least one thing is implemented, the + * checks below ensure that the corresponding _impl macro is defined. */ /* This will fail in an obvious way with a meaningful error message if the file @@ -70,12 +63,17 @@ #endif #if defined(PNG_TARGET_STORES_DATA) != defined(png_target_free_data_impl) -# error HARDWARE: PNG_TARGET_STORES_DATA !match png_target_free_data_impl +# error HARDWARE: png_target_free_data_impl unexpected setting #endif -#if !defined(png_target_init_filter_functions_impl) &&\ - !defined(png_target_init_palette_support) -# error HARDWARE: target specifc code turned on but none provided +#if defined(PNG_TARGET_IMPLEMENTS_FILTERS) !=\ + defined(png_target_init_filter_functions_impl) +# error HARDWARE: png_target_init_filter_functions_impl unexpected setting +#endif + +#if defined(PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE) !=\ + defined(png_target_do_expand_palette_impl) +# error HARDWARE: png_target_do_expand_palette_impl unexpected setting #endif void @@ -88,7 +86,7 @@ png_target_init(png_structrp pp) # define F 0U # endif # ifdef png_target_do_expand_palette_impl -# define P png_target_palette +# define P png_target_expand_palette # else # define P 0U # endif @@ -101,9 +99,6 @@ png_target_init(png_structrp pp) } #ifdef PNG_TARGET_STORES_DATA -#ifndef png_target_free_data_impl -# error PNG_TARGET_STORES_DATA defined without implementation -#endif void png_target_free_data(png_structrp pp) { @@ -111,7 +106,7 @@ png_target_free_data(png_structrp pp) */ if (pp->target_data != NULL) { - png_target_free_data_impl(pp); + png_target_free_data_impl(pp); if (pp->target_data != NULL) png_error(pp, png_target_impl ": allocated data not released"); } @@ -119,9 +114,6 @@ png_target_free_data(png_structrp pp) #endif #ifdef PNG_TARGET_IMPLEMENTS_FILTERS -#ifndef png_target_init_filter_functions_impl -# error PNG_TARGET_IMPLEMENTS_FILTERS defined without implementation -#endif void png_target_init_filter_functions(png_structp pp, unsigned int bpp) { @@ -132,55 +124,17 @@ png_target_init_filter_functions(png_structp pp, unsigned int bpp) #endif /* filters */ #ifdef PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE -#ifndef png_target_init_palette_support_impl -# error PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE defined without implementation -#endif -void -png_target_init_palette_support(png_structrp pp) -{ - if (((pp->options >> PNG_TARGET_SPECIFIC_CODE) & 3) == PNG_OPTION_ON && - (pp->target_state & png_target_palette) != 0 && - !png_target_init_palette_support_impl(pp, bpp)) - png_ptr->target_state &= ~png_target_palette; -} - -#ifndef png_target_do_expand_palette_impl -# error PNG_TARGET_IMPLEMENTS_EXPAND_PALETTE defined without implementation -#endif int -png_target_do_expand_palette(png_structrp pp, png_row_infop rip, - png_const_bytep row, const png_bytepp ssp, const png_bytepp ddp) +png_target_do_expand_palette(png_structrp pp, png_row_infop rip) + /*png_const_bytep row, const png_bytepp ssp, const png_bytepp ddp) */ { - if (((pp->options >> PNG_TARGET_SPECIFIC_CODE) & 3) == PNG_OPTION_ON && - (pp->target_state & png_target_palette) != 0) - return png_target_do_expand_palette_impl(pp, rip, row, ssp, ddp); + /* This is exactly like 'png_do_expand_palette' except that there is a check + * on the options and target_state: + */ + return ((pp->options >> PNG_TARGET_SPECIFIC_CODE) & 3) == PNG_OPTION_ON && + (pp->target_state & png_target_expand_palette) != 0 && + png_target_do_expand_palette_impl(pp, rip, pp->row_buf + 1, + pp->palette, pp->trans_alpha, pp->num_trans); } -#endif /* palette */ - -/* - * png_target_init_impl - * Set the mask of png_target_support values to - * png_struct::target_state. If the value is non-0 hardware support - * will be recorded as enabled. - * - * png_target_free_data_impl - * Must be defined if the implementation stores data in - * png_struct::target_data. Need not be defined otherwise. - * - * png_target_init_filter_functions_impl - * Contains code to overwrite the png_struct::read_filter array, see - * the definition of png_init_filter_functions. Need not be defined, - * only called if the state contains png_target_filters. - * - * png_target_init_palette_support_impl - * Contains code to initialize a palette transformation. This returns - * true if something has been set up. Only called if the state contains - * png_target_palette, need not be defined, may cancel the state flag - * in the png_struct to prevent further calls. - * - * png_target_do_expand_palette - * Handles palette expansion. Need not be defined, only called if the - * state contains png_target_palette, may set this flag to zero, may - * return false to indicate that the expansion was not done. - */ +#endif /* EXPAND_PALETTE */ #endif /* PNG_TARGET_ARCH */