Skip to content

Commit 58731e2

Browse files
committed
Specialise highbd Neon 2D horiz convolution for 4-tap filters
Add a highbd Neon implementation of the horizontal portion of 2D convolution specialised for executing with 4-tap filters. This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I513e35c4f8857bc89e0def5e9402bc31ddd46440
1 parent 3127962 commit 58731e2

File tree

1 file changed

+154
-36
lines changed

1 file changed

+154
-36
lines changed

vpx_dsp/arm/highbd_vpx_convolve8_neon.c

Lines changed: 154 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -672,23 +672,115 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
672672
}
673673
}
674674

675-
static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
675+
static INLINE void highbd_convolve_4tap_2d_horiz_neon(
676676
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
677-
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4,
678-
int y0_q4, int y_step_q4, int w, int h, int bd) {
679-
assert((intptr_t)dst % 4 == 0);
680-
assert(dst_stride % 4 == 0);
681-
assert(x_step_q4 == 16);
682-
assert(h % 4 == 3 && h >= 7);
677+
ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
678+
if (w == 4) {
679+
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
680+
const int16_t *s = (const int16_t *)src;
681+
uint16_t *d = dst;
683682

684-
(void)x_step_q4;
685-
(void)y0_q4;
686-
(void)y_step_q4;
683+
do {
684+
int16x4_t s0[4], s1[4], s2[4], s3[4];
685+
load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
686+
load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
687+
load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
688+
load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
687689

688-
const int16x8_t filters = vld1q_s16(filter[x0_q4]);
690+
uint16x4_t d0 =
691+
highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max);
692+
uint16x4_t d1 =
693+
highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], filter, max);
694+
uint16x4_t d2 =
695+
highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max);
696+
uint16x4_t d3 =
697+
highbd_convolve4_4(s3[0], s3[1], s3[2], s3[3], filter, max);
689698

690-
src -= 3;
699+
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
700+
701+
s += 4 * src_stride;
702+
d += 4 * dst_stride;
703+
h -= 4;
704+
} while (h != 3);
705+
706+
// Process final three rows (h % 4 == 3). See vpx_highbd_convolve8_neon()
707+
// below for further details on possible values of block height.
708+
int16x4_t s0[4], s1[4], s2[4];
709+
load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
710+
load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
711+
load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
712+
713+
uint16x4_t d0 = highbd_convolve4_4(s0[0], s0[1], s0[2], s0[3], filter, max);
714+
uint16x4_t d1 = highbd_convolve4_4(s1[0], s1[1], s1[2], s1[3], filter, max);
715+
uint16x4_t d2 = highbd_convolve4_4(s2[0], s2[1], s2[2], s2[3], filter, max);
716+
717+
store_u16_4x3(d, dst_stride, d0, d1, d2);
718+
} else {
719+
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
720+
721+
do {
722+
const int16_t *s = (const int16_t *)src;
723+
uint16_t *d = dst;
724+
int width = w;
725+
726+
do {
727+
int16x8_t s0[4], s1[4], s2[4], s3[4];
728+
load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
729+
load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
730+
load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
731+
load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]);
732+
733+
uint16x8_t d0 =
734+
highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max);
735+
uint16x8_t d1 =
736+
highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max);
737+
uint16x8_t d2 =
738+
highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max);
739+
uint16x8_t d3 =
740+
highbd_convolve4_8(s3[0], s3[1], s3[2], s3[3], filter, max);
741+
742+
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
743+
744+
s += 8;
745+
d += 8;
746+
width -= 8;
747+
} while (width != 0);
748+
src += 4 * src_stride;
749+
dst += 4 * dst_stride;
750+
h -= 4;
751+
} while (h != 3);
752+
753+
// Process final three rows (h % 4 == 3). See vpx_highbd_convolve8_neon()
754+
// below for further details on possible values of block height.
755+
const int16_t *s = (const int16_t *)src;
756+
uint16_t *d = dst;
757+
int width = w;
691758

759+
do {
760+
int16x8_t s0[4], s1[4], s2[4];
761+
load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]);
762+
load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]);
763+
load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]);
764+
765+
uint16x8_t d0 =
766+
highbd_convolve4_8(s0[0], s0[1], s0[2], s0[3], filter, max);
767+
uint16x8_t d1 =
768+
highbd_convolve4_8(s1[0], s1[1], s1[2], s1[3], filter, max);
769+
uint16x8_t d2 =
770+
highbd_convolve4_8(s2[0], s2[1], s2[2], s2[3], filter, max);
771+
772+
store_u16_8x3(d, dst_stride, d0, d1, d2);
773+
774+
s += 8;
775+
d += 8;
776+
width -= 8;
777+
} while (width != 0);
778+
}
779+
}
780+
781+
static INLINE void highbd_convolve_8tap_2d_horiz_neon(
782+
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
783+
ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
692784
if (w == 4) {
693785
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
694786
const int16_t *s = (const int16_t *)src;
@@ -706,13 +798,13 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
706798
&s3[4], &s3[5], &s3[6], &s3[7]);
707799

708800
uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4],
709-
s0[5], s0[6], s0[7], filters, max);
801+
s0[5], s0[6], s0[7], filter, max);
710802
uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4],
711-
s1[5], s1[6], s1[7], filters, max);
803+
s1[5], s1[6], s1[7], filter, max);
712804
uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4],
713-
s2[5], s2[6], s2[7], filters, max);
805+
s2[5], s2[6], s2[7], filter, max);
714806
uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4],
715-
s3[5], s3[6], s3[7], filters, max);
807+
s3[5], s3[6], s3[7], filter, max);
716808

717809
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
718810

@@ -732,11 +824,11 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
732824
&s2[5], &s2[6], &s2[7]);
733825

734826
uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5],
735-
s0[6], s0[7], filters, max);
827+
s0[6], s0[7], filter, max);
736828
uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5],
737-
s1[6], s1[7], filters, max);
829+
s1[6], s1[7], filter, max);
738830
uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5],
739-
s2[6], s2[7], filters, max);
831+
s2[6], s2[7], filter, max);
740832

741833
store_u16_4x3(d, dst_stride, d0, d1, d2);
742834
} else {
@@ -759,13 +851,13 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
759851
&s3[4], &s3[5], &s3[6], &s3[7]);
760852

761853
uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
762-
s0[5], s0[6], s0[7], filters, max);
854+
s0[5], s0[6], s0[7], filter, max);
763855
uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
764-
s1[5], s1[6], s1[7], filters, max);
856+
s1[5], s1[6], s1[7], filter, max);
765857
uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
766-
s2[5], s2[6], s2[7], filters, max);
858+
s2[5], s2[6], s2[7], filter, max);
767859
uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4],
768-
s3[5], s3[6], s3[7], filters, max);
860+
s3[5], s3[6], s3[7], filter, max);
769861

770862
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
771863

@@ -794,11 +886,11 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
794886
&s2[4], &s2[5], &s2[6], &s2[7]);
795887

796888
uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4],
797-
s0[5], s0[6], s0[7], filters, max);
889+
s0[5], s0[6], s0[7], filter, max);
798890
uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4],
799-
s1[5], s1[6], s1[7], filters, max);
891+
s1[5], s1[6], s1[7], filter, max);
800892
uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4],
801-
s2[5], s2[6], s2[7], filters, max);
893+
s2[5], s2[6], s2[7], filter, max);
802894

803895
store_u16_8x3(d, dst_stride, d0, d1, d2);
804896

@@ -809,6 +901,30 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
809901
}
810902
}
811903

904+
static INLINE void highbd_convolve8_2d_horiz_neon(
905+
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
906+
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4,
907+
int y0_q4, int y_step_q4, int w, int h, int bd) {
908+
assert((intptr_t)dst % 4 == 0);
909+
assert(dst_stride % 4 == 0);
910+
assert(x_step_q4 == 16);
911+
assert(h % 4 == 3 && h >= 7);
912+
913+
(void)x_step_q4;
914+
(void)y0_q4;
915+
(void)y_step_q4;
916+
917+
if (vpx_get_filter_taps(filter[x0_q4]) <= 4) {
918+
const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2);
919+
highbd_convolve_4tap_2d_horiz_neon(src - 1, src_stride, dst, dst_stride, w,
920+
h, x_filter_4tap, bd);
921+
} else {
922+
const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]);
923+
highbd_convolve_8tap_2d_horiz_neon(src - 3, src_stride, dst, dst_stride, w,
924+
h, x_filter_8tap, bd);
925+
}
926+
}
927+
812928
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
813929
uint16_t *dst, ptrdiff_t dst_stride,
814930
const InterpKernel *filter, int x0_q4,
@@ -825,14 +941,15 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
825941
DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]);
826942
const int im_stride = 64;
827943

828-
// Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior and
829-
// SUBPEL_TAPS / 2 lines post.
830-
const int im_height = h + SUBPEL_TAPS - 1;
831-
const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
944+
const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
945+
// Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
946+
// and vert_filter_taps / 2 lines post.
947+
const int im_height = h + vert_filter_taps - 1;
948+
const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
832949

833-
vpx_highbd_convolve8_2d_horiz_neon(
834-
src - src_stride * border_offset, src_stride, im_block, im_stride, filter,
835-
x0_q4, x_step_q4, y0_q4, y_step_q4, w, im_height, bd);
950+
highbd_convolve8_2d_horiz_neon(src - src_stride * border_offset, src_stride,
951+
im_block, im_stride, filter, x0_q4, x_step_q4,
952+
y0_q4, y_step_q4, w, im_height, bd);
836953

837954
vpx_highbd_convolve8_vert_neon(im_block + im_stride * border_offset,
838955
im_stride, dst, dst_stride, filter, x0_q4,
@@ -853,12 +970,13 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
853970
// See above for buffer size derivation.
854971
DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]);
855972
const int im_stride = 64;
973+
// Averaging convolution always uses an 8-tap filter.
856974
const int im_height = h + SUBPEL_TAPS - 1;
857975
const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1;
858976

859-
vpx_highbd_convolve8_2d_horiz_neon(
860-
src - src_stride * border_offset, src_stride, im_block, im_stride, filter,
861-
x0_q4, x_step_q4, y0_q4, y_step_q4, w, im_height, bd);
977+
highbd_convolve8_2d_horiz_neon(src - src_stride * border_offset, src_stride,
978+
im_block, im_stride, filter, x0_q4, x_step_q4,
979+
y0_q4, y_step_q4, w, im_height, bd);
862980

863981
vpx_highbd_convolve8_avg_vert_neon(im_block + im_stride * border_offset,
864982
im_stride, dst, dst_stride, filter, x0_q4,

0 commit comments

Comments
 (0)