Skip to content

Commit 364326c

Browse files
committed
Merge horiz. and vert. passes in HBD Neon 2D convolution
The current Neon approach to 2D convolution is: 1) Filter horizontally, storing to an intermediate buffer. 2) Filter vertically and store the final output. This patch merges the two phases for high bitdepth 2D convolution to avoid the storing and re-loading from the intermediate buffer. This provides a small gain (<5%) for large block sizes but the benefit increases for small block sizes - as the proportion of compute to memory access decreases. These effects are amplified further when considering little (in-order) core performance. Change-Id: I8ec13fb9edd642fdb927bf5394a3c2a349d22a29
1 parent 58731e2 commit 364326c

File tree

1 file changed

+338
-15
lines changed

1 file changed

+338
-15
lines changed

vpx_dsp/arm/highbd_vpx_convolve8_neon.c

Lines changed: 338 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,326 @@ static INLINE void highbd_convolve8_2d_horiz_neon(
925925
}
926926
}
927927

928+
static INLINE void highbd_convolve_2d_4tap_neon(
929+
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
930+
ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter,
931+
const int16x4_t y_filter, int bd) {
932+
if (w == 4) {
933+
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
934+
const int16_t *s = (const int16_t *)src;
935+
uint16_t *d = dst;
936+
937+
int16x4_t h_s0[4], h_s1[4], h_s2[4];
938+
load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
939+
load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
940+
load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
941+
942+
int16x4_t v_s0 = vreinterpret_s16_u16(
943+
highbd_convolve4_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
944+
int16x4_t v_s1 = vreinterpret_s16_u16(
945+
highbd_convolve4_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
946+
int16x4_t v_s2 = vreinterpret_s16_u16(
947+
highbd_convolve4_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
948+
949+
s += 3 * src_stride;
950+
951+
do {
952+
int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
953+
load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
954+
&h_s3[3]);
955+
load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
956+
&h_s4[3]);
957+
load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
958+
&h_s5[3]);
959+
load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
960+
&h_s6[3]);
961+
962+
int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4(
963+
h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
964+
int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4(
965+
h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
966+
int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4(
967+
h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
968+
int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4(
969+
h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
970+
971+
uint16x4_t d0 = highbd_convolve4_4(v_s0, v_s1, v_s2, v_s3, y_filter, max);
972+
uint16x4_t d1 = highbd_convolve4_4(v_s1, v_s2, v_s3, v_s4, y_filter, max);
973+
uint16x4_t d2 = highbd_convolve4_4(v_s2, v_s3, v_s4, v_s5, y_filter, max);
974+
uint16x4_t d3 = highbd_convolve4_4(v_s3, v_s4, v_s5, v_s6, y_filter, max);
975+
976+
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
977+
978+
v_s0 = v_s4;
979+
v_s1 = v_s5;
980+
v_s2 = v_s6;
981+
s += 4 * src_stride;
982+
d += 4 * dst_stride;
983+
h -= 4;
984+
} while (h != 0);
985+
986+
return;
987+
}
988+
989+
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
990+
991+
do {
992+
const int16_t *s = (const int16_t *)src;
993+
uint16_t *d = dst;
994+
int height = h;
995+
996+
int16x8_t h_s0[4], h_s1[4], h_s2[4];
997+
load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]);
998+
load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]);
999+
load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]);
1000+
1001+
int16x8_t v_s0 = vreinterpretq_s16_u16(
1002+
highbd_convolve4_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max));
1003+
int16x8_t v_s1 = vreinterpretq_s16_u16(
1004+
highbd_convolve4_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max));
1005+
int16x8_t v_s2 = vreinterpretq_s16_u16(
1006+
highbd_convolve4_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max));
1007+
1008+
s += 3 * src_stride;
1009+
1010+
do {
1011+
int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4];
1012+
load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2],
1013+
&h_s3[3]);
1014+
load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2],
1015+
&h_s4[3]);
1016+
load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2],
1017+
&h_s5[3]);
1018+
load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2],
1019+
&h_s6[3]);
1020+
1021+
int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8(
1022+
h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max));
1023+
int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8(
1024+
h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max));
1025+
int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8(
1026+
h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max));
1027+
int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8(
1028+
h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max));
1029+
1030+
uint16x8_t d0 = highbd_convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter, max);
1031+
uint16x8_t d1 = highbd_convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter, max);
1032+
uint16x8_t d2 = highbd_convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter, max);
1033+
uint16x8_t d3 = highbd_convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter, max);
1034+
1035+
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1036+
1037+
v_s0 = v_s4;
1038+
v_s1 = v_s5;
1039+
v_s2 = v_s6;
1040+
s += 4 * src_stride;
1041+
d += 4 * dst_stride;
1042+
height -= 4;
1043+
} while (height != 0);
1044+
src += 8;
1045+
dst += 8;
1046+
w -= 8;
1047+
} while (w != 0);
1048+
}
1049+
1050+
static INLINE void highbd_convolve_2d_8tap_neon(
1051+
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
1052+
ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter,
1053+
const int16x8_t y_filter, int bd) {
1054+
if (w == 4) {
1055+
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
1056+
const int16_t *s = (const int16_t *)src;
1057+
uint16_t *d = dst;
1058+
1059+
int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
1060+
load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
1061+
&h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
1062+
load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
1063+
&h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
1064+
load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
1065+
&h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
1066+
load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
1067+
&h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
1068+
load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
1069+
&h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
1070+
load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
1071+
&h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
1072+
load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
1073+
&h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
1074+
1075+
int16x4_t v_s0 = vreinterpret_s16_u16(
1076+
highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
1077+
h_s0[6], h_s0[7], x_filter, max));
1078+
int16x4_t v_s1 = vreinterpret_s16_u16(
1079+
highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
1080+
h_s1[6], h_s1[7], x_filter, max));
1081+
int16x4_t v_s2 = vreinterpret_s16_u16(
1082+
highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
1083+
h_s2[6], h_s2[7], x_filter, max));
1084+
int16x4_t v_s3 = vreinterpret_s16_u16(
1085+
highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
1086+
h_s3[6], h_s3[7], x_filter, max));
1087+
int16x4_t v_s4 = vreinterpret_s16_u16(
1088+
highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
1089+
h_s4[6], h_s4[7], x_filter, max));
1090+
int16x4_t v_s5 = vreinterpret_s16_u16(
1091+
highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
1092+
h_s5[6], h_s5[7], x_filter, max));
1093+
int16x4_t v_s6 = vreinterpret_s16_u16(
1094+
highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
1095+
h_s6[6], h_s6[7], x_filter, max));
1096+
1097+
s += 7 * src_stride;
1098+
1099+
do {
1100+
int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
1101+
load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
1102+
&h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
1103+
load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
1104+
&h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
1105+
load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
1106+
&h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
1107+
load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
1108+
&h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
1109+
1110+
int16x4_t v_s7 = vreinterpret_s16_u16(
1111+
highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
1112+
h_s7[5], h_s7[6], h_s7[7], x_filter, max));
1113+
int16x4_t v_s8 = vreinterpret_s16_u16(
1114+
highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
1115+
h_s8[5], h_s8[6], h_s8[7], x_filter, max));
1116+
int16x4_t v_s9 = vreinterpret_s16_u16(
1117+
highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
1118+
h_s9[5], h_s9[6], h_s9[7], x_filter, max));
1119+
int16x4_t v_s10 = vreinterpret_s16_u16(
1120+
highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
1121+
h_s10[5], h_s10[6], h_s10[7], x_filter, max));
1122+
1123+
uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
1124+
v_s6, v_s7, y_filter, max);
1125+
uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
1126+
v_s7, v_s8, y_filter, max);
1127+
uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
1128+
v_s8, v_s9, y_filter, max);
1129+
uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
1130+
v_s9, v_s10, y_filter, max);
1131+
1132+
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
1133+
1134+
v_s0 = v_s4;
1135+
v_s1 = v_s5;
1136+
v_s2 = v_s6;
1137+
v_s3 = v_s7;
1138+
v_s4 = v_s8;
1139+
v_s5 = v_s9;
1140+
v_s6 = v_s10;
1141+
s += 4 * src_stride;
1142+
d += 4 * dst_stride;
1143+
h -= 4;
1144+
} while (h != 0);
1145+
1146+
return;
1147+
}
1148+
1149+
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
1150+
1151+
do {
1152+
const int16_t *s = (const int16_t *)src;
1153+
uint16_t *d = dst;
1154+
int height = h;
1155+
1156+
int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8];
1157+
load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3],
1158+
&h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]);
1159+
load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3],
1160+
&h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]);
1161+
load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3],
1162+
&h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]);
1163+
load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3],
1164+
&h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]);
1165+
load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3],
1166+
&h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]);
1167+
load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3],
1168+
&h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]);
1169+
load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3],
1170+
&h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]);
1171+
1172+
int16x8_t v_s0 = vreinterpretq_s16_u16(
1173+
highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5],
1174+
h_s0[6], h_s0[7], x_filter, max));
1175+
int16x8_t v_s1 = vreinterpretq_s16_u16(
1176+
highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5],
1177+
h_s1[6], h_s1[7], x_filter, max));
1178+
int16x8_t v_s2 = vreinterpretq_s16_u16(
1179+
highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5],
1180+
h_s2[6], h_s2[7], x_filter, max));
1181+
int16x8_t v_s3 = vreinterpretq_s16_u16(
1182+
highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5],
1183+
h_s3[6], h_s3[7], x_filter, max));
1184+
int16x8_t v_s4 = vreinterpretq_s16_u16(
1185+
highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5],
1186+
h_s4[6], h_s4[7], x_filter, max));
1187+
int16x8_t v_s5 = vreinterpretq_s16_u16(
1188+
highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5],
1189+
h_s5[6], h_s5[7], x_filter, max));
1190+
int16x8_t v_s6 = vreinterpretq_s16_u16(
1191+
highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5],
1192+
h_s6[6], h_s6[7], x_filter, max));
1193+
1194+
s += 7 * src_stride;
1195+
1196+
do {
1197+
int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8];
1198+
load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2],
1199+
&h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]);
1200+
load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2],
1201+
&h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]);
1202+
load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2],
1203+
&h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]);
1204+
load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2],
1205+
&h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]);
1206+
1207+
int16x8_t v_s7 = vreinterpretq_s16_u16(
1208+
highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4],
1209+
h_s7[5], h_s7[6], h_s7[7], x_filter, max));
1210+
int16x8_t v_s8 = vreinterpretq_s16_u16(
1211+
highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4],
1212+
h_s8[5], h_s8[6], h_s8[7], x_filter, max));
1213+
int16x8_t v_s9 = vreinterpretq_s16_u16(
1214+
highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4],
1215+
h_s9[5], h_s9[6], h_s9[7], x_filter, max));
1216+
int16x8_t v_s10 = vreinterpretq_s16_u16(
1217+
highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4],
1218+
h_s10[5], h_s10[6], h_s10[7], x_filter, max));
1219+
1220+
uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
1221+
v_s6, v_s7, y_filter, max);
1222+
uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
1223+
v_s7, v_s8, y_filter, max);
1224+
uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
1225+
v_s8, v_s9, y_filter, max);
1226+
uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
1227+
v_s9, v_s10, y_filter, max);
1228+
1229+
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
1230+
1231+
v_s0 = v_s4;
1232+
v_s1 = v_s5;
1233+
v_s2 = v_s6;
1234+
v_s3 = v_s7;
1235+
v_s4 = v_s8;
1236+
v_s5 = v_s9;
1237+
v_s6 = v_s10;
1238+
s += 4 * src_stride;
1239+
d += 4 * dst_stride;
1240+
height -= 4;
1241+
} while (height != 0);
1242+
src += 8;
1243+
dst += 8;
1244+
w -= 8;
1245+
} while (w != 0);
1246+
}
1247+
9281248
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
9291249
uint16_t *dst, ptrdiff_t dst_stride,
9301250
const InterpKernel *filter, int x0_q4,
@@ -936,24 +1256,27 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
9361256
return;
9371257
}
9381258

939-
// Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
940-
// maximum buffer size to 64 * (64 + 7).
941-
DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]);
942-
const int im_stride = 64;
1259+
const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8;
1260+
const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
1261+
// Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
1262+
// lines post both horizontally and vertically.
1263+
const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1;
1264+
const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride;
9431265

944-
const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8;
945-
// Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
946-
// and vert_filter_taps / 2 lines post.
947-
const int im_height = h + vert_filter_taps - 1;
948-
const ptrdiff_t border_offset = vert_filter_taps / 2 - 1;
1266+
if (x_filter_taps == 4 && y_filter_taps == 4) {
1267+
const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2);
1268+
const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2);
9491269

950-
highbd_convolve8_2d_horiz_neon(src - src_stride * border_offset, src_stride,
951-
im_block, im_stride, filter, x0_q4, x_step_q4,
952-
y0_q4, y_step_q4, w, im_height, bd);
1270+
highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride,
1271+
dst, dst_stride, w, h, x_filter, y_filter, bd);
1272+
return;
1273+
}
1274+
1275+
const int16x8_t x_filter = vld1q_s16(filter[x0_q4]);
1276+
const int16x8_t y_filter = vld1q_s16(filter[y0_q4]);
9531277

954-
vpx_highbd_convolve8_vert_neon(im_block + im_stride * border_offset,
955-
im_stride, dst, dst_stride, filter, x0_q4,
956-
x_step_q4, y0_q4, y_step_q4, w, h, bd);
1278+
highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride,
1279+
dst, dst_stride, w, h, x_filter, y_filter, bd);
9571280
}
9581281

9591282
void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,

0 commit comments

Comments
 (0)