@@ -925,6 +925,326 @@ static INLINE void highbd_convolve8_2d_horiz_neon(
925925 }
926926}
927927
928+ static INLINE void highbd_convolve_2d_4tap_neon (
929+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
930+ ptrdiff_t dst_stride , int w , int h , const int16x4_t x_filter ,
931+ const int16x4_t y_filter , int bd ) {
932+ if (w == 4 ) {
933+ const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
934+ const int16_t * s = (const int16_t * )src ;
935+ uint16_t * d = dst ;
936+
937+ int16x4_t h_s0 [4 ], h_s1 [4 ], h_s2 [4 ];
938+ load_s16_4x4 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ]);
939+ load_s16_4x4 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ]);
940+ load_s16_4x4 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ]);
941+
942+ int16x4_t v_s0 = vreinterpret_s16_u16 (
943+ highbd_convolve4_4 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], x_filter , max ));
944+ int16x4_t v_s1 = vreinterpret_s16_u16 (
945+ highbd_convolve4_4 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], x_filter , max ));
946+ int16x4_t v_s2 = vreinterpret_s16_u16 (
947+ highbd_convolve4_4 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], x_filter , max ));
948+
949+ s += 3 * src_stride ;
950+
951+ do {
952+ int16x4_t h_s3 [4 ], h_s4 [4 ], h_s5 [4 ], h_s6 [4 ];
953+ load_s16_4x4 (s + 0 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ],
954+ & h_s3 [3 ]);
955+ load_s16_4x4 (s + 1 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ],
956+ & h_s4 [3 ]);
957+ load_s16_4x4 (s + 2 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ],
958+ & h_s5 [3 ]);
959+ load_s16_4x4 (s + 3 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ],
960+ & h_s6 [3 ]);
961+
962+ int16x4_t v_s3 = vreinterpret_s16_u16 (highbd_convolve4_4 (
963+ h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], x_filter , max ));
964+ int16x4_t v_s4 = vreinterpret_s16_u16 (highbd_convolve4_4 (
965+ h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], x_filter , max ));
966+ int16x4_t v_s5 = vreinterpret_s16_u16 (highbd_convolve4_4 (
967+ h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], x_filter , max ));
968+ int16x4_t v_s6 = vreinterpret_s16_u16 (highbd_convolve4_4 (
969+ h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], x_filter , max ));
970+
971+ uint16x4_t d0 = highbd_convolve4_4 (v_s0 , v_s1 , v_s2 , v_s3 , y_filter , max );
972+ uint16x4_t d1 = highbd_convolve4_4 (v_s1 , v_s2 , v_s3 , v_s4 , y_filter , max );
973+ uint16x4_t d2 = highbd_convolve4_4 (v_s2 , v_s3 , v_s4 , v_s5 , y_filter , max );
974+ uint16x4_t d3 = highbd_convolve4_4 (v_s3 , v_s4 , v_s5 , v_s6 , y_filter , max );
975+
976+ store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
977+
978+ v_s0 = v_s4 ;
979+ v_s1 = v_s5 ;
980+ v_s2 = v_s6 ;
981+ s += 4 * src_stride ;
982+ d += 4 * dst_stride ;
983+ h -= 4 ;
984+ } while (h != 0 );
985+
986+ return ;
987+ }
988+
989+ const uint16x8_t max = vdupq_n_u16 ((1 << bd ) - 1 );
990+
991+ do {
992+ const int16_t * s = (const int16_t * )src ;
993+ uint16_t * d = dst ;
994+ int height = h ;
995+
996+ int16x8_t h_s0 [4 ], h_s1 [4 ], h_s2 [4 ];
997+ load_s16_8x4 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ]);
998+ load_s16_8x4 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ]);
999+ load_s16_8x4 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ]);
1000+
1001+ int16x8_t v_s0 = vreinterpretq_s16_u16 (
1002+ highbd_convolve4_8 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], x_filter , max ));
1003+ int16x8_t v_s1 = vreinterpretq_s16_u16 (
1004+ highbd_convolve4_8 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], x_filter , max ));
1005+ int16x8_t v_s2 = vreinterpretq_s16_u16 (
1006+ highbd_convolve4_8 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], x_filter , max ));
1007+
1008+ s += 3 * src_stride ;
1009+
1010+ do {
1011+ int16x8_t h_s3 [4 ], h_s4 [4 ], h_s5 [4 ], h_s6 [4 ];
1012+ load_s16_8x4 (s + 0 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ],
1013+ & h_s3 [3 ]);
1014+ load_s16_8x4 (s + 1 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ],
1015+ & h_s4 [3 ]);
1016+ load_s16_8x4 (s + 2 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ],
1017+ & h_s5 [3 ]);
1018+ load_s16_8x4 (s + 3 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ],
1019+ & h_s6 [3 ]);
1020+
1021+ int16x8_t v_s3 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1022+ h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], x_filter , max ));
1023+ int16x8_t v_s4 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1024+ h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], x_filter , max ));
1025+ int16x8_t v_s5 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1026+ h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], x_filter , max ));
1027+ int16x8_t v_s6 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1028+ h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], x_filter , max ));
1029+
1030+ uint16x8_t d0 = highbd_convolve4_8 (v_s0 , v_s1 , v_s2 , v_s3 , y_filter , max );
1031+ uint16x8_t d1 = highbd_convolve4_8 (v_s1 , v_s2 , v_s3 , v_s4 , y_filter , max );
1032+ uint16x8_t d2 = highbd_convolve4_8 (v_s2 , v_s3 , v_s4 , v_s5 , y_filter , max );
1033+ uint16x8_t d3 = highbd_convolve4_8 (v_s3 , v_s4 , v_s5 , v_s6 , y_filter , max );
1034+
1035+ store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
1036+
1037+ v_s0 = v_s4 ;
1038+ v_s1 = v_s5 ;
1039+ v_s2 = v_s6 ;
1040+ s += 4 * src_stride ;
1041+ d += 4 * dst_stride ;
1042+ height -= 4 ;
1043+ } while (height != 0 );
1044+ src += 8 ;
1045+ dst += 8 ;
1046+ w -= 8 ;
1047+ } while (w != 0 );
1048+ }
1049+
1050+ static INLINE void highbd_convolve_2d_8tap_neon (
1051+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
1052+ ptrdiff_t dst_stride , int w , int h , const int16x8_t x_filter ,
1053+ const int16x8_t y_filter , int bd ) {
1054+ if (w == 4 ) {
1055+ const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
1056+ const int16_t * s = (const int16_t * )src ;
1057+ uint16_t * d = dst ;
1058+
1059+ int16x4_t h_s0 [8 ], h_s1 [8 ], h_s2 [8 ], h_s3 [8 ], h_s4 [8 ], h_s5 [8 ], h_s6 [8 ];
1060+ load_s16_4x8 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ],
1061+ & h_s0 [4 ], & h_s0 [5 ], & h_s0 [6 ], & h_s0 [7 ]);
1062+ load_s16_4x8 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ],
1063+ & h_s1 [4 ], & h_s1 [5 ], & h_s1 [6 ], & h_s1 [7 ]);
1064+ load_s16_4x8 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ],
1065+ & h_s2 [4 ], & h_s2 [5 ], & h_s2 [6 ], & h_s2 [7 ]);
1066+ load_s16_4x8 (s + 3 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ], & h_s3 [3 ],
1067+ & h_s3 [4 ], & h_s3 [5 ], & h_s3 [6 ], & h_s3 [7 ]);
1068+ load_s16_4x8 (s + 4 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ], & h_s4 [3 ],
1069+ & h_s4 [4 ], & h_s4 [5 ], & h_s4 [6 ], & h_s4 [7 ]);
1070+ load_s16_4x8 (s + 5 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ], & h_s5 [3 ],
1071+ & h_s5 [4 ], & h_s5 [5 ], & h_s5 [6 ], & h_s5 [7 ]);
1072+ load_s16_4x8 (s + 6 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ], & h_s6 [3 ],
1073+ & h_s6 [4 ], & h_s6 [5 ], & h_s6 [6 ], & h_s6 [7 ]);
1074+
1075+ int16x4_t v_s0 = vreinterpret_s16_u16 (
1076+ highbd_convolve8_4 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], h_s0 [4 ], h_s0 [5 ],
1077+ h_s0 [6 ], h_s0 [7 ], x_filter , max ));
1078+ int16x4_t v_s1 = vreinterpret_s16_u16 (
1079+ highbd_convolve8_4 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], h_s1 [4 ], h_s1 [5 ],
1080+ h_s1 [6 ], h_s1 [7 ], x_filter , max ));
1081+ int16x4_t v_s2 = vreinterpret_s16_u16 (
1082+ highbd_convolve8_4 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], h_s2 [4 ], h_s2 [5 ],
1083+ h_s2 [6 ], h_s2 [7 ], x_filter , max ));
1084+ int16x4_t v_s3 = vreinterpret_s16_u16 (
1085+ highbd_convolve8_4 (h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], h_s3 [4 ], h_s3 [5 ],
1086+ h_s3 [6 ], h_s3 [7 ], x_filter , max ));
1087+ int16x4_t v_s4 = vreinterpret_s16_u16 (
1088+ highbd_convolve8_4 (h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], h_s4 [4 ], h_s4 [5 ],
1089+ h_s4 [6 ], h_s4 [7 ], x_filter , max ));
1090+ int16x4_t v_s5 = vreinterpret_s16_u16 (
1091+ highbd_convolve8_4 (h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], h_s5 [4 ], h_s5 [5 ],
1092+ h_s5 [6 ], h_s5 [7 ], x_filter , max ));
1093+ int16x4_t v_s6 = vreinterpret_s16_u16 (
1094+ highbd_convolve8_4 (h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], h_s6 [4 ], h_s6 [5 ],
1095+ h_s6 [6 ], h_s6 [7 ], x_filter , max ));
1096+
1097+ s += 7 * src_stride ;
1098+
1099+ do {
1100+ int16x4_t h_s7 [8 ], h_s8 [8 ], h_s9 [8 ], h_s10 [8 ];
1101+ load_s16_4x8 (s + 0 * src_stride , 1 , & h_s7 [0 ], & h_s7 [1 ], & h_s7 [2 ],
1102+ & h_s7 [3 ], & h_s7 [4 ], & h_s7 [5 ], & h_s7 [6 ], & h_s7 [7 ]);
1103+ load_s16_4x8 (s + 1 * src_stride , 1 , & h_s8 [0 ], & h_s8 [1 ], & h_s8 [2 ],
1104+ & h_s8 [3 ], & h_s8 [4 ], & h_s8 [5 ], & h_s8 [6 ], & h_s8 [7 ]);
1105+ load_s16_4x8 (s + 2 * src_stride , 1 , & h_s9 [0 ], & h_s9 [1 ], & h_s9 [2 ],
1106+ & h_s9 [3 ], & h_s9 [4 ], & h_s9 [5 ], & h_s9 [6 ], & h_s9 [7 ]);
1107+ load_s16_4x8 (s + 3 * src_stride , 1 , & h_s10 [0 ], & h_s10 [1 ], & h_s10 [2 ],
1108+ & h_s10 [3 ], & h_s10 [4 ], & h_s10 [5 ], & h_s10 [6 ], & h_s10 [7 ]);
1109+
1110+ int16x4_t v_s7 = vreinterpret_s16_u16 (
1111+ highbd_convolve8_4 (h_s7 [0 ], h_s7 [1 ], h_s7 [2 ], h_s7 [3 ], h_s7 [4 ],
1112+ h_s7 [5 ], h_s7 [6 ], h_s7 [7 ], x_filter , max ));
1113+ int16x4_t v_s8 = vreinterpret_s16_u16 (
1114+ highbd_convolve8_4 (h_s8 [0 ], h_s8 [1 ], h_s8 [2 ], h_s8 [3 ], h_s8 [4 ],
1115+ h_s8 [5 ], h_s8 [6 ], h_s8 [7 ], x_filter , max ));
1116+ int16x4_t v_s9 = vreinterpret_s16_u16 (
1117+ highbd_convolve8_4 (h_s9 [0 ], h_s9 [1 ], h_s9 [2 ], h_s9 [3 ], h_s9 [4 ],
1118+ h_s9 [5 ], h_s9 [6 ], h_s9 [7 ], x_filter , max ));
1119+ int16x4_t v_s10 = vreinterpret_s16_u16 (
1120+ highbd_convolve8_4 (h_s10 [0 ], h_s10 [1 ], h_s10 [2 ], h_s10 [3 ], h_s10 [4 ],
1121+ h_s10 [5 ], h_s10 [6 ], h_s10 [7 ], x_filter , max ));
1122+
1123+ uint16x4_t d0 = highbd_convolve8_4 (v_s0 , v_s1 , v_s2 , v_s3 , v_s4 , v_s5 ,
1124+ v_s6 , v_s7 , y_filter , max );
1125+ uint16x4_t d1 = highbd_convolve8_4 (v_s1 , v_s2 , v_s3 , v_s4 , v_s5 , v_s6 ,
1126+ v_s7 , v_s8 , y_filter , max );
1127+ uint16x4_t d2 = highbd_convolve8_4 (v_s2 , v_s3 , v_s4 , v_s5 , v_s6 , v_s7 ,
1128+ v_s8 , v_s9 , y_filter , max );
1129+ uint16x4_t d3 = highbd_convolve8_4 (v_s3 , v_s4 , v_s5 , v_s6 , v_s7 , v_s8 ,
1130+ v_s9 , v_s10 , y_filter , max );
1131+
1132+ store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
1133+
1134+ v_s0 = v_s4 ;
1135+ v_s1 = v_s5 ;
1136+ v_s2 = v_s6 ;
1137+ v_s3 = v_s7 ;
1138+ v_s4 = v_s8 ;
1139+ v_s5 = v_s9 ;
1140+ v_s6 = v_s10 ;
1141+ s += 4 * src_stride ;
1142+ d += 4 * dst_stride ;
1143+ h -= 4 ;
1144+ } while (h != 0 );
1145+
1146+ return ;
1147+ }
1148+
1149+ const uint16x8_t max = vdupq_n_u16 ((1 << bd ) - 1 );
1150+
1151+ do {
1152+ const int16_t * s = (const int16_t * )src ;
1153+ uint16_t * d = dst ;
1154+ int height = h ;
1155+
1156+ int16x8_t h_s0 [8 ], h_s1 [8 ], h_s2 [8 ], h_s3 [8 ], h_s4 [8 ], h_s5 [8 ], h_s6 [8 ];
1157+ load_s16_8x8 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ],
1158+ & h_s0 [4 ], & h_s0 [5 ], & h_s0 [6 ], & h_s0 [7 ]);
1159+ load_s16_8x8 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ],
1160+ & h_s1 [4 ], & h_s1 [5 ], & h_s1 [6 ], & h_s1 [7 ]);
1161+ load_s16_8x8 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ],
1162+ & h_s2 [4 ], & h_s2 [5 ], & h_s2 [6 ], & h_s2 [7 ]);
1163+ load_s16_8x8 (s + 3 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ], & h_s3 [3 ],
1164+ & h_s3 [4 ], & h_s3 [5 ], & h_s3 [6 ], & h_s3 [7 ]);
1165+ load_s16_8x8 (s + 4 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ], & h_s4 [3 ],
1166+ & h_s4 [4 ], & h_s4 [5 ], & h_s4 [6 ], & h_s4 [7 ]);
1167+ load_s16_8x8 (s + 5 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ], & h_s5 [3 ],
1168+ & h_s5 [4 ], & h_s5 [5 ], & h_s5 [6 ], & h_s5 [7 ]);
1169+ load_s16_8x8 (s + 6 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ], & h_s6 [3 ],
1170+ & h_s6 [4 ], & h_s6 [5 ], & h_s6 [6 ], & h_s6 [7 ]);
1171+
1172+ int16x8_t v_s0 = vreinterpretq_s16_u16 (
1173+ highbd_convolve8_8 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], h_s0 [4 ], h_s0 [5 ],
1174+ h_s0 [6 ], h_s0 [7 ], x_filter , max ));
1175+ int16x8_t v_s1 = vreinterpretq_s16_u16 (
1176+ highbd_convolve8_8 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], h_s1 [4 ], h_s1 [5 ],
1177+ h_s1 [6 ], h_s1 [7 ], x_filter , max ));
1178+ int16x8_t v_s2 = vreinterpretq_s16_u16 (
1179+ highbd_convolve8_8 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], h_s2 [4 ], h_s2 [5 ],
1180+ h_s2 [6 ], h_s2 [7 ], x_filter , max ));
1181+ int16x8_t v_s3 = vreinterpretq_s16_u16 (
1182+ highbd_convolve8_8 (h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], h_s3 [4 ], h_s3 [5 ],
1183+ h_s3 [6 ], h_s3 [7 ], x_filter , max ));
1184+ int16x8_t v_s4 = vreinterpretq_s16_u16 (
1185+ highbd_convolve8_8 (h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], h_s4 [4 ], h_s4 [5 ],
1186+ h_s4 [6 ], h_s4 [7 ], x_filter , max ));
1187+ int16x8_t v_s5 = vreinterpretq_s16_u16 (
1188+ highbd_convolve8_8 (h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], h_s5 [4 ], h_s5 [5 ],
1189+ h_s5 [6 ], h_s5 [7 ], x_filter , max ));
1190+ int16x8_t v_s6 = vreinterpretq_s16_u16 (
1191+ highbd_convolve8_8 (h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], h_s6 [4 ], h_s6 [5 ],
1192+ h_s6 [6 ], h_s6 [7 ], x_filter , max ));
1193+
1194+ s += 7 * src_stride ;
1195+
1196+ do {
1197+ int16x8_t h_s7 [8 ], h_s8 [8 ], h_s9 [8 ], h_s10 [8 ];
1198+ load_s16_8x8 (s + 0 * src_stride , 1 , & h_s7 [0 ], & h_s7 [1 ], & h_s7 [2 ],
1199+ & h_s7 [3 ], & h_s7 [4 ], & h_s7 [5 ], & h_s7 [6 ], & h_s7 [7 ]);
1200+ load_s16_8x8 (s + 1 * src_stride , 1 , & h_s8 [0 ], & h_s8 [1 ], & h_s8 [2 ],
1201+ & h_s8 [3 ], & h_s8 [4 ], & h_s8 [5 ], & h_s8 [6 ], & h_s8 [7 ]);
1202+ load_s16_8x8 (s + 2 * src_stride , 1 , & h_s9 [0 ], & h_s9 [1 ], & h_s9 [2 ],
1203+ & h_s9 [3 ], & h_s9 [4 ], & h_s9 [5 ], & h_s9 [6 ], & h_s9 [7 ]);
1204+ load_s16_8x8 (s + 3 * src_stride , 1 , & h_s10 [0 ], & h_s10 [1 ], & h_s10 [2 ],
1205+ & h_s10 [3 ], & h_s10 [4 ], & h_s10 [5 ], & h_s10 [6 ], & h_s10 [7 ]);
1206+
1207+ int16x8_t v_s7 = vreinterpretq_s16_u16 (
1208+ highbd_convolve8_8 (h_s7 [0 ], h_s7 [1 ], h_s7 [2 ], h_s7 [3 ], h_s7 [4 ],
1209+ h_s7 [5 ], h_s7 [6 ], h_s7 [7 ], x_filter , max ));
1210+ int16x8_t v_s8 = vreinterpretq_s16_u16 (
1211+ highbd_convolve8_8 (h_s8 [0 ], h_s8 [1 ], h_s8 [2 ], h_s8 [3 ], h_s8 [4 ],
1212+ h_s8 [5 ], h_s8 [6 ], h_s8 [7 ], x_filter , max ));
1213+ int16x8_t v_s9 = vreinterpretq_s16_u16 (
1214+ highbd_convolve8_8 (h_s9 [0 ], h_s9 [1 ], h_s9 [2 ], h_s9 [3 ], h_s9 [4 ],
1215+ h_s9 [5 ], h_s9 [6 ], h_s9 [7 ], x_filter , max ));
1216+ int16x8_t v_s10 = vreinterpretq_s16_u16 (
1217+ highbd_convolve8_8 (h_s10 [0 ], h_s10 [1 ], h_s10 [2 ], h_s10 [3 ], h_s10 [4 ],
1218+ h_s10 [5 ], h_s10 [6 ], h_s10 [7 ], x_filter , max ));
1219+
1220+ uint16x8_t d0 = highbd_convolve8_8 (v_s0 , v_s1 , v_s2 , v_s3 , v_s4 , v_s5 ,
1221+ v_s6 , v_s7 , y_filter , max );
1222+ uint16x8_t d1 = highbd_convolve8_8 (v_s1 , v_s2 , v_s3 , v_s4 , v_s5 , v_s6 ,
1223+ v_s7 , v_s8 , y_filter , max );
1224+ uint16x8_t d2 = highbd_convolve8_8 (v_s2 , v_s3 , v_s4 , v_s5 , v_s6 , v_s7 ,
1225+ v_s8 , v_s9 , y_filter , max );
1226+ uint16x8_t d3 = highbd_convolve8_8 (v_s3 , v_s4 , v_s5 , v_s6 , v_s7 , v_s8 ,
1227+ v_s9 , v_s10 , y_filter , max );
1228+
1229+ store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
1230+
1231+ v_s0 = v_s4 ;
1232+ v_s1 = v_s5 ;
1233+ v_s2 = v_s6 ;
1234+ v_s3 = v_s7 ;
1235+ v_s4 = v_s8 ;
1236+ v_s5 = v_s9 ;
1237+ v_s6 = v_s10 ;
1238+ s += 4 * src_stride ;
1239+ d += 4 * dst_stride ;
1240+ height -= 4 ;
1241+ } while (height != 0 );
1242+ src += 8 ;
1243+ dst += 8 ;
1244+ w -= 8 ;
1245+ } while (w != 0 );
1246+ }
1247+
9281248void vpx_highbd_convolve8_neon (const uint16_t * src , ptrdiff_t src_stride ,
9291249 uint16_t * dst , ptrdiff_t dst_stride ,
9301250 const InterpKernel * filter , int x0_q4 ,
@@ -936,24 +1256,27 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
9361256 return ;
9371257 }
9381258
939- // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
940- // maximum buffer size to 64 * (64 + 7).
941- DECLARE_ALIGNED (32 , uint16_t , im_block [64 * 71 ]);
942- const int im_stride = 64 ;
1259+ const int x_filter_taps = vpx_get_filter_taps (filter [x0_q4 ]) <= 4 ? 4 : 8 ;
1260+ const int y_filter_taps = vpx_get_filter_taps (filter [y0_q4 ]) <= 4 ? 4 : 8 ;
1261+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
1262+ // lines post both horizontally and vertically.
1263+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1 ;
1264+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1 ) * src_stride ;
9431265
944- const int vert_filter_taps = vpx_get_filter_taps (filter [y0_q4 ]) <= 4 ? 4 : 8 ;
945- // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
946- // and vert_filter_taps / 2 lines post.
947- const int im_height = h + vert_filter_taps - 1 ;
948- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1 ;
1266+ if (x_filter_taps == 4 && y_filter_taps == 4 ) {
1267+ const int16x4_t x_filter = vld1_s16 (filter [x0_q4 ] + 2 );
1268+ const int16x4_t y_filter = vld1_s16 (filter [y0_q4 ] + 2 );
9491269
950- highbd_convolve8_2d_horiz_neon (src - src_stride * border_offset , src_stride ,
951- im_block , im_stride , filter , x0_q4 , x_step_q4 ,
952- y0_q4 , y_step_q4 , w , im_height , bd );
1270+ highbd_convolve_2d_4tap_neon (src - horiz_offset - vert_offset , src_stride ,
1271+ dst , dst_stride , w , h , x_filter , y_filter , bd );
1272+ return ;
1273+ }
1274+
1275+ const int16x8_t x_filter = vld1q_s16 (filter [x0_q4 ]);
1276+ const int16x8_t y_filter = vld1q_s16 (filter [y0_q4 ]);
9531277
954- vpx_highbd_convolve8_vert_neon (im_block + im_stride * border_offset ,
955- im_stride , dst , dst_stride , filter , x0_q4 ,
956- x_step_q4 , y0_q4 , y_step_q4 , w , h , bd );
1278+ highbd_convolve_2d_8tap_neon (src - horiz_offset - vert_offset , src_stride ,
1279+ dst , dst_stride , w , h , x_filter , y_filter , bd );
9571280}
9581281
9591282void vpx_highbd_convolve8_avg_neon (const uint16_t * src , ptrdiff_t src_stride ,
0 commit comments