@@ -925,6 +925,326 @@ static INLINE void highbd_convolve8_2d_horiz_neon(
925
925
}
926
926
}
927
927
928
+ static INLINE void highbd_convolve_2d_4tap_neon (
929
+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
930
+ ptrdiff_t dst_stride , int w , int h , const int16x4_t x_filter ,
931
+ const int16x4_t y_filter , int bd ) {
932
+ if (w == 4 ) {
933
+ const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
934
+ const int16_t * s = (const int16_t * )src ;
935
+ uint16_t * d = dst ;
936
+
937
+ int16x4_t h_s0 [4 ], h_s1 [4 ], h_s2 [4 ];
938
+ load_s16_4x4 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ]);
939
+ load_s16_4x4 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ]);
940
+ load_s16_4x4 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ]);
941
+
942
+ int16x4_t v_s0 = vreinterpret_s16_u16 (
943
+ highbd_convolve4_4 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], x_filter , max ));
944
+ int16x4_t v_s1 = vreinterpret_s16_u16 (
945
+ highbd_convolve4_4 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], x_filter , max ));
946
+ int16x4_t v_s2 = vreinterpret_s16_u16 (
947
+ highbd_convolve4_4 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], x_filter , max ));
948
+
949
+ s += 3 * src_stride ;
950
+
951
+ do {
952
+ int16x4_t h_s3 [4 ], h_s4 [4 ], h_s5 [4 ], h_s6 [4 ];
953
+ load_s16_4x4 (s + 0 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ],
954
+ & h_s3 [3 ]);
955
+ load_s16_4x4 (s + 1 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ],
956
+ & h_s4 [3 ]);
957
+ load_s16_4x4 (s + 2 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ],
958
+ & h_s5 [3 ]);
959
+ load_s16_4x4 (s + 3 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ],
960
+ & h_s6 [3 ]);
961
+
962
+ int16x4_t v_s3 = vreinterpret_s16_u16 (highbd_convolve4_4 (
963
+ h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], x_filter , max ));
964
+ int16x4_t v_s4 = vreinterpret_s16_u16 (highbd_convolve4_4 (
965
+ h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], x_filter , max ));
966
+ int16x4_t v_s5 = vreinterpret_s16_u16 (highbd_convolve4_4 (
967
+ h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], x_filter , max ));
968
+ int16x4_t v_s6 = vreinterpret_s16_u16 (highbd_convolve4_4 (
969
+ h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], x_filter , max ));
970
+
971
+ uint16x4_t d0 = highbd_convolve4_4 (v_s0 , v_s1 , v_s2 , v_s3 , y_filter , max );
972
+ uint16x4_t d1 = highbd_convolve4_4 (v_s1 , v_s2 , v_s3 , v_s4 , y_filter , max );
973
+ uint16x4_t d2 = highbd_convolve4_4 (v_s2 , v_s3 , v_s4 , v_s5 , y_filter , max );
974
+ uint16x4_t d3 = highbd_convolve4_4 (v_s3 , v_s4 , v_s5 , v_s6 , y_filter , max );
975
+
976
+ store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
977
+
978
+ v_s0 = v_s4 ;
979
+ v_s1 = v_s5 ;
980
+ v_s2 = v_s6 ;
981
+ s += 4 * src_stride ;
982
+ d += 4 * dst_stride ;
983
+ h -= 4 ;
984
+ } while (h != 0 );
985
+
986
+ return ;
987
+ }
988
+
989
+ const uint16x8_t max = vdupq_n_u16 ((1 << bd ) - 1 );
990
+
991
+ do {
992
+ const int16_t * s = (const int16_t * )src ;
993
+ uint16_t * d = dst ;
994
+ int height = h ;
995
+
996
+ int16x8_t h_s0 [4 ], h_s1 [4 ], h_s2 [4 ];
997
+ load_s16_8x4 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ]);
998
+ load_s16_8x4 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ]);
999
+ load_s16_8x4 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ]);
1000
+
1001
+ int16x8_t v_s0 = vreinterpretq_s16_u16 (
1002
+ highbd_convolve4_8 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], x_filter , max ));
1003
+ int16x8_t v_s1 = vreinterpretq_s16_u16 (
1004
+ highbd_convolve4_8 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], x_filter , max ));
1005
+ int16x8_t v_s2 = vreinterpretq_s16_u16 (
1006
+ highbd_convolve4_8 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], x_filter , max ));
1007
+
1008
+ s += 3 * src_stride ;
1009
+
1010
+ do {
1011
+ int16x8_t h_s3 [4 ], h_s4 [4 ], h_s5 [4 ], h_s6 [4 ];
1012
+ load_s16_8x4 (s + 0 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ],
1013
+ & h_s3 [3 ]);
1014
+ load_s16_8x4 (s + 1 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ],
1015
+ & h_s4 [3 ]);
1016
+ load_s16_8x4 (s + 2 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ],
1017
+ & h_s5 [3 ]);
1018
+ load_s16_8x4 (s + 3 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ],
1019
+ & h_s6 [3 ]);
1020
+
1021
+ int16x8_t v_s3 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1022
+ h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], x_filter , max ));
1023
+ int16x8_t v_s4 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1024
+ h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], x_filter , max ));
1025
+ int16x8_t v_s5 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1026
+ h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], x_filter , max ));
1027
+ int16x8_t v_s6 = vreinterpretq_s16_u16 (highbd_convolve4_8 (
1028
+ h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], x_filter , max ));
1029
+
1030
+ uint16x8_t d0 = highbd_convolve4_8 (v_s0 , v_s1 , v_s2 , v_s3 , y_filter , max );
1031
+ uint16x8_t d1 = highbd_convolve4_8 (v_s1 , v_s2 , v_s3 , v_s4 , y_filter , max );
1032
+ uint16x8_t d2 = highbd_convolve4_8 (v_s2 , v_s3 , v_s4 , v_s5 , y_filter , max );
1033
+ uint16x8_t d3 = highbd_convolve4_8 (v_s3 , v_s4 , v_s5 , v_s6 , y_filter , max );
1034
+
1035
+ store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
1036
+
1037
+ v_s0 = v_s4 ;
1038
+ v_s1 = v_s5 ;
1039
+ v_s2 = v_s6 ;
1040
+ s += 4 * src_stride ;
1041
+ d += 4 * dst_stride ;
1042
+ height -= 4 ;
1043
+ } while (height != 0 );
1044
+ src += 8 ;
1045
+ dst += 8 ;
1046
+ w -= 8 ;
1047
+ } while (w != 0 );
1048
+ }
1049
+
1050
+ static INLINE void highbd_convolve_2d_8tap_neon (
1051
+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
1052
+ ptrdiff_t dst_stride , int w , int h , const int16x8_t x_filter ,
1053
+ const int16x8_t y_filter , int bd ) {
1054
+ if (w == 4 ) {
1055
+ const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
1056
+ const int16_t * s = (const int16_t * )src ;
1057
+ uint16_t * d = dst ;
1058
+
1059
+ int16x4_t h_s0 [8 ], h_s1 [8 ], h_s2 [8 ], h_s3 [8 ], h_s4 [8 ], h_s5 [8 ], h_s6 [8 ];
1060
+ load_s16_4x8 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ],
1061
+ & h_s0 [4 ], & h_s0 [5 ], & h_s0 [6 ], & h_s0 [7 ]);
1062
+ load_s16_4x8 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ],
1063
+ & h_s1 [4 ], & h_s1 [5 ], & h_s1 [6 ], & h_s1 [7 ]);
1064
+ load_s16_4x8 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ],
1065
+ & h_s2 [4 ], & h_s2 [5 ], & h_s2 [6 ], & h_s2 [7 ]);
1066
+ load_s16_4x8 (s + 3 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ], & h_s3 [3 ],
1067
+ & h_s3 [4 ], & h_s3 [5 ], & h_s3 [6 ], & h_s3 [7 ]);
1068
+ load_s16_4x8 (s + 4 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ], & h_s4 [3 ],
1069
+ & h_s4 [4 ], & h_s4 [5 ], & h_s4 [6 ], & h_s4 [7 ]);
1070
+ load_s16_4x8 (s + 5 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ], & h_s5 [3 ],
1071
+ & h_s5 [4 ], & h_s5 [5 ], & h_s5 [6 ], & h_s5 [7 ]);
1072
+ load_s16_4x8 (s + 6 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ], & h_s6 [3 ],
1073
+ & h_s6 [4 ], & h_s6 [5 ], & h_s6 [6 ], & h_s6 [7 ]);
1074
+
1075
+ int16x4_t v_s0 = vreinterpret_s16_u16 (
1076
+ highbd_convolve8_4 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], h_s0 [4 ], h_s0 [5 ],
1077
+ h_s0 [6 ], h_s0 [7 ], x_filter , max ));
1078
+ int16x4_t v_s1 = vreinterpret_s16_u16 (
1079
+ highbd_convolve8_4 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], h_s1 [4 ], h_s1 [5 ],
1080
+ h_s1 [6 ], h_s1 [7 ], x_filter , max ));
1081
+ int16x4_t v_s2 = vreinterpret_s16_u16 (
1082
+ highbd_convolve8_4 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], h_s2 [4 ], h_s2 [5 ],
1083
+ h_s2 [6 ], h_s2 [7 ], x_filter , max ));
1084
+ int16x4_t v_s3 = vreinterpret_s16_u16 (
1085
+ highbd_convolve8_4 (h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], h_s3 [4 ], h_s3 [5 ],
1086
+ h_s3 [6 ], h_s3 [7 ], x_filter , max ));
1087
+ int16x4_t v_s4 = vreinterpret_s16_u16 (
1088
+ highbd_convolve8_4 (h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], h_s4 [4 ], h_s4 [5 ],
1089
+ h_s4 [6 ], h_s4 [7 ], x_filter , max ));
1090
+ int16x4_t v_s5 = vreinterpret_s16_u16 (
1091
+ highbd_convolve8_4 (h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], h_s5 [4 ], h_s5 [5 ],
1092
+ h_s5 [6 ], h_s5 [7 ], x_filter , max ));
1093
+ int16x4_t v_s6 = vreinterpret_s16_u16 (
1094
+ highbd_convolve8_4 (h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], h_s6 [4 ], h_s6 [5 ],
1095
+ h_s6 [6 ], h_s6 [7 ], x_filter , max ));
1096
+
1097
+ s += 7 * src_stride ;
1098
+
1099
+ do {
1100
+ int16x4_t h_s7 [8 ], h_s8 [8 ], h_s9 [8 ], h_s10 [8 ];
1101
+ load_s16_4x8 (s + 0 * src_stride , 1 , & h_s7 [0 ], & h_s7 [1 ], & h_s7 [2 ],
1102
+ & h_s7 [3 ], & h_s7 [4 ], & h_s7 [5 ], & h_s7 [6 ], & h_s7 [7 ]);
1103
+ load_s16_4x8 (s + 1 * src_stride , 1 , & h_s8 [0 ], & h_s8 [1 ], & h_s8 [2 ],
1104
+ & h_s8 [3 ], & h_s8 [4 ], & h_s8 [5 ], & h_s8 [6 ], & h_s8 [7 ]);
1105
+ load_s16_4x8 (s + 2 * src_stride , 1 , & h_s9 [0 ], & h_s9 [1 ], & h_s9 [2 ],
1106
+ & h_s9 [3 ], & h_s9 [4 ], & h_s9 [5 ], & h_s9 [6 ], & h_s9 [7 ]);
1107
+ load_s16_4x8 (s + 3 * src_stride , 1 , & h_s10 [0 ], & h_s10 [1 ], & h_s10 [2 ],
1108
+ & h_s10 [3 ], & h_s10 [4 ], & h_s10 [5 ], & h_s10 [6 ], & h_s10 [7 ]);
1109
+
1110
+ int16x4_t v_s7 = vreinterpret_s16_u16 (
1111
+ highbd_convolve8_4 (h_s7 [0 ], h_s7 [1 ], h_s7 [2 ], h_s7 [3 ], h_s7 [4 ],
1112
+ h_s7 [5 ], h_s7 [6 ], h_s7 [7 ], x_filter , max ));
1113
+ int16x4_t v_s8 = vreinterpret_s16_u16 (
1114
+ highbd_convolve8_4 (h_s8 [0 ], h_s8 [1 ], h_s8 [2 ], h_s8 [3 ], h_s8 [4 ],
1115
+ h_s8 [5 ], h_s8 [6 ], h_s8 [7 ], x_filter , max ));
1116
+ int16x4_t v_s9 = vreinterpret_s16_u16 (
1117
+ highbd_convolve8_4 (h_s9 [0 ], h_s9 [1 ], h_s9 [2 ], h_s9 [3 ], h_s9 [4 ],
1118
+ h_s9 [5 ], h_s9 [6 ], h_s9 [7 ], x_filter , max ));
1119
+ int16x4_t v_s10 = vreinterpret_s16_u16 (
1120
+ highbd_convolve8_4 (h_s10 [0 ], h_s10 [1 ], h_s10 [2 ], h_s10 [3 ], h_s10 [4 ],
1121
+ h_s10 [5 ], h_s10 [6 ], h_s10 [7 ], x_filter , max ));
1122
+
1123
+ uint16x4_t d0 = highbd_convolve8_4 (v_s0 , v_s1 , v_s2 , v_s3 , v_s4 , v_s5 ,
1124
+ v_s6 , v_s7 , y_filter , max );
1125
+ uint16x4_t d1 = highbd_convolve8_4 (v_s1 , v_s2 , v_s3 , v_s4 , v_s5 , v_s6 ,
1126
+ v_s7 , v_s8 , y_filter , max );
1127
+ uint16x4_t d2 = highbd_convolve8_4 (v_s2 , v_s3 , v_s4 , v_s5 , v_s6 , v_s7 ,
1128
+ v_s8 , v_s9 , y_filter , max );
1129
+ uint16x4_t d3 = highbd_convolve8_4 (v_s3 , v_s4 , v_s5 , v_s6 , v_s7 , v_s8 ,
1130
+ v_s9 , v_s10 , y_filter , max );
1131
+
1132
+ store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
1133
+
1134
+ v_s0 = v_s4 ;
1135
+ v_s1 = v_s5 ;
1136
+ v_s2 = v_s6 ;
1137
+ v_s3 = v_s7 ;
1138
+ v_s4 = v_s8 ;
1139
+ v_s5 = v_s9 ;
1140
+ v_s6 = v_s10 ;
1141
+ s += 4 * src_stride ;
1142
+ d += 4 * dst_stride ;
1143
+ h -= 4 ;
1144
+ } while (h != 0 );
1145
+
1146
+ return ;
1147
+ }
1148
+
1149
+ const uint16x8_t max = vdupq_n_u16 ((1 << bd ) - 1 );
1150
+
1151
+ do {
1152
+ const int16_t * s = (const int16_t * )src ;
1153
+ uint16_t * d = dst ;
1154
+ int height = h ;
1155
+
1156
+ int16x8_t h_s0 [8 ], h_s1 [8 ], h_s2 [8 ], h_s3 [8 ], h_s4 [8 ], h_s5 [8 ], h_s6 [8 ];
1157
+ load_s16_8x8 (s + 0 * src_stride , 1 , & h_s0 [0 ], & h_s0 [1 ], & h_s0 [2 ], & h_s0 [3 ],
1158
+ & h_s0 [4 ], & h_s0 [5 ], & h_s0 [6 ], & h_s0 [7 ]);
1159
+ load_s16_8x8 (s + 1 * src_stride , 1 , & h_s1 [0 ], & h_s1 [1 ], & h_s1 [2 ], & h_s1 [3 ],
1160
+ & h_s1 [4 ], & h_s1 [5 ], & h_s1 [6 ], & h_s1 [7 ]);
1161
+ load_s16_8x8 (s + 2 * src_stride , 1 , & h_s2 [0 ], & h_s2 [1 ], & h_s2 [2 ], & h_s2 [3 ],
1162
+ & h_s2 [4 ], & h_s2 [5 ], & h_s2 [6 ], & h_s2 [7 ]);
1163
+ load_s16_8x8 (s + 3 * src_stride , 1 , & h_s3 [0 ], & h_s3 [1 ], & h_s3 [2 ], & h_s3 [3 ],
1164
+ & h_s3 [4 ], & h_s3 [5 ], & h_s3 [6 ], & h_s3 [7 ]);
1165
+ load_s16_8x8 (s + 4 * src_stride , 1 , & h_s4 [0 ], & h_s4 [1 ], & h_s4 [2 ], & h_s4 [3 ],
1166
+ & h_s4 [4 ], & h_s4 [5 ], & h_s4 [6 ], & h_s4 [7 ]);
1167
+ load_s16_8x8 (s + 5 * src_stride , 1 , & h_s5 [0 ], & h_s5 [1 ], & h_s5 [2 ], & h_s5 [3 ],
1168
+ & h_s5 [4 ], & h_s5 [5 ], & h_s5 [6 ], & h_s5 [7 ]);
1169
+ load_s16_8x8 (s + 6 * src_stride , 1 , & h_s6 [0 ], & h_s6 [1 ], & h_s6 [2 ], & h_s6 [3 ],
1170
+ & h_s6 [4 ], & h_s6 [5 ], & h_s6 [6 ], & h_s6 [7 ]);
1171
+
1172
+ int16x8_t v_s0 = vreinterpretq_s16_u16 (
1173
+ highbd_convolve8_8 (h_s0 [0 ], h_s0 [1 ], h_s0 [2 ], h_s0 [3 ], h_s0 [4 ], h_s0 [5 ],
1174
+ h_s0 [6 ], h_s0 [7 ], x_filter , max ));
1175
+ int16x8_t v_s1 = vreinterpretq_s16_u16 (
1176
+ highbd_convolve8_8 (h_s1 [0 ], h_s1 [1 ], h_s1 [2 ], h_s1 [3 ], h_s1 [4 ], h_s1 [5 ],
1177
+ h_s1 [6 ], h_s1 [7 ], x_filter , max ));
1178
+ int16x8_t v_s2 = vreinterpretq_s16_u16 (
1179
+ highbd_convolve8_8 (h_s2 [0 ], h_s2 [1 ], h_s2 [2 ], h_s2 [3 ], h_s2 [4 ], h_s2 [5 ],
1180
+ h_s2 [6 ], h_s2 [7 ], x_filter , max ));
1181
+ int16x8_t v_s3 = vreinterpretq_s16_u16 (
1182
+ highbd_convolve8_8 (h_s3 [0 ], h_s3 [1 ], h_s3 [2 ], h_s3 [3 ], h_s3 [4 ], h_s3 [5 ],
1183
+ h_s3 [6 ], h_s3 [7 ], x_filter , max ));
1184
+ int16x8_t v_s4 = vreinterpretq_s16_u16 (
1185
+ highbd_convolve8_8 (h_s4 [0 ], h_s4 [1 ], h_s4 [2 ], h_s4 [3 ], h_s4 [4 ], h_s4 [5 ],
1186
+ h_s4 [6 ], h_s4 [7 ], x_filter , max ));
1187
+ int16x8_t v_s5 = vreinterpretq_s16_u16 (
1188
+ highbd_convolve8_8 (h_s5 [0 ], h_s5 [1 ], h_s5 [2 ], h_s5 [3 ], h_s5 [4 ], h_s5 [5 ],
1189
+ h_s5 [6 ], h_s5 [7 ], x_filter , max ));
1190
+ int16x8_t v_s6 = vreinterpretq_s16_u16 (
1191
+ highbd_convolve8_8 (h_s6 [0 ], h_s6 [1 ], h_s6 [2 ], h_s6 [3 ], h_s6 [4 ], h_s6 [5 ],
1192
+ h_s6 [6 ], h_s6 [7 ], x_filter , max ));
1193
+
1194
+ s += 7 * src_stride ;
1195
+
1196
+ do {
1197
+ int16x8_t h_s7 [8 ], h_s8 [8 ], h_s9 [8 ], h_s10 [8 ];
1198
+ load_s16_8x8 (s + 0 * src_stride , 1 , & h_s7 [0 ], & h_s7 [1 ], & h_s7 [2 ],
1199
+ & h_s7 [3 ], & h_s7 [4 ], & h_s7 [5 ], & h_s7 [6 ], & h_s7 [7 ]);
1200
+ load_s16_8x8 (s + 1 * src_stride , 1 , & h_s8 [0 ], & h_s8 [1 ], & h_s8 [2 ],
1201
+ & h_s8 [3 ], & h_s8 [4 ], & h_s8 [5 ], & h_s8 [6 ], & h_s8 [7 ]);
1202
+ load_s16_8x8 (s + 2 * src_stride , 1 , & h_s9 [0 ], & h_s9 [1 ], & h_s9 [2 ],
1203
+ & h_s9 [3 ], & h_s9 [4 ], & h_s9 [5 ], & h_s9 [6 ], & h_s9 [7 ]);
1204
+ load_s16_8x8 (s + 3 * src_stride , 1 , & h_s10 [0 ], & h_s10 [1 ], & h_s10 [2 ],
1205
+ & h_s10 [3 ], & h_s10 [4 ], & h_s10 [5 ], & h_s10 [6 ], & h_s10 [7 ]);
1206
+
1207
+ int16x8_t v_s7 = vreinterpretq_s16_u16 (
1208
+ highbd_convolve8_8 (h_s7 [0 ], h_s7 [1 ], h_s7 [2 ], h_s7 [3 ], h_s7 [4 ],
1209
+ h_s7 [5 ], h_s7 [6 ], h_s7 [7 ], x_filter , max ));
1210
+ int16x8_t v_s8 = vreinterpretq_s16_u16 (
1211
+ highbd_convolve8_8 (h_s8 [0 ], h_s8 [1 ], h_s8 [2 ], h_s8 [3 ], h_s8 [4 ],
1212
+ h_s8 [5 ], h_s8 [6 ], h_s8 [7 ], x_filter , max ));
1213
+ int16x8_t v_s9 = vreinterpretq_s16_u16 (
1214
+ highbd_convolve8_8 (h_s9 [0 ], h_s9 [1 ], h_s9 [2 ], h_s9 [3 ], h_s9 [4 ],
1215
+ h_s9 [5 ], h_s9 [6 ], h_s9 [7 ], x_filter , max ));
1216
+ int16x8_t v_s10 = vreinterpretq_s16_u16 (
1217
+ highbd_convolve8_8 (h_s10 [0 ], h_s10 [1 ], h_s10 [2 ], h_s10 [3 ], h_s10 [4 ],
1218
+ h_s10 [5 ], h_s10 [6 ], h_s10 [7 ], x_filter , max ));
1219
+
1220
+ uint16x8_t d0 = highbd_convolve8_8 (v_s0 , v_s1 , v_s2 , v_s3 , v_s4 , v_s5 ,
1221
+ v_s6 , v_s7 , y_filter , max );
1222
+ uint16x8_t d1 = highbd_convolve8_8 (v_s1 , v_s2 , v_s3 , v_s4 , v_s5 , v_s6 ,
1223
+ v_s7 , v_s8 , y_filter , max );
1224
+ uint16x8_t d2 = highbd_convolve8_8 (v_s2 , v_s3 , v_s4 , v_s5 , v_s6 , v_s7 ,
1225
+ v_s8 , v_s9 , y_filter , max );
1226
+ uint16x8_t d3 = highbd_convolve8_8 (v_s3 , v_s4 , v_s5 , v_s6 , v_s7 , v_s8 ,
1227
+ v_s9 , v_s10 , y_filter , max );
1228
+
1229
+ store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
1230
+
1231
+ v_s0 = v_s4 ;
1232
+ v_s1 = v_s5 ;
1233
+ v_s2 = v_s6 ;
1234
+ v_s3 = v_s7 ;
1235
+ v_s4 = v_s8 ;
1236
+ v_s5 = v_s9 ;
1237
+ v_s6 = v_s10 ;
1238
+ s += 4 * src_stride ;
1239
+ d += 4 * dst_stride ;
1240
+ height -= 4 ;
1241
+ } while (height != 0 );
1242
+ src += 8 ;
1243
+ dst += 8 ;
1244
+ w -= 8 ;
1245
+ } while (w != 0 );
1246
+ }
1247
+
928
1248
void vpx_highbd_convolve8_neon (const uint16_t * src , ptrdiff_t src_stride ,
929
1249
uint16_t * dst , ptrdiff_t dst_stride ,
930
1250
const InterpKernel * filter , int x0_q4 ,
@@ -936,24 +1256,27 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
936
1256
return ;
937
1257
}
938
1258
939
- // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the
940
- // maximum buffer size to 64 * (64 + 7).
941
- DECLARE_ALIGNED (32 , uint16_t , im_block [64 * 71 ]);
942
- const int im_stride = 64 ;
1259
+ const int x_filter_taps = vpx_get_filter_taps (filter [x0_q4 ]) <= 4 ? 4 : 8 ;
1260
+ const int y_filter_taps = vpx_get_filter_taps (filter [y0_q4 ]) <= 4 ? 4 : 8 ;
1261
+ // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2
1262
+ // lines post both horizontally and vertically.
1263
+ const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1 ;
1264
+ const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1 ) * src_stride ;
943
1265
944
- const int vert_filter_taps = vpx_get_filter_taps (filter [y0_q4 ]) <= 4 ? 4 : 8 ;
945
- // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
946
- // and vert_filter_taps / 2 lines post.
947
- const int im_height = h + vert_filter_taps - 1 ;
948
- const ptrdiff_t border_offset = vert_filter_taps / 2 - 1 ;
1266
+ if (x_filter_taps == 4 && y_filter_taps == 4 ) {
1267
+ const int16x4_t x_filter = vld1_s16 (filter [x0_q4 ] + 2 );
1268
+ const int16x4_t y_filter = vld1_s16 (filter [y0_q4 ] + 2 );
949
1269
950
- highbd_convolve8_2d_horiz_neon (src - src_stride * border_offset , src_stride ,
951
- im_block , im_stride , filter , x0_q4 , x_step_q4 ,
952
- y0_q4 , y_step_q4 , w , im_height , bd );
1270
+ highbd_convolve_2d_4tap_neon (src - horiz_offset - vert_offset , src_stride ,
1271
+ dst , dst_stride , w , h , x_filter , y_filter , bd );
1272
+ return ;
1273
+ }
1274
+
1275
+ const int16x8_t x_filter = vld1q_s16 (filter [x0_q4 ]);
1276
+ const int16x8_t y_filter = vld1q_s16 (filter [y0_q4 ]);
953
1277
954
- vpx_highbd_convolve8_vert_neon (im_block + im_stride * border_offset ,
955
- im_stride , dst , dst_stride , filter , x0_q4 ,
956
- x_step_q4 , y0_q4 , y_step_q4 , w , h , bd );
1278
+ highbd_convolve_2d_8tap_neon (src - horiz_offset - vert_offset , src_stride ,
1279
+ dst , dst_stride , w , h , x_filter , y_filter , bd );
957
1280
}
958
1281
959
1282
void vpx_highbd_convolve8_avg_neon (const uint16_t * src , ptrdiff_t src_stride ,
0 commit comments