@@ -672,23 +672,115 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
672
672
}
673
673
}
674
674
675
- static INLINE void vpx_highbd_convolve8_2d_horiz_neon (
675
+ static INLINE void highbd_convolve_4tap_2d_horiz_neon (
676
676
const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
677
- ptrdiff_t dst_stride , const InterpKernel * filter , int x0_q4 , int x_step_q4 ,
678
- int y0_q4 , int y_step_q4 , int w , int h , int bd ) {
679
- assert ((intptr_t )dst % 4 == 0 );
680
- assert (dst_stride % 4 == 0 );
681
- assert (x_step_q4 == 16 );
682
- assert (h % 4 == 3 && h >= 7 );
677
+ ptrdiff_t dst_stride , int w , int h , const int16x4_t filter , int bd ) {
678
+ if (w == 4 ) {
679
+ const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
680
+ const int16_t * s = (const int16_t * )src ;
681
+ uint16_t * d = dst ;
683
682
684
- (void )x_step_q4 ;
685
- (void )y0_q4 ;
686
- (void )y_step_q4 ;
683
+ do {
684
+ int16x4_t s0 [4 ], s1 [4 ], s2 [4 ], s3 [4 ];
685
+ load_s16_4x4 (s + 0 * src_stride , 1 , & s0 [0 ], & s0 [1 ], & s0 [2 ], & s0 [3 ]);
686
+ load_s16_4x4 (s + 1 * src_stride , 1 , & s1 [0 ], & s1 [1 ], & s1 [2 ], & s1 [3 ]);
687
+ load_s16_4x4 (s + 2 * src_stride , 1 , & s2 [0 ], & s2 [1 ], & s2 [2 ], & s2 [3 ]);
688
+ load_s16_4x4 (s + 3 * src_stride , 1 , & s3 [0 ], & s3 [1 ], & s3 [2 ], & s3 [3 ]);
687
689
688
- const int16x8_t filters = vld1q_s16 (filter [x0_q4 ]);
690
+ uint16x4_t d0 =
691
+ highbd_convolve4_4 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], filter , max );
692
+ uint16x4_t d1 =
693
+ highbd_convolve4_4 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], filter , max );
694
+ uint16x4_t d2 =
695
+ highbd_convolve4_4 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], filter , max );
696
+ uint16x4_t d3 =
697
+ highbd_convolve4_4 (s3 [0 ], s3 [1 ], s3 [2 ], s3 [3 ], filter , max );
689
698
690
- src -= 3 ;
699
+ store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
700
+
701
+ s += 4 * src_stride ;
702
+ d += 4 * dst_stride ;
703
+ h -= 4 ;
704
+ } while (h != 3 );
705
+
706
+ // Process final three rows (h % 4 == 3). See vpx_highbd_convolve8_neon()
707
+ // below for further details on possible values of block height.
708
+ int16x4_t s0 [4 ], s1 [4 ], s2 [4 ];
709
+ load_s16_4x4 (s + 0 * src_stride , 1 , & s0 [0 ], & s0 [1 ], & s0 [2 ], & s0 [3 ]);
710
+ load_s16_4x4 (s + 1 * src_stride , 1 , & s1 [0 ], & s1 [1 ], & s1 [2 ], & s1 [3 ]);
711
+ load_s16_4x4 (s + 2 * src_stride , 1 , & s2 [0 ], & s2 [1 ], & s2 [2 ], & s2 [3 ]);
712
+
713
+ uint16x4_t d0 = highbd_convolve4_4 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], filter , max );
714
+ uint16x4_t d1 = highbd_convolve4_4 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], filter , max );
715
+ uint16x4_t d2 = highbd_convolve4_4 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], filter , max );
716
+
717
+ store_u16_4x3 (d , dst_stride , d0 , d1 , d2 );
718
+ } else {
719
+ const uint16x8_t max = vdupq_n_u16 ((1 << bd ) - 1 );
720
+
721
+ do {
722
+ const int16_t * s = (const int16_t * )src ;
723
+ uint16_t * d = dst ;
724
+ int width = w ;
725
+
726
+ do {
727
+ int16x8_t s0 [4 ], s1 [4 ], s2 [4 ], s3 [4 ];
728
+ load_s16_8x4 (s + 0 * src_stride , 1 , & s0 [0 ], & s0 [1 ], & s0 [2 ], & s0 [3 ]);
729
+ load_s16_8x4 (s + 1 * src_stride , 1 , & s1 [0 ], & s1 [1 ], & s1 [2 ], & s1 [3 ]);
730
+ load_s16_8x4 (s + 2 * src_stride , 1 , & s2 [0 ], & s2 [1 ], & s2 [2 ], & s2 [3 ]);
731
+ load_s16_8x4 (s + 3 * src_stride , 1 , & s3 [0 ], & s3 [1 ], & s3 [2 ], & s3 [3 ]);
732
+
733
+ uint16x8_t d0 =
734
+ highbd_convolve4_8 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], filter , max );
735
+ uint16x8_t d1 =
736
+ highbd_convolve4_8 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], filter , max );
737
+ uint16x8_t d2 =
738
+ highbd_convolve4_8 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], filter , max );
739
+ uint16x8_t d3 =
740
+ highbd_convolve4_8 (s3 [0 ], s3 [1 ], s3 [2 ], s3 [3 ], filter , max );
741
+
742
+ store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
743
+
744
+ s += 8 ;
745
+ d += 8 ;
746
+ width -= 8 ;
747
+ } while (width != 0 );
748
+ src += 4 * src_stride ;
749
+ dst += 4 * dst_stride ;
750
+ h -= 4 ;
751
+ } while (h != 3 );
752
+
753
+ // Process final three rows (h % 4 == 3). See vpx_highbd_convolve8_neon()
754
+ // below for further details on possible values of block height.
755
+ const int16_t * s = (const int16_t * )src ;
756
+ uint16_t * d = dst ;
757
+ int width = w ;
691
758
759
+ do {
760
+ int16x8_t s0 [4 ], s1 [4 ], s2 [4 ];
761
+ load_s16_8x4 (s + 0 * src_stride , 1 , & s0 [0 ], & s0 [1 ], & s0 [2 ], & s0 [3 ]);
762
+ load_s16_8x4 (s + 1 * src_stride , 1 , & s1 [0 ], & s1 [1 ], & s1 [2 ], & s1 [3 ]);
763
+ load_s16_8x4 (s + 2 * src_stride , 1 , & s2 [0 ], & s2 [1 ], & s2 [2 ], & s2 [3 ]);
764
+
765
+ uint16x8_t d0 =
766
+ highbd_convolve4_8 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], filter , max );
767
+ uint16x8_t d1 =
768
+ highbd_convolve4_8 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], filter , max );
769
+ uint16x8_t d2 =
770
+ highbd_convolve4_8 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], filter , max );
771
+
772
+ store_u16_8x3 (d , dst_stride , d0 , d1 , d2 );
773
+
774
+ s += 8 ;
775
+ d += 8 ;
776
+ width -= 8 ;
777
+ } while (width != 0 );
778
+ }
779
+ }
780
+
781
+ static INLINE void highbd_convolve_8tap_2d_horiz_neon (
782
+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
783
+ ptrdiff_t dst_stride , int w , int h , const int16x8_t filter , int bd ) {
692
784
if (w == 4 ) {
693
785
const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
694
786
const int16_t * s = (const int16_t * )src ;
@@ -706,13 +798,13 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
706
798
& s3 [4 ], & s3 [5 ], & s3 [6 ], & s3 [7 ]);
707
799
708
800
uint16x4_t d0 = highbd_convolve8_4 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], s0 [4 ],
709
- s0 [5 ], s0 [6 ], s0 [7 ], filters , max );
801
+ s0 [5 ], s0 [6 ], s0 [7 ], filter , max );
710
802
uint16x4_t d1 = highbd_convolve8_4 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], s1 [4 ],
711
- s1 [5 ], s1 [6 ], s1 [7 ], filters , max );
803
+ s1 [5 ], s1 [6 ], s1 [7 ], filter , max );
712
804
uint16x4_t d2 = highbd_convolve8_4 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], s2 [4 ],
713
- s2 [5 ], s2 [6 ], s2 [7 ], filters , max );
805
+ s2 [5 ], s2 [6 ], s2 [7 ], filter , max );
714
806
uint16x4_t d3 = highbd_convolve8_4 (s3 [0 ], s3 [1 ], s3 [2 ], s3 [3 ], s3 [4 ],
715
- s3 [5 ], s3 [6 ], s3 [7 ], filters , max );
807
+ s3 [5 ], s3 [6 ], s3 [7 ], filter , max );
716
808
717
809
store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
718
810
@@ -732,11 +824,11 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
732
824
& s2 [5 ], & s2 [6 ], & s2 [7 ]);
733
825
734
826
uint16x4_t d0 = highbd_convolve8_4 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], s0 [4 ], s0 [5 ],
735
- s0 [6 ], s0 [7 ], filters , max );
827
+ s0 [6 ], s0 [7 ], filter , max );
736
828
uint16x4_t d1 = highbd_convolve8_4 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], s1 [4 ], s1 [5 ],
737
- s1 [6 ], s1 [7 ], filters , max );
829
+ s1 [6 ], s1 [7 ], filter , max );
738
830
uint16x4_t d2 = highbd_convolve8_4 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], s2 [4 ], s2 [5 ],
739
- s2 [6 ], s2 [7 ], filters , max );
831
+ s2 [6 ], s2 [7 ], filter , max );
740
832
741
833
store_u16_4x3 (d , dst_stride , d0 , d1 , d2 );
742
834
} else {
@@ -759,13 +851,13 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
759
851
& s3 [4 ], & s3 [5 ], & s3 [6 ], & s3 [7 ]);
760
852
761
853
uint16x8_t d0 = highbd_convolve8_8 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], s0 [4 ],
762
- s0 [5 ], s0 [6 ], s0 [7 ], filters , max );
854
+ s0 [5 ], s0 [6 ], s0 [7 ], filter , max );
763
855
uint16x8_t d1 = highbd_convolve8_8 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], s1 [4 ],
764
- s1 [5 ], s1 [6 ], s1 [7 ], filters , max );
856
+ s1 [5 ], s1 [6 ], s1 [7 ], filter , max );
765
857
uint16x8_t d2 = highbd_convolve8_8 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], s2 [4 ],
766
- s2 [5 ], s2 [6 ], s2 [7 ], filters , max );
858
+ s2 [5 ], s2 [6 ], s2 [7 ], filter , max );
767
859
uint16x8_t d3 = highbd_convolve8_8 (s3 [0 ], s3 [1 ], s3 [2 ], s3 [3 ], s3 [4 ],
768
- s3 [5 ], s3 [6 ], s3 [7 ], filters , max );
860
+ s3 [5 ], s3 [6 ], s3 [7 ], filter , max );
769
861
770
862
store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
771
863
@@ -794,11 +886,11 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
794
886
& s2 [4 ], & s2 [5 ], & s2 [6 ], & s2 [7 ]);
795
887
796
888
uint16x8_t d0 = highbd_convolve8_8 (s0 [0 ], s0 [1 ], s0 [2 ], s0 [3 ], s0 [4 ],
797
- s0 [5 ], s0 [6 ], s0 [7 ], filters , max );
889
+ s0 [5 ], s0 [6 ], s0 [7 ], filter , max );
798
890
uint16x8_t d1 = highbd_convolve8_8 (s1 [0 ], s1 [1 ], s1 [2 ], s1 [3 ], s1 [4 ],
799
- s1 [5 ], s1 [6 ], s1 [7 ], filters , max );
891
+ s1 [5 ], s1 [6 ], s1 [7 ], filter , max );
800
892
uint16x8_t d2 = highbd_convolve8_8 (s2 [0 ], s2 [1 ], s2 [2 ], s2 [3 ], s2 [4 ],
801
- s2 [5 ], s2 [6 ], s2 [7 ], filters , max );
893
+ s2 [5 ], s2 [6 ], s2 [7 ], filter , max );
802
894
803
895
store_u16_8x3 (d , dst_stride , d0 , d1 , d2 );
804
896
@@ -809,6 +901,30 @@ static INLINE void vpx_highbd_convolve8_2d_horiz_neon(
809
901
}
810
902
}
811
903
904
+ static INLINE void highbd_convolve8_2d_horiz_neon (
905
+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
906
+ ptrdiff_t dst_stride , const InterpKernel * filter , int x0_q4 , int x_step_q4 ,
907
+ int y0_q4 , int y_step_q4 , int w , int h , int bd ) {
908
+ assert ((intptr_t )dst % 4 == 0 );
909
+ assert (dst_stride % 4 == 0 );
910
+ assert (x_step_q4 == 16 );
911
+ assert (h % 4 == 3 && h >= 7 );
912
+
913
+ (void )x_step_q4 ;
914
+ (void )y0_q4 ;
915
+ (void )y_step_q4 ;
916
+
917
+ if (vpx_get_filter_taps (filter [x0_q4 ]) <= 4 ) {
918
+ const int16x4_t x_filter_4tap = vld1_s16 (filter [x0_q4 ] + 2 );
919
+ highbd_convolve_4tap_2d_horiz_neon (src - 1 , src_stride , dst , dst_stride , w ,
920
+ h , x_filter_4tap , bd );
921
+ } else {
922
+ const int16x8_t x_filter_8tap = vld1q_s16 (filter [x0_q4 ]);
923
+ highbd_convolve_8tap_2d_horiz_neon (src - 3 , src_stride , dst , dst_stride , w ,
924
+ h , x_filter_8tap , bd );
925
+ }
926
+ }
927
+
812
928
void vpx_highbd_convolve8_neon (const uint16_t * src , ptrdiff_t src_stride ,
813
929
uint16_t * dst , ptrdiff_t dst_stride ,
814
930
const InterpKernel * filter , int x0_q4 ,
@@ -825,14 +941,15 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,
825
941
DECLARE_ALIGNED (32 , uint16_t , im_block [64 * 71 ]);
826
942
const int im_stride = 64 ;
827
943
828
- // Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior and
829
- // SUBPEL_TAPS / 2 lines post.
830
- const int im_height = h + SUBPEL_TAPS - 1 ;
831
- const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1 ;
944
+ const int vert_filter_taps = vpx_get_filter_taps (filter [y0_q4 ]) <= 4 ? 4 : 8 ;
945
+ // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior
946
+ // and vert_filter_taps / 2 lines post.
947
+ const int im_height = h + vert_filter_taps - 1 ;
948
+ const ptrdiff_t border_offset = vert_filter_taps / 2 - 1 ;
832
949
833
- vpx_highbd_convolve8_2d_horiz_neon (
834
- src - src_stride * border_offset , src_stride , im_block , im_stride , filter ,
835
- x0_q4 , x_step_q4 , y0_q4 , y_step_q4 , w , im_height , bd );
950
+ highbd_convolve8_2d_horiz_neon ( src - src_stride * border_offset , src_stride ,
951
+ im_block , im_stride , filter , x0_q4 , x_step_q4 ,
952
+ y0_q4 , y_step_q4 , w , im_height , bd );
836
953
837
954
vpx_highbd_convolve8_vert_neon (im_block + im_stride * border_offset ,
838
955
im_stride , dst , dst_stride , filter , x0_q4 ,
@@ -853,12 +970,13 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
853
970
// See above for buffer size derivation.
854
971
DECLARE_ALIGNED (32 , uint16_t , im_block [64 * 71 ]);
855
972
const int im_stride = 64 ;
973
+ // Averaging convolution always uses an 8-tap filter.
856
974
const int im_height = h + SUBPEL_TAPS - 1 ;
857
975
const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1 ;
858
976
859
- vpx_highbd_convolve8_2d_horiz_neon (
860
- src - src_stride * border_offset , src_stride , im_block , im_stride , filter ,
861
- x0_q4 , x_step_q4 , y0_q4 , y_step_q4 , w , im_height , bd );
977
+ highbd_convolve8_2d_horiz_neon ( src - src_stride * border_offset , src_stride ,
978
+ im_block , im_stride , filter , x0_q4 , x_step_q4 ,
979
+ y0_q4 , y_step_q4 , w , im_height , bd );
862
980
863
981
vpx_highbd_convolve8_avg_vert_neon (im_block + im_stride * border_offset ,
864
982
im_stride , dst , dst_stride , filter , x0_q4 ,
0 commit comments