@@ -376,24 +376,78 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
376
376
}
377
377
}
378
378
379
- void vpx_highbd_convolve8_vert_neon (const uint16_t * src , ptrdiff_t src_stride ,
380
- uint16_t * dst , ptrdiff_t dst_stride ,
381
- const InterpKernel * filter , int x0_q4 ,
382
- int x_step_q4 , int y0_q4 , int y_step_q4 ,
383
- int w , int h , int bd ) {
384
- if (y_step_q4 != 16 ) {
385
- vpx_highbd_convolve8_vert_c (src , src_stride , dst , dst_stride , filter , x0_q4 ,
386
- x_step_q4 , y0_q4 , y_step_q4 , w , h , bd );
387
- return ;
388
- }
379
+ static INLINE void highbd_convolve_4tap_vert_neon (
380
+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
381
+ ptrdiff_t dst_stride , int w , int h , const int16x4_t filter , int bd ) {
382
+ if (w == 4 ) {
383
+ const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
384
+ const int16_t * s = (const int16_t * )src ;
385
+ uint16_t * d = dst ;
389
386
390
- assert (( intptr_t ) dst % 4 == 0 ) ;
391
- assert ( dst_stride % 4 == 0 );
387
+ int16x4_t s0 , s1 , s2 ;
388
+ load_s16_4x3 ( s , src_stride , & s0 , & s1 , & s2 );
392
389
393
- const int16x8_t filters = vld1q_s16 ( filter [ y0_q4 ]) ;
390
+ s += 3 * src_stride ;
394
391
395
- src -= 3 * src_stride ;
392
+ do {
393
+ int16x4_t s3 , s4 , s5 , s6 ;
394
+ load_s16_4x4 (s , src_stride , & s3 , & s4 , & s5 , & s6 );
395
+
396
+ uint16x4_t d0 = highbd_convolve4_4 (s0 , s1 , s2 , s3 , filter , max );
397
+ uint16x4_t d1 = highbd_convolve4_4 (s1 , s2 , s3 , s4 , filter , max );
398
+ uint16x4_t d2 = highbd_convolve4_4 (s2 , s3 , s4 , s5 , filter , max );
399
+ uint16x4_t d3 = highbd_convolve4_4 (s3 , s4 , s5 , s6 , filter , max );
400
+
401
+ store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
402
+
403
+ s0 = s4 ;
404
+ s1 = s5 ;
405
+ s2 = s6 ;
406
+ s += 4 * src_stride ;
407
+ d += 4 * dst_stride ;
408
+ h -= 4 ;
409
+ } while (h != 0 );
410
+ } else {
411
+ const uint16x8_t max = vdupq_n_u16 ((1 << bd ) - 1 );
412
+
413
+ do {
414
+ const int16_t * s = (const int16_t * )src ;
415
+ uint16_t * d = dst ;
416
+ int height = h ;
417
+
418
+ int16x8_t s0 , s1 , s2 ;
419
+ load_s16_8x3 (s , src_stride , & s0 , & s1 , & s2 );
420
+
421
+ s += 3 * src_stride ;
422
+
423
+ do {
424
+ int16x8_t s3 , s4 , s5 , s6 ;
425
+ load_s16_8x4 (s , src_stride , & s3 , & s4 , & s5 , & s6 );
426
+
427
+ uint16x8_t d0 = highbd_convolve4_8 (s0 , s1 , s2 , s3 , filter , max );
428
+ uint16x8_t d1 = highbd_convolve4_8 (s1 , s2 , s3 , s4 , filter , max );
429
+ uint16x8_t d2 = highbd_convolve4_8 (s2 , s3 , s4 , s5 , filter , max );
430
+ uint16x8_t d3 = highbd_convolve4_8 (s3 , s4 , s5 , s6 , filter , max );
431
+
432
+ store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
396
433
434
+ s0 = s4 ;
435
+ s1 = s5 ;
436
+ s2 = s6 ;
437
+ s += 4 * src_stride ;
438
+ d += 4 * dst_stride ;
439
+ height -= 4 ;
440
+ } while (height != 0 );
441
+ src += 8 ;
442
+ dst += 8 ;
443
+ w -= 8 ;
444
+ } while (w != 0 );
445
+ }
446
+ }
447
+
448
+ static INLINE void highbd_convolve_8tap_vert_neon (
449
+ const uint16_t * src , ptrdiff_t src_stride , uint16_t * dst ,
450
+ ptrdiff_t dst_stride , int w , int h , const int16x8_t filter , int bd ) {
397
451
if (w == 4 ) {
398
452
const uint16x4_t max = vdup_n_u16 ((1 << bd ) - 1 );
399
453
const int16_t * s = (const int16_t * )src ;
@@ -409,13 +463,13 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
409
463
load_s16_4x4 (s , src_stride , & s7 , & s8 , & s9 , & s10 );
410
464
411
465
uint16x4_t d0 =
412
- highbd_convolve8_4 (s0 , s1 , s2 , s3 , s4 , s5 , s6 , s7 , filters , max );
466
+ highbd_convolve8_4 (s0 , s1 , s2 , s3 , s4 , s5 , s6 , s7 , filter , max );
413
467
uint16x4_t d1 =
414
- highbd_convolve8_4 (s1 , s2 , s3 , s4 , s5 , s6 , s7 , s8 , filters , max );
468
+ highbd_convolve8_4 (s1 , s2 , s3 , s4 , s5 , s6 , s7 , s8 , filter , max );
415
469
uint16x4_t d2 =
416
- highbd_convolve8_4 (s2 , s3 , s4 , s5 , s6 , s7 , s8 , s9 , filters , max );
470
+ highbd_convolve8_4 (s2 , s3 , s4 , s5 , s6 , s7 , s8 , s9 , filter , max );
417
471
uint16x4_t d3 =
418
- highbd_convolve8_4 (s3 , s4 , s5 , s6 , s7 , s8 , s9 , s10 , filters , max );
472
+ highbd_convolve8_4 (s3 , s4 , s5 , s6 , s7 , s8 , s9 , s10 , filter , max );
419
473
420
474
store_u16_4x4 (d , dst_stride , d0 , d1 , d2 , d3 );
421
475
@@ -448,13 +502,13 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
448
502
load_s16_8x4 (s , src_stride , & s7 , & s8 , & s9 , & s10 );
449
503
450
504
uint16x8_t d0 =
451
- highbd_convolve8_8 (s0 , s1 , s2 , s3 , s4 , s5 , s6 , s7 , filters , max );
505
+ highbd_convolve8_8 (s0 , s1 , s2 , s3 , s4 , s5 , s6 , s7 , filter , max );
452
506
uint16x8_t d1 =
453
- highbd_convolve8_8 (s1 , s2 , s3 , s4 , s5 , s6 , s7 , s8 , filters , max );
507
+ highbd_convolve8_8 (s1 , s2 , s3 , s4 , s5 , s6 , s7 , s8 , filter , max );
454
508
uint16x8_t d2 =
455
- highbd_convolve8_8 (s2 , s3 , s4 , s5 , s6 , s7 , s8 , s9 , filters , max );
509
+ highbd_convolve8_8 (s2 , s3 , s4 , s5 , s6 , s7 , s8 , s9 , filter , max );
456
510
uint16x8_t d3 =
457
- highbd_convolve8_8 (s3 , s4 , s5 , s6 , s7 , s8 , s9 , s10 , filters , max );
511
+ highbd_convolve8_8 (s3 , s4 , s5 , s6 , s7 , s8 , s9 , s10 , filter , max );
458
512
459
513
store_u16_8x4 (d , dst_stride , d0 , d1 , d2 , d3 );
460
514
@@ -476,6 +530,36 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
476
530
}
477
531
}
478
532
533
+ void vpx_highbd_convolve8_vert_neon (const uint16_t * src , ptrdiff_t src_stride ,
534
+ uint16_t * dst , ptrdiff_t dst_stride ,
535
+ const InterpKernel * filter , int x0_q4 ,
536
+ int x_step_q4 , int y0_q4 , int y_step_q4 ,
537
+ int w , int h , int bd ) {
538
+ if (y_step_q4 != 16 ) {
539
+ vpx_highbd_convolve8_vert_c (src , src_stride , dst , dst_stride , filter , x0_q4 ,
540
+ x_step_q4 , y0_q4 , y_step_q4 , w , h , bd );
541
+ return ;
542
+ }
543
+
544
+ assert ((intptr_t )dst % 4 == 0 );
545
+ assert (dst_stride % 4 == 0 );
546
+ assert (y_step_q4 == 16 );
547
+
548
+ (void )x_step_q4 ;
549
+ (void )y0_q4 ;
550
+ (void )y_step_q4 ;
551
+
552
+ if (vpx_get_filter_taps (filter [y0_q4 ]) <= 4 ) {
553
+ const int16x4_t y_filter_4tap = vld1_s16 (filter [y0_q4 ] + 2 );
554
+ highbd_convolve_4tap_vert_neon (src - src_stride , src_stride , dst ,
555
+ dst_stride , w , h , y_filter_4tap , bd );
556
+ } else {
557
+ const int16x8_t y_filter_8tap = vld1q_s16 (filter [y0_q4 ]);
558
+ highbd_convolve_8tap_vert_neon (src - 3 * src_stride , src_stride , dst ,
559
+ dst_stride , w , h , y_filter_8tap , bd );
560
+ }
561
+ }
562
+
479
563
void vpx_highbd_convolve8_avg_vert_neon (const uint16_t * src ,
480
564
ptrdiff_t src_stride , uint16_t * dst ,
481
565
ptrdiff_t dst_stride ,
0 commit comments