Skip to content

Commit 3127962

Browse files
committed
Specialise highbd Neon vert convolution for 4-tap filters
Add a highbd Neon implementation of vertical convolution specialised for executing with 4-tap filters. This new path is also used when executing with bilinear (2-tap) filters. Change-Id: I30469c7b8e6ccff31d96588a3e4c21b401f1ed09
1 parent 70b14bf commit 3127962

File tree

2 files changed

+124
-22
lines changed

2 files changed

+124
-22
lines changed

vpx_dsp/arm/highbd_vpx_convolve8_neon.c

Lines changed: 106 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -376,24 +376,78 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
376376
}
377377
}
378378

379-
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
380-
uint16_t *dst, ptrdiff_t dst_stride,
381-
const InterpKernel *filter, int x0_q4,
382-
int x_step_q4, int y0_q4, int y_step_q4,
383-
int w, int h, int bd) {
384-
if (y_step_q4 != 16) {
385-
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
386-
x_step_q4, y0_q4, y_step_q4, w, h, bd);
387-
return;
388-
}
379+
static INLINE void highbd_convolve_4tap_vert_neon(
380+
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
381+
ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) {
382+
if (w == 4) {
383+
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
384+
const int16_t *s = (const int16_t *)src;
385+
uint16_t *d = dst;
389386

390-
assert((intptr_t)dst % 4 == 0);
391-
assert(dst_stride % 4 == 0);
387+
int16x4_t s0, s1, s2;
388+
load_s16_4x3(s, src_stride, &s0, &s1, &s2);
392389

393-
const int16x8_t filters = vld1q_s16(filter[y0_q4]);
390+
s += 3 * src_stride;
394391

395-
src -= 3 * src_stride;
392+
do {
393+
int16x4_t s3, s4, s5, s6;
394+
load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6);
395+
396+
uint16x4_t d0 = highbd_convolve4_4(s0, s1, s2, s3, filter, max);
397+
uint16x4_t d1 = highbd_convolve4_4(s1, s2, s3, s4, filter, max);
398+
uint16x4_t d2 = highbd_convolve4_4(s2, s3, s4, s5, filter, max);
399+
uint16x4_t d3 = highbd_convolve4_4(s3, s4, s5, s6, filter, max);
400+
401+
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
402+
403+
s0 = s4;
404+
s1 = s5;
405+
s2 = s6;
406+
s += 4 * src_stride;
407+
d += 4 * dst_stride;
408+
h -= 4;
409+
} while (h != 0);
410+
} else {
411+
const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
412+
413+
do {
414+
const int16_t *s = (const int16_t *)src;
415+
uint16_t *d = dst;
416+
int height = h;
417+
418+
int16x8_t s0, s1, s2;
419+
load_s16_8x3(s, src_stride, &s0, &s1, &s2);
420+
421+
s += 3 * src_stride;
422+
423+
do {
424+
int16x8_t s3, s4, s5, s6;
425+
load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6);
426+
427+
uint16x8_t d0 = highbd_convolve4_8(s0, s1, s2, s3, filter, max);
428+
uint16x8_t d1 = highbd_convolve4_8(s1, s2, s3, s4, filter, max);
429+
uint16x8_t d2 = highbd_convolve4_8(s2, s3, s4, s5, filter, max);
430+
uint16x8_t d3 = highbd_convolve4_8(s3, s4, s5, s6, filter, max);
431+
432+
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
396433

434+
s0 = s4;
435+
s1 = s5;
436+
s2 = s6;
437+
s += 4 * src_stride;
438+
d += 4 * dst_stride;
439+
height -= 4;
440+
} while (height != 0);
441+
src += 8;
442+
dst += 8;
443+
w -= 8;
444+
} while (w != 0);
445+
}
446+
}
447+
448+
static INLINE void highbd_convolve_8tap_vert_neon(
449+
const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,
450+
ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) {
397451
if (w == 4) {
398452
const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
399453
const int16_t *s = (const int16_t *)src;
@@ -409,13 +463,13 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
409463
load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
410464

411465
uint16x4_t d0 =
412-
highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
466+
highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
413467
uint16x4_t d1 =
414-
highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
468+
highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
415469
uint16x4_t d2 =
416-
highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
470+
highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
417471
uint16x4_t d3 =
418-
highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
472+
highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
419473

420474
store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
421475

@@ -448,13 +502,13 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
448502
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
449503

450504
uint16x8_t d0 =
451-
highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
505+
highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max);
452506
uint16x8_t d1 =
453-
highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
507+
highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max);
454508
uint16x8_t d2 =
455-
highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
509+
highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max);
456510
uint16x8_t d3 =
457-
highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
511+
highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max);
458512

459513
store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
460514

@@ -476,6 +530,36 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
476530
}
477531
}
478532

533+
void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
534+
uint16_t *dst, ptrdiff_t dst_stride,
535+
const InterpKernel *filter, int x0_q4,
536+
int x_step_q4, int y0_q4, int y_step_q4,
537+
int w, int h, int bd) {
538+
if (y_step_q4 != 16) {
539+
vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
540+
x_step_q4, y0_q4, y_step_q4, w, h, bd);
541+
return;
542+
}
543+
544+
assert((intptr_t)dst % 4 == 0);
545+
assert(dst_stride % 4 == 0);
546+
assert(y_step_q4 == 16);
547+
548+
(void)x_step_q4;
549+
(void)y0_q4;
550+
(void)y_step_q4;
551+
552+
if (vpx_get_filter_taps(filter[y0_q4]) <= 4) {
553+
const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2);
554+
highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst,
555+
dst_stride, w, h, y_filter_4tap, bd);
556+
} else {
557+
const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]);
558+
highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst,
559+
dst_stride, w, h, y_filter_8tap, bd);
560+
}
561+
}
562+
479563
void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
480564
ptrdiff_t src_stride, uint16_t *dst,
481565
ptrdiff_t dst_stride,

vpx_dsp/arm/mem_neon.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,15 @@ static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p,
448448
vst1_u16(s, s2);
449449
}
450450

451+
static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p,
452+
int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) {
453+
*s0 = vld1_s16(s);
454+
s += p;
455+
*s1 = vld1_s16(s);
456+
s += p;
457+
*s2 = vld1_s16(s);
458+
}
459+
451460
static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p,
452461
int16x4_t *s0, int16x4_t *s1, int16x4_t *s2,
453462
int16x4_t *s3) {
@@ -491,6 +500,15 @@ static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p,
491500
*s6 = vld1_s16(s);
492501
}
493502

503+
static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p,
504+
int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) {
505+
*s0 = vld1q_s16(s);
506+
s += p;
507+
*s1 = vld1q_s16(s);
508+
s += p;
509+
*s2 = vld1q_s16(s);
510+
}
511+
494512
static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p,
495513
int16x8_t *s0, int16x8_t *s1, int16x8_t *s2,
496514
int16x8_t *s3) {

0 commit comments

Comments
 (0)