Skip to content

Commit 89cbf20

Browse files
gramner-twoorioles authored and barrbrain committed
x86: Fix clipping in high bit-depth AVX2 4x16 IDCT
Certain clips were incorrectly performed on negated values, which caused the clipped results to be off by one in both directions. Correct this by negating such values prior to clipping instead of afterwards.
1 parent 09e7123 commit 89cbf20

File tree

1 file changed

+40
-26
lines changed

1 file changed

+40
-26
lines changed

src/x86/itx16_avx2.asm

+40 -26
Original file line number | Diff line number | Diff line change
@@ -43,10 +43,14 @@ iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
4343
idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
4444
idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
4545

46-
%macro COEF_PAIR 2
46+
%macro COEF_PAIR 2-3 0
4747
pd_%1_%2: dd %1, %1, %2, %2
4848
%define pd_%1 (pd_%1_%2 + 4*0)
4949
%define pd_%2 (pd_%1_%2 + 4*2)
50+
%if %3
51+
dd -%2, -%2
52+
%define pd_%2_m%2 pd_%2
53+
%endif
5054
%endmacro
5155

5256
COEF_PAIR 201, 995
@@ -56,8 +60,8 @@ COEF_PAIR 1380, 601
5660
COEF_PAIR 1751, 2440
5761
COEF_PAIR 2598, 1189
5862
COEF_PAIR 2751, 2106
59-
COEF_PAIR 2896, 1567
60-
COEF_PAIR 2896, 3784
63+
COEF_PAIR 2896, 1567, 1
64+
COEF_PAIR 2896, 3784, 1
6165
COEF_PAIR 3035, 3513
6266
COEF_PAIR 3166, 3920
6367
COEF_PAIR 3703, 3290
@@ -217,7 +221,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
217221

218222
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
219223
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
220-
; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2
224+
; flags: 1 = packed, 2 = inv_dst2
221225
; skip round/shift if rnd is not a number
222226
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
223227
%if %8 < 32
@@ -244,7 +248,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
244248
pmulld m%1, m%5
245249
pmulld m%2, m%5
246250
%endif
247-
%if %9 & 4
251+
%if %9 & 2
248252
psubd m%4, m%6, m%4
249253
psubd m%2, m%4, m%2
250254
%else
@@ -253,17 +257,10 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
253257
%endif
254258
paddd m%2, m%4
255259
%endif
256-
%if %9 & 2 ; invert the upper half of dst1 before rounding
257-
vbroadcasti128 m%4, [pw_2048_m2048]
258-
psubd m%1, m%3
259-
psignd m%1, m%4
260-
paddd m%1, m%6
261-
%else
262260
%ifnum %6
263261
paddd m%1, m%6
264262
%endif
265263
psubd m%1, m%3
266-
%endif
267264
%ifnum %6
268265
psrad m%2, 12
269266
psrad m%1, 12
@@ -2957,7 +2954,7 @@ ALIGN function_align
29572954
vpbroadcastd m15, [pd_3784]
29582955
vpbroadcastd m10, [pd_1567]
29592956
ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
2960-
ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
2957+
ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
29612958
psubd m3, m1, m4 ; t10
29622959
paddd m1, m4 ; t9
29632960
psubd m4, m0, m2 ; t11a
@@ -3538,13 +3535,30 @@ ALIGN function_align
35383535
.pass1_main2:
35393536
ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
35403537
ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
3541-
psubd m4, m10, m5 ; t9 -t10
3538+
vbroadcasti128 m12, [pd_3784_m3784]
3539+
psubd m4, m10, m5
35423540
paddd m10, m5 ; t8 t11
3543-
psubd m5, m11, m6 ; t14 -t13
3541+
psignd m4, m12 ; t9 t10
3542+
psubd m5, m11, m6
35443543
paddd m11, m6 ; t15 t12
3545-
REPX {pmaxsd x, m8}, m4, m5, m10, m11
3546-
REPX {pminsd x, m9}, m4, m5, m10, m11
3547-
ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2
3544+
psignd m5, m12 ; t14 t13
3545+
vpbroadcastd m6, [pd_1567]
3546+
vpbroadcastd m13, [pd_3784]
3547+
REPX {pmaxsd x, m8}, m5, m4
3548+
REPX {pminsd x, m9}, m5, m4
3549+
pmulld m12, m5
3550+
pmulld m5, m6
3551+
vbroadcasti128 m6, [pd_1567_m1567]
3552+
pmulld m13, m4
3553+
pmulld m4, m6
3554+
REPX {pmaxsd x, m8}, m10, m11, m0, m1
3555+
REPX {pminsd x, m9}, m10, m11, m0, m1
3556+
paddd m12, m7
3557+
paddd m5, m7
3558+
paddd m4, m12
3559+
psubd m5, m13
3560+
psrad m4, 12 ; t14a t10a
3561+
psrad m5, 12 ; t9a t13a
35483562
vpbroadcastd m12, [pd_2896]
35493563
punpckhqdq m6, m11, m5
35503564
punpcklqdq m11, m4
@@ -3558,8 +3572,8 @@ ALIGN function_align
35583572
REPX {pminsd x, m9}, m5, m6
35593573
pmulld m5, m12
35603574
pmulld m6, m12
3561-
REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10
3562-
REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10
3575+
REPX {pmaxsd x, m8}, m2, m3, m11, m10
3576+
REPX {pminsd x, m9}, m2, m3, m11, m10
35633577
ret
35643578
ALIGN function_align
35653579
.pass1_main3:
@@ -5829,7 +5843,7 @@ ALIGN function_align
58295843
vpbroadcastd m15, [pd_4017]
58305844
vpbroadcastd m10, [pd_799]
58315845
ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
5832-
ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
5846+
ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
58335847
psubd m3, m0, m6 ; t19a
58345848
paddd m0, m6 ; t16a
58355849
psubd m6, m7, m1 ; t28a
@@ -5898,7 +5912,7 @@ ALIGN function_align
58985912
vpbroadcastd m15, [pd_2276]
58995913
vpbroadcastd m10, [pd_3406]
59005914
ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
5901-
ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
5915+
ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
59025916
psubd m3, m0, m6 ; t27a
59035917
paddd m0, m6 ; t24a
59045918
psubd m6, m7, m1 ; t20a
@@ -5911,8 +5925,8 @@ ALIGN function_align
59115925
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
59125926
vpbroadcastd m15, [pd_3784]
59135927
vpbroadcastd m10, [pd_1567]
5914-
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
5915-
ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
5928+
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
5929+
ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
59165930
mova m9, [r6-32*4] ; t16a
59175931
mova m10, [r6-32*3] ; t17
59185932
psubd m2, m9, m7 ; t23
@@ -7695,7 +7709,7 @@ ALIGN function_align
76957709
REPX {pmaxsd x, m12}, m8, m1, m6, m2
76967710
REPX {pminsd x, m13}, m8, m1, m6, m2
76977711
ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
7698-
ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
7712+
ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
76997713
REPX {pmaxsd x, m12}, m0, m3, m7, m4
77007714
REPX {pminsd x, m13}, m0, m3, m7, m4
77017715
vpbroadcastd m10, [r5+4*10]
@@ -7750,7 +7764,7 @@ ALIGN function_align
77507764
REPX {pmaxsd x, m12}, m8, m1, m3, m4
77517765
REPX {pminsd x, m13}, m8, m1, m3, m4
77527766
ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
7753-
ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
7767+
ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
77547768
REPX {pmaxsd x, m12}, m0, m2, m5, m7
77557769
REPX {pminsd x, m13}, m0, m5, m2, m7
77567770
psubd m6, m2, m7 ; t48a

0 commit comments

Comments
 (0)