@@ -43,10 +43,14 @@ iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
 idct4_shuf:  db  0,  1,  4,  5, 12, 13,  8,  9,  2,  3,  6,  7, 14, 15, 10, 11
 idct32_shuf: db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15

-%macro COEF_PAIR 2
+%macro COEF_PAIR 2-3 0
 pd_%1_%2: dd %1, %1, %2, %2
 %define pd_%1 (pd_%1_%2 + 4*0)
 %define pd_%2 (pd_%1_%2 + 4*2)
+%if %3
+dd -%2, -%2
+%define pd_%2_m%2 pd_%2
+%endif
 %endmacro

 COEF_PAIR  201,  995
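With the optional third argument set, COEF_PAIR also emits the negated second coefficient directly after the pair and aliases it, so one vbroadcasti128 fetches {coef, coef, -coef, -coef} per 128-bit lane. A sketch of what COEF_PAIR 2896, 1567, 1 now expands to, using only the definitions in the macro above:

    pd_2896_1567: dd  2896,  2896,  1567,  1567
                  dd -1567, -1567
    ; pd_2896       = pd_2896_1567 + 4*0 -> {2896, 2896}
    ; pd_1567       = pd_2896_1567 + 4*2 -> {1567, 1567}
    ; pd_1567_m1567 = pd_1567, so a 128-bit load reads {1567, 1567, -1567, -1567}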
@@ -56,8 +60,8 @@ COEF_PAIR 1380, 601
 COEF_PAIR 1751, 2440
 COEF_PAIR 2598, 1189
 COEF_PAIR 2751, 2106
-COEF_PAIR 2896, 1567
-COEF_PAIR 2896, 3784
+COEF_PAIR 2896, 1567, 1
+COEF_PAIR 2896, 3784, 1
 COEF_PAIR 3035, 3513
 COEF_PAIR 3166, 3920
 COEF_PAIR 3703, 3290
@@ -217,7 +221,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
 
 ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
-; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2
+; flags: 1 = packed, 2 = inv_dst2
 ; skip round/shift if rnd is not a number
 %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
 %if %8 < 32
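Flag 2 (inv_dst1) is removed below and flag 4 (inv_dst2) is renumbered to 2. As the %if %9 & 2 branch in the next hunk shows, inv_dst2 yields the negated dst2 at no extra cost by turning the final additions into subtractions, since with a = src1*coef2 and b = src2*coef1:

    ; normal:   dst2 = ( a + b + rnd) >> 12       ; paddd
    ; inv_dst2: dst2 = (-a - b + rnd) >> 12
    ;                = ((rnd - b) - a) >> 12      ; two psubd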
@@ -244,7 +248,7 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
     pmulld              m%1, m%5
     pmulld              m%2, m%5
 %endif
-%if %9 & 4
+%if %9 & 2
     psubd               m%4, m%6, m%4
     psubd               m%2, m%4, m%2
 %else
@@ -253,17 +257,10 @@ cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
 %endif
     paddd               m%2, m%4
 %endif
-%if %9 & 2 ; invert the upper half of dst1 before rounding
-    vbroadcasti128      m%4, [pw_2048_m2048]
-    psubd               m%1, m%3
-    psignd              m%1, m%4
-    paddd               m%1, m%6
-%else
 %ifnum %6
     paddd               m%1, m%6
 %endif
     psubd               m%1, m%3
-%endif
 %ifnum %6
     psrad               m%2, 12
     psrad               m%1, 12
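The removed inv_dst1 branch negated the upper half of dst1 via psignd against pw_2048_m2048 before rounding; per 128-bit lane it computed, for reference:

    ; dst1 = psignd(src1*coef1 - src2*coef2, {+2048, +2048, -2048, -2048}) + rnd
    ;        (lower pair unchanged, upper pair negated before the rounding add)

Its only caller, the 1567/3784 butterfly in .pass1_main2, is rewritten further down to fold the sign flip into the new pd_3784_m3784/pd_1567_m1567 constants, so the macro special case can go.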
@@ -2957,7 +2954,7 @@ ALIGN function_align
     vpbroadcastd        m15, [pd_3784]
     vpbroadcastd        m10, [pd_1567]
     ITX_MULSUB_2D         1, 8, 3, 9, _, 11, 10, 15
-    ITX_MULSUB_2D         6, 4, 3, 9, _, 11, 10, 15, 4
+    ITX_MULSUB_2D         6, 4, 3, 9, _, 11, 10, 15, 2
     psubd                m3, m1, m4 ; t10
     paddd                m1, m4     ; t9
     psubd                m4, m0, m2 ; t11a
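From here on the inv_dst2 call sites change mechanically from a trailing 4 to 2; the generated code is identical. The flag layout after this patch:

    ; bit 0 (1) = packed coefficient pairs
    ; bit 1 (2) = inv_dst2 (negate dst2 before rounding)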
@@ -3538,13 +3535,30 @@ ALIGN function_align
 .pass1_main2:
     ITX_MULSUB_2D        10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
     ITX_MULSUB_2D         5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
-    psubd                m4, m10, m5 ; t9  -t10
+    vbroadcasti128      m12, [pd_3784_m3784]
+    psubd                m4, m10, m5
     paddd               m10, m5      ; t8  t11
-    psubd                m5, m11, m6 ; t14 -t13
+    psignd               m4, m12     ; t9  t10
+    psubd                m5, m11, m6
     paddd               m11, m6      ; t15 t12
-    REPX  {pmaxsd x, m8}, m4, m5, m10, m11
-    REPX  {pminsd x, m9}, m4, m5, m10, m11
-    ITX_MULSUB_2D         5, 4, 6, 12, 13, 7, 1567, 3784, 2
+    psignd               m5, m12     ; t14 t13
+    vpbroadcastd         m6, [pd_1567]
+    vpbroadcastd        m13, [pd_3784]
+    REPX  {pmaxsd x, m8}, m5, m4
+    REPX  {pminsd x, m9}, m5, m4
+    pmulld              m12, m5
+    pmulld               m5, m6
+    vbroadcasti128       m6, [pd_1567_m1567]
+    pmulld              m13, m4
+    pmulld               m4, m6
+    REPX  {pmaxsd x, m8}, m10, m11, m0, m1
+    REPX  {pminsd x, m9}, m10, m11, m0, m1
+    paddd               m12, m7
+    paddd                m5, m7
+    paddd                m4, m12
+    psubd                m5, m13
+    psrad                m4, 12      ; t14a t10a
+    psrad                m5, 12      ; t9a  t13a
     vpbroadcastd        m12, [pd_2896]
     punpckhqdq           m6, m11, m5
     punpcklqdq          m11, m4
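In the rewritten .pass1_main2 the {c, c, -c, -c} vectors do double duty: psignd uses them as sign patterns to turn (t9, -t10) and (t14, -t13) into (t9, t10) and (t14, t13), and the same registers then feed pmulld, so the upper half of each lane is multiplied by -3784/-1567. My reading of the per-lane results, consistent with the t14a/t10a and t9a/t13a comments (rnd in m7):

    ; m4 lower pair: ( 1567*t9  + 3784*t14 + rnd) >> 12 = t14a
    ; m4 upper pair: (-1567*t10 - 3784*t13 + rnd) >> 12 = t10a
    ; m5 lower pair: ( 1567*t14 - 3784*t9  + rnd) >> 12 = t9a
    ; m5 upper pair: ( 1567*t13 - 3784*t10 + rnd) >> 12 = t13a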
@@ -3558,8 +3572,8 @@ ALIGN function_align
     REPX  {pminsd x, m9}, m5, m6
     pmulld               m5, m12
     pmulld               m6, m12
-    REPX  {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10
-    REPX  {pminsd x, m9}, m0, m1, m2, m3, m11, m10
+    REPX  {pmaxsd x, m8}, m2, m3, m11, m10
+    REPX  {pminsd x, m9}, m2, m3, m11, m10
     ret
 ALIGN function_align
 .pass1_main3:
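m0 and m1 drop out of these clamp lists because the rewritten .pass1_main2 already saturates them together with m10/m11 via the REPX pmaxsd/pminsd pair added above.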
@@ -5829,7 +5843,7 @@ ALIGN function_align
     vpbroadcastd        m15, [pd_4017]
     vpbroadcastd        m10, [pd_799]
     ITX_MULSUB_2D         5, 8, 3, 9, _, 11, 10, 15    ; t17a, t30a
-    ITX_MULSUB_2D         2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+    ITX_MULSUB_2D         2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
     psubd                m3, m0, m6 ; t19a
     paddd                m0, m6     ; t16a
     psubd                m6, m7, m1 ; t28a
@@ -5898,7 +5912,7 @@ ALIGN function_align
     vpbroadcastd        m15, [pd_2276]
     vpbroadcastd        m10, [pd_3406]
     ITX_MULSUB_2D         4, 2, 3, 9, _, 11, 10, 15    ; t21a, t26a
-    ITX_MULSUB_2D         8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+    ITX_MULSUB_2D         8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
     psubd                m3, m0, m6 ; t27a
     paddd                m0, m6     ; t24a
     psubd                m6, m7, m1 ; t20a
@@ -5911,8 +5925,8 @@ ALIGN function_align
     REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
     vpbroadcastd        m15, [pd_3784]
     vpbroadcastd        m10, [pd_1567]
-    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
-    ITX_MULSUB_2D         3, 6, 2, 9, _, 11, 10, 15, 4 ; t27,  t20
+    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
+    ITX_MULSUB_2D         3, 6, 2, 9, _, 11, 10, 15, 2 ; t27,  t20
     mova                 m9, [r6-32*4] ; t16a
     mova                m10, [r6-32*3] ; t17
     psubd                m2, m9, m7    ; t23
@@ -7695,7 +7709,7 @@ ALIGN function_align
     REPX {pmaxsd x, m12}, m8, m1, m6, m2
     REPX {pminsd x, m13}, m8, m1, m6, m2
     ITX_MULSUB_2D         1, 8, 5, 9, _, 11, 10, 15    ; t33a, t62a
-    ITX_MULSUB_2D         2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+    ITX_MULSUB_2D         2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
     REPX {pmaxsd x, m12}, m0, m3, m7, m4
     REPX {pminsd x, m13}, m0, m3, m7, m4
     vpbroadcastd        m10, [r5+4*10]
@@ -7750,7 +7764,7 @@ ALIGN function_align
     REPX {pmaxsd x, m12}, m8, m1, m3, m4
     REPX {pminsd x, m13}, m8, m1, m3, m4
     ITX_MULSUB_2D         1, 8, 6, 9, _, 11, 10, 15    ; t39a, t56a
-    ITX_MULSUB_2D         4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+    ITX_MULSUB_2D         4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
     REPX {pmaxsd x, m12}, m0, m2, m5, m7
     REPX {pminsd x, m13}, m0, m5, m2, m7
     psubd                m6, m2, m7 ; t48a
psubd m6 , m2 , m7 ; t48a
0 commit comments