
Commit d8ace2d

[X86][SDAG] Fix shamt for vXi8 shift expansion (llvm#175308)
The bailing-out logic for the 3-stage vXi8 shift expansion acted as follows:
```
if (MinLZ < 2) { conditionally shift by 4 }
if (MinLZ < 1) { conditionally shift by 2 }
conditionally shift by 1
```
When `MinLZ = 1`, the shift amount is known to be less than 4, yet only the shift-by-4 and shift-by-1 stages were executed: the unnecessary shift-by-4 stage ran while the required shift-by-2 stage was skipped. Swap the two conditions so that the shift-by-4 stage is guarded by `MinLZ < 1` and the shift-by-2 stage by `MinLZ < 2`.

Closes llvm#175303. The original reproducer is equivalent to the existing function `@shl2_v32i8`, so no new tests need to be added.
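To make the intended guard semantics concrete, here is a minimal scalar sketch of the 3-stage expansion with the fixed conditions. It assumes `MinLZ` counts known-zero bits at the top of the 3-bit shift amount, as the guards imply; the helper `expandShl` and the explicit bit tests are illustrative only (the actual lowering selects per lane via sign-bit blends rather than testing amount bits directly):

```cpp
// Scalar model of the fixed 3-stage vXi8 SHL expansion (illustration only).
#include <cassert>
#include <cstdint>

static uint8_t expandShl(uint8_t r, uint8_t amt, unsigned MinLZ) {
  // Stage 1: bit 2 of the amount can only be set when MinLZ < 1.
  if (MinLZ < 1 && (amt & 4))
    r = static_cast<uint8_t>(r << 4);
  // Stage 2: bit 1 can be set whenever MinLZ < 2.
  if (MinLZ < 2 && (amt & 2))
    r = static_cast<uint8_t>(r << 2);
  // Stage 3: bit 0 is always handled.
  if (amt & 1)
    r = static_cast<uint8_t>(r << 1);
  return r;
}

int main() {
  // MinLZ == 1 means the amount is known to be < 4; amt = 2 must take the
  // shift-by-2 stage, which the swapped (pre-fix) guards skipped.
  assert(expandShl(1, 2, /*MinLZ=*/1) == 4);
  assert(expandShl(1, 7, /*MinLZ=*/0) == 128);
}
```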
1 parent c9f13b5 commit d8ace2d

File tree

2 files changed: +32 -32 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 4 deletions
@@ -31301,14 +31301,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     Amt = DAG.getBitcast(VT, Amt);
 
     if (Opc == ISD::SHL || Opc == ISD::SRL) {
-      if (MinLZ < 2) {
+      if (MinLZ < 1) {
        // r = VSELECT(r, shift(r, 4), a);
        SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
        R = SignBitSelect(VT, Amt, M, R);
        // a += a
        Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
      }
-      if (MinLZ < 1) {
+      if (MinLZ < 2) {
        // r = VSELECT(r, shift(r, 2), a);
        SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
        R = SignBitSelect(VT, Amt, M, R);
@@ -31335,7 +31335,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     RHi = DAG.getBitcast(ExtVT, RHi);
 
     SDValue MLo, MHi;
-    if (MinLZ < 2) {
+    if (MinLZ < 1) {
       // r = VSELECT(r, shift(r, 4), a);
       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
@@ -31345,7 +31345,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
     }
-    if (MinLZ < 1) {
+    if (MinLZ < 2) {
       // r = VSELECT(r, shift(r, 2), a);
       MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
       MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);

llvm/test/CodeGen/X86/pr162812.ll

Lines changed: 28 additions & 28 deletions
@@ -241,8 +241,8 @@ define <32 x i8> @shl2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm6
 ; SSE2-NEXT: movdqa %xmm6, %xmm7
 ; SSE2-NEXT: pandn %xmm0, %xmm7
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE2-NEXT: psllw $2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; SSE2-NEXT: pand %xmm8, %xmm0
 ; SSE2-NEXT: pand %xmm6, %xmm0
 ; SSE2-NEXT: por %xmm7, %xmm0
@@ -259,7 +259,7 @@ define <32 x i8> @shl2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, %xmm5
 ; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psllw $4, %xmm1
+; SSE2-NEXT: psllw $2, %xmm1
 ; SSE2-NEXT: pand %xmm8, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: por %xmm5, %xmm1
@@ -282,8 +282,8 @@ define <32 x i8> @shl2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: pand %xmm3, %xmm6
 ; SSE42-NEXT: paddb %xmm6, %xmm6
 ; SSE42-NEXT: movdqa %xmm0, %xmm7
-; SSE42-NEXT: psllw $4, %xmm7
-; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE42-NEXT: psllw $2, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
 ; SSE42-NEXT: pand %xmm8, %xmm7
 ; SSE42-NEXT: movdqa %xmm5, %xmm0
 ; SSE42-NEXT: pblendvb %xmm0, %xmm7, %xmm2
@@ -293,7 +293,7 @@ define <32 x i8> @shl2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm2
 ; SSE42-NEXT: pand %xmm4, %xmm3
 ; SSE42-NEXT: movdqa %xmm1, %xmm5
-; SSE42-NEXT: psllw $4, %xmm5
+; SSE42-NEXT: psllw $2, %xmm5
 ; SSE42-NEXT: pand %xmm8, %xmm5
 ; SSE42-NEXT: movdqa %xmm4, %xmm0
 ; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm1
@@ -307,7 +307,7 @@ define <32 x i8> @shl2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ;
 ; AVX2-LABEL: shl2_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -318,7 +318,7 @@ define <32 x i8> @shl2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ;
 ; AVX512-LABEL: shl2_v32i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512-NEXT: vpsllw $2, %ymm0, %ymm2
 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT: vpaddb %ymm0, %ymm0, %ymm2
@@ -341,8 +341,8 @@ define <32 x i8> @lshr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm6
 ; SSE2-NEXT: movdqa %xmm6, %xmm7
 ; SSE2-NEXT: pandn %xmm0, %xmm7
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: psrlw $2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; SSE2-NEXT: pand %xmm8, %xmm0
 ; SSE2-NEXT: pand %xmm6, %xmm0
 ; SSE2-NEXT: por %xmm7, %xmm0
@@ -361,7 +361,7 @@ define <32 x i8> @lshr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, %xmm5
 ; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: psrlw $2, %xmm1
 ; SSE2-NEXT: pand %xmm8, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: por %xmm5, %xmm1
@@ -380,8 +380,8 @@ define <32 x i8> @lshr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: movdqa %xmm2, %xmm4
 ; SSE42-NEXT: movdqa %xmm0, %xmm2
 ; SSE42-NEXT: movdqa %xmm0, %xmm5
-; SSE42-NEXT: psrlw $4, %xmm5
-; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NEXT: psrlw $2, %xmm5
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; SSE42-NEXT: pand %xmm6, %xmm5
 ; SSE42-NEXT: movdqa %xmm4, %xmm0
 ; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm2
@@ -395,7 +395,7 @@ define <32 x i8> @lshr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: movdqa %xmm4, %xmm0
 ; SSE42-NEXT: pblendvb %xmm0, %xmm7, %xmm2
 ; SSE42-NEXT: movdqa %xmm1, %xmm4
-; SSE42-NEXT: psrlw $4, %xmm4
+; SSE42-NEXT: psrlw $2, %xmm4
 ; SSE42-NEXT: pand %xmm6, %xmm4
 ; SSE42-NEXT: movdqa %xmm3, %xmm0
 ; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm1
@@ -411,7 +411,7 @@ define <32 x i8> @lshr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ;
 ; AVX2-LABEL: lshr2_v32i8:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
@@ -423,7 +423,7 @@ define <32 x i8> @lshr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ;
 ; AVX512-LABEL: lshr2_v32i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlw $2, %ymm0, %ymm2
 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
 ; AVX512-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm2
@@ -449,7 +449,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtw %xmm7, %xmm8
 ; SSE2-NEXT: movdqa %xmm8, %xmm9
 ; SSE2-NEXT: pandn %xmm6, %xmm9
-; SSE2-NEXT: psraw $4, %xmm6
+; SSE2-NEXT: psraw $2, %xmm6
 ; SSE2-NEXT: pand %xmm8, %xmm6
 ; SSE2-NEXT: por %xmm9, %xmm6
 ; SSE2-NEXT: paddw %xmm7, %xmm7
@@ -467,7 +467,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtw %xmm2, %xmm7
 ; SSE2-NEXT: movdqa %xmm7, %xmm8
 ; SSE2-NEXT: pandn %xmm0, %xmm8
-; SSE2-NEXT: psraw $4, %xmm0
+; SSE2-NEXT: psraw $2, %xmm0
 ; SSE2-NEXT: pand %xmm7, %xmm0
 ; SSE2-NEXT: por %xmm8, %xmm0
 ; SSE2-NEXT: paddw %xmm2, %xmm2
@@ -487,7 +487,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtw %xmm5, %xmm6
 ; SSE2-NEXT: movdqa %xmm6, %xmm7
 ; SSE2-NEXT: pandn %xmm2, %xmm7
-; SSE2-NEXT: psraw $4, %xmm2
+; SSE2-NEXT: psraw $2, %xmm2
 ; SSE2-NEXT: pand %xmm6, %xmm2
 ; SSE2-NEXT: por %xmm7, %xmm2
 ; SSE2-NEXT: paddw %xmm5, %xmm5
@@ -505,7 +505,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm5
 ; SSE2-NEXT: movdqa %xmm5, %xmm6
 ; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: psraw $4, %xmm1
+; SSE2-NEXT: psraw $2, %xmm1
 ; SSE2-NEXT: pand %xmm5, %xmm1
 ; SSE2-NEXT: por %xmm6, %xmm1
 ; SSE2-NEXT: paddw %xmm3, %xmm3
@@ -527,7 +527,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
 ; SSE42-NEXT: movdqa %xmm6, %xmm7
-; SSE42-NEXT: psraw $4, %xmm7
+; SSE42-NEXT: psraw $2, %xmm7
 ; SSE42-NEXT: pblendvb %xmm0, %xmm7, %xmm6
 ; SSE42-NEXT: movdqa %xmm6, %xmm7
 ; SSE42-NEXT: psraw $1, %xmm7
@@ -537,7 +537,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; SSE42-NEXT: movdqa %xmm2, %xmm4
-; SSE42-NEXT: psraw $4, %xmm4
+; SSE42-NEXT: psraw $2, %xmm4
 ; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT: movdqa %xmm2, %xmm4
 ; SSE42-NEXT: psraw $1, %xmm4
@@ -549,7 +549,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
 ; SSE42-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
 ; SSE42-NEXT: movdqa %xmm4, %xmm5
-; SSE42-NEXT: psraw $4, %xmm5
+; SSE42-NEXT: psraw $2, %xmm5
 ; SSE42-NEXT: pblendvb %xmm0, %xmm5, %xmm4
 ; SSE42-NEXT: movdqa %xmm4, %xmm5
 ; SSE42-NEXT: psraw $1, %xmm5
@@ -559,7 +559,7 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: psraw $4, %xmm3
+; SSE42-NEXT: psraw $2, %xmm3
 ; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1
 ; SSE42-NEXT: movdqa %xmm1, %xmm3
 ; SSE42-NEXT: psraw $1, %xmm3
@@ -575,15 +575,15 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX2-NEXT: vpsraw $2, %ymm3, %ymm4
 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpsraw $1, %ymm3, %ymm4
 ; AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2
 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
 ; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX2-NEXT: vpsraw $2, %ymm0, %ymm3
 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX2-NEXT: vpsraw $1, %ymm0, %ymm3
 ; AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1
@@ -597,15 +597,15 @@ define <32 x i8> @ashr2_v32i8(<32 x i8> %a, <32 x i8> %mask) {
 ; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512-NEXT: vpsraw $4, %ymm3, %ymm4
+; AVX512-NEXT: vpsraw $2, %ymm3, %ymm4
 ; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
 ; AVX512-NEXT: vpsraw $1, %ymm3, %ymm4
 ; AVX512-NEXT: vpaddw %ymm2, %ymm2, %ymm2
 ; AVX512-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
 ; AVX512-NEXT: vpsrlw $8, %ymm2, %ymm2
 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512-NEXT: vpsraw $4, %ymm0, %ymm3
+; AVX512-NEXT: vpsraw $2, %ymm0, %ymm3
 ; AVX512-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpsraw $1, %ymm0, %ymm3
 ; AVX512-NEXT: vpaddw %ymm1, %ymm1, %ymm1
