Skip to content

Commit dd882a5

Browse files
committed
[GR-37306] Add AArch64 SIMD intrinsic for StringLatin1.inflate and StringUTF16.compress.
PullRequest: graal/11254
2 parents f08a573 + 33e4f67 commit dd882a5

File tree

11 files changed

+642
-57
lines changed

11 files changed

+642
-57
lines changed

compiler/src/org.graalvm.compiler.asm.aarch64/src/org/graalvm/compiler/asm/aarch64/AArch64ASIMDAssembler.java

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3036,7 +3036,7 @@ public void ushlVVV(ASIMDSize size, ElementSize eSize, Register dst, Register sr
30363036
/**
30373037
* C7.2.391 Unsigned shift left long (immediate).<br>
30383038
* <p>
3039-
* From the manual: " This instruction reads each vector element in the lower half of the source
3039+
* From the manual: "This instruction reads each vector element in the lower half of the source
30403040
* SIMD&FP register, shifts the unsigned integer value left by the specified number of bits ...
30413041
* The destination vector elements are twice as long as the source vector elements."
30423042
*
@@ -3063,12 +3063,12 @@ public void ushllVVI(ElementSize srcESize, Register dst, Register src, int shift
30633063
/**
30643064
* C7.2.391 Unsigned shift left long (immediate).<br>
30653065
* <p>
3066-
* From the manual: " This instruction reads each vector element in the upper half of the source
3066+
* From the manual: "This instruction reads each vector element in the upper half of the source
30673067
* SIMD&FP register, shifts the unsigned integer value left by the specified number of bits ...
30683068
* The destination vector elements are twice as long as the source vector elements."
30693069
*
30703070
* @param srcESize source element size. Cannot be ElementSize.DoubleWord. The destination
3071-
* element size will be double this width.
3071+
* element size will be twice this width.
30723072
* @param dst SIMD register.
30733073
* @param src SIMD register.
30743074
* @param shiftAmt shift left amount.
@@ -3189,7 +3189,8 @@ public void uzp2VVV(ASIMDSize dstSize, ElementSize eSize, Register dst, Register
31893189
* C7.2.402 Extract narrow.<br>
31903190
* <p>
31913191
* From the manual: "This instruction reads each vector element from the source SIMD&FP
3192-
* register, narrows each value to half the original width, and writes the register..."
3192+
* register, narrows each value to half the original width, and writes into the lower half of
3193+
* the destination register..."
31933194
*
31943195
* @param dstESize destination element size. Cannot be ElementSize.DoubleWord. The source
31953196
* element size is twice this width.
@@ -3204,6 +3205,26 @@ public void xtnVV(ElementSize dstESize, Register dst, Register src) {
32043205
twoRegMiscEncoding(ASIMDInstruction.XTN, false, elemSizeXX(dstESize), dst, src);
32053206
}
32063207

3208+
/**
 * C7.2.402 Extract narrow.<br>
 * <p>
 * From the manual: "This instruction reads each vector element from the source SIMD&FP
 * register, narrows each value to half the original width, and writes into the upper half of
 * the destination register..."
 * <p>
 * This is the XTN2 variant: it leaves the lower half of {@code dst} untouched, so it is
 * typically paired with a preceding XTN that fills the lower half.
 *
 * @param dstESize destination element size. Cannot be ElementSize.DoubleWord. The source
 *            element size is twice this width.
 * @param dst SIMD register.
 * @param src SIMD register.
 */
public void xtn2VV(ElementSize dstESize, Register dst, Register src) {
    assert dst.getRegisterCategory().equals(SIMD);
    assert src.getRegisterCategory().equals(SIMD);
    // narrowing from DoubleWord's double width is not encodable
    assert dstESize != ElementSize.DoubleWord;

    /*
     * Same base encoding as xtnVV; the boolean second argument (true here, false in xtnVV)
     * selects the upper-half (XTN2) form -- presumably the instruction's Q bit; confirm
     * against twoRegMiscEncoding.
     */
    twoRegMiscEncoding(ASIMDInstruction.XTN, true, elemSizeXX(dstESize), dst, src);
}
3227+
32073228
/**
32083229
* C7.2.403 Zip vectors (primary).
32093230
* <p>

compiler/src/org.graalvm.compiler.core.aarch64/src/org/graalvm/compiler/core/aarch64/AArch64LIRGenerator.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@
7171
import org.graalvm.compiler.lir.aarch64.AArch64Move.MembarOp;
7272
import org.graalvm.compiler.lir.aarch64.AArch64PauseOp;
7373
import org.graalvm.compiler.lir.aarch64.AArch64SpeculativeBarrier;
74+
import org.graalvm.compiler.lir.aarch64.AArch64StringLatin1InflateOp;
75+
import org.graalvm.compiler.lir.aarch64.AArch64StringUTF16CompressOp;
7476
import org.graalvm.compiler.lir.aarch64.AArch64ZapRegistersOp;
7577
import org.graalvm.compiler.lir.aarch64.AArch64ZapStackOp;
7678
import org.graalvm.compiler.lir.aarch64.AArch64ZeroMemoryOp;
@@ -558,6 +560,18 @@ public Variable emitEncodeArray(Value src, Value dst, Value length, CharsetName
558560
return result;
559561
}
560562

563+
@Override
564+
public void emitStringLatin1Inflate(Value src, Value dst, Value len) {
565+
append(new AArch64StringLatin1InflateOp(this, asAllocatable(src), asAllocatable(dst), asAllocatable(len)));
566+
}
567+
568+
@Override
569+
public Variable emitStringUTF16Compress(Value src, Value dst, Value len) {
570+
Variable result = newVariable(LIRKind.value(AArch64Kind.DWORD));
571+
append(new AArch64StringUTF16CompressOp(this, asAllocatable(src), asAllocatable(dst), asAllocatable(len), result));
572+
return result;
573+
}
574+
561575
@Override
562576
protected JavaConstant zapValueForKind(PlatformKind kind) {
563577
long dead = 0xDEADDEADDEADDEADL;

compiler/src/org.graalvm.compiler.hotspot.aarch64/src/org/graalvm/compiler/hotspot/aarch64/AArch64HotSpotBackendFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2013, 2022, Oracle and/or its affiliates. All rights reserved.
33
* Copyright (c) 2018, Red Hat Inc. All rights reserved.
44
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
*

compiler/src/org.graalvm.compiler.hotspot/src/org/graalvm/compiler/hotspot/meta/UnimplementedGraalIntrinsics.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -392,10 +392,6 @@ public UnimplementedGraalIntrinsics(GraalHotSpotVMConfig config, Architecture ar
392392
if (arch instanceof AArch64) {
393393
add(toBeInvestigated,
394394
"java/lang/StringCoding.hasNegatives([BII)Z",
395-
"java/lang/StringLatin1.inflate([BI[BII)V",
396-
"java/lang/StringLatin1.inflate([BI[CII)V",
397-
"java/lang/StringUTF16.compress([BI[BII)I",
398-
"java/lang/StringUTF16.compress([CI[BII)I",
399395
"java/lang/Thread.onSpinWait()V",
400396
"jdk/internal/util/ArraysSupport.vectorizedMismatch(Ljava/lang/Object;JLjava/lang/Object;JII)I");
401397
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
/*
2+
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
3+
* Copyright (c) 2022, Arm Limited. All rights reserved.
4+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5+
*
6+
* This code is free software; you can redistribute it and/or modify it
7+
* under the terms of the GNU General Public License version 2 only, as
8+
* published by the Free Software Foundation. Oracle designates this
9+
* particular file as subject to the "Classpath" exception as provided
10+
* by Oracle in the LICENSE file that accompanied this code.
11+
*
12+
* This code is distributed in the hope that it will be useful, but WITHOUT
13+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15+
* version 2 for more details (a copy is included in the LICENSE file that
16+
* accompanied this code).
17+
*
18+
* You should have received a copy of the GNU General Public License version
19+
* 2 along with this work; if not, write to the Free Software Foundation,
20+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21+
*
22+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
23+
* or visit www.oracle.com if you need additional information or have any
24+
* questions.
25+
*/
26+
package org.graalvm.compiler.lir.aarch64;
27+
28+
import static jdk.vm.ci.aarch64.AArch64.SIMD;
29+
import static jdk.vm.ci.code.ValueUtil.asRegister;
30+
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
31+
32+
import org.graalvm.compiler.asm.Label;
33+
import org.graalvm.compiler.asm.aarch64.AArch64ASIMDAssembler;
34+
import org.graalvm.compiler.asm.aarch64.AArch64Address;
35+
import org.graalvm.compiler.asm.aarch64.AArch64Assembler;
36+
import org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler;
37+
import org.graalvm.compiler.core.common.LIRKind;
38+
import org.graalvm.compiler.lir.LIRInstructionClass;
39+
import org.graalvm.compiler.lir.Opcode;
40+
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
41+
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
42+
43+
import jdk.vm.ci.aarch64.AArch64Kind;
44+
import jdk.vm.ci.code.Register;
45+
import jdk.vm.ci.meta.AllocatableValue;
46+
47+
/**
 * Inflates a sequence of Latin-1 bytes into UTF-16 chars: each source byte is zero-extended to a
 * 16-bit halfword and stored to the destination. Lengths below {@link #CHUNK_ELEMENT_COUNT} use a
 * scalar byte-at-a-time loop; otherwise a SIMD loop widens 16 bytes per iteration.
 */
@Opcode("AArch64_STRING_INFLATE")
public final class AArch64StringLatin1InflateOp extends AArch64LIRInstruction {
    public static final LIRInstructionClass<AArch64StringLatin1InflateOp> TYPE = LIRInstructionClass.create(AArch64StringLatin1InflateOp.class);

    // Number of source bytes processed per SIMD iteration (one 128-bit vector of bytes).
    private static final int CHUNK_ELEMENT_COUNT = 16;

    // Element count to inflate; only read at the start of emitCode, so @Use suffices.
    @Use({REG}) protected AllocatableValue len;
    // Source/destination base addresses; @Alive because they must stay valid while temps are live.
    @Alive({REG}) protected AllocatableValue src;
    @Alive({REG}) protected AllocatableValue dst;
    @Temp({REG}) protected AllocatableValue temp1;
    @Temp({REG}) protected AllocatableValue temp2;
    @Temp({REG}) protected AllocatableValue temp3;
    @Temp({REG}) protected AllocatableValue vectorTemp1;
    @Temp({REG}) protected AllocatableValue vectorTemp2;

    /**
     * @param src pointer to the first source byte (QWORD).
     * @param dst pointer to the first destination char (QWORD).
     * @param len number of elements to inflate (DWORD, assumed non-negative).
     */
    public AArch64StringLatin1InflateOp(LIRGeneratorTool tool, AllocatableValue src, AllocatableValue dst, AllocatableValue len) {
        super(TYPE);
        assert len.getPlatformKind().equals(AArch64Kind.DWORD) : len;
        assert src.getPlatformKind().equals(AArch64Kind.QWORD) : src;
        assert dst.getPlatformKind().equals(AArch64Kind.QWORD) : dst;

        this.len = len;
        this.src = src;
        this.dst = dst;
        LIRKind archWordKind = LIRKind.value(AArch64Kind.QWORD);
        temp1 = tool.newVariable(archWordKind);
        temp2 = tool.newVariable(archWordKind);
        temp3 = tool.newVariable(archWordKind);
        // Full-width SIMD temporaries for the vectorized loop.
        LIRKind vectorKind = LIRKind.value(tool.target().arch.getLargestStorableKind(SIMD));
        vectorTemp1 = tool.newVariable(vectorKind);
        vectorTemp2 = tool.newVariable(vectorKind);
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
        Label simdImpl = new Label();
        Label done = new Label();

        Register length = asRegister(temp1);
        Register srcAddress = asRegister(temp2);
        Register destAddress = asRegister(temp3);

        // return immediately if length is zero
        masm.cbz(32, asRegister(len), done);

        /*
         * Sign-extend length. Note length is guaranteed to be a non-negative value, so this is
         * equivalent to zero-extending length.
         */
        masm.sxt(64, 32, length, asRegister(len));

        // Work on copies so the @Alive src/dst operands are never clobbered.
        masm.mov(64, srcAddress, asRegister(src));
        masm.mov(64, destAddress, asRegister(dst));

        // Dispatch: lengths >= CHUNK_ELEMENT_COUNT take the SIMD path, shorter ones the scalar path.
        masm.compare(64, length, CHUNK_ELEMENT_COUNT);
        masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, simdImpl);

        emitScalar(masm, srcAddress, destAddress, length);
        masm.jmp(done);

        masm.bind(simdImpl);
        emitSIMD(masm, srcAddress, destAddress, length);

        masm.bind(done);
    }

    /**
     * Byte-at-a-time inflation: load one byte (zero-extended), store one halfword, decrement
     * count. Precondition (established by emitCode): count >= 1.
     */
    private static void emitScalar(AArch64MacroAssembler masm, Register srcAddress, Register destAddress, Register count) {
        Label loop = new Label();

        try (AArch64MacroAssembler.ScratchRegister scratchReg1 = masm.getScratchRegister()) {
            Register val = scratchReg1.getRegister();

            masm.align(AArch64MacroAssembler.PREFERRED_LOOP_ALIGNMENT);
            masm.bind(loop);
            // ldr zero-extends val to 64 bits
            masm.ldr(8, val, AArch64Address.createImmediateAddress(8, AArch64Address.AddressingMode.IMMEDIATE_POST_INDEXED, srcAddress, 1));
            masm.str(16, val, AArch64Address.createImmediateAddress(16, AArch64Address.AddressingMode.IMMEDIATE_POST_INDEXED, destAddress, 2));
            masm.subs(64, count, count, 1);
            masm.branchConditionally(AArch64Assembler.ConditionFlag.GT, loop);
        }
    }

    /**
     * Vectorized inflation: each iteration loads 16 source bytes, widens them to 16 halfwords
     * across two vector registers, and stores 32 destination bytes. The final partial chunk is
     * handled by re-running the loop over the last 16 source bytes, deliberately re-inflating
     * some already-processed elements (safe because the operation is idempotent per element).
     * Precondition: length >= CHUNK_ELEMENT_COUNT.
     */
    private void emitSIMD(AArch64MacroAssembler masm, Register srcChunkAddress, Register destChunkAddress, Register length) {
        Register destLowV = asRegister(vectorTemp1);
        Register destHighV = asRegister(vectorTemp2);

        Label simdLoop = new Label();
        Label done = new Label();

        try (AArch64MacroAssembler.ScratchRegister scratchRegister1 = masm.getScratchRegister(); AArch64MacroAssembler.ScratchRegister scratchRegister2 = masm.getScratchRegister()) {
            Register endOfSrcAddress = scratchRegister1.getRegister();
            Register lastChunkAddress = scratchRegister2.getRegister();

            // endOfSrcAddress = src + length; lastChunkAddress = start of the final 16-byte chunk.
            masm.add(64, endOfSrcAddress, srcChunkAddress, length);
            masm.sub(64, lastChunkAddress, endOfSrcAddress, CHUNK_ELEMENT_COUNT);

            masm.align(AArch64MacroAssembler.PREFERRED_LOOP_ALIGNMENT);
            masm.bind(simdLoop);
            // load elements
            masm.fldr(128, destLowV, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_POST_INDEXED, srcChunkAddress, CHUNK_ELEMENT_COUNT));
            // split elements across 2 registers and inflate
            // NOTE: uxtl2 must run first -- it reads destLowV's upper half, which the
            // following uxtl (dst == src == destLowV) would otherwise destroy.
            masm.neon.uxtl2VV(AArch64ASIMDAssembler.ElementSize.Byte, destHighV, destLowV);
            masm.neon.uxtlVV(AArch64ASIMDAssembler.ElementSize.Byte, destLowV, destLowV);
            // store inflated elements
            masm.fstp(128, destLowV, destHighV, AArch64Address.createImmediateAddress(128, AArch64Address.AddressingMode.IMMEDIATE_PAIR_POST_INDEXED, destChunkAddress, CHUNK_ELEMENT_COUNT * 2));
            masm.cmp(64, srcChunkAddress, lastChunkAddress);
            masm.branchConditionally(AArch64Assembler.ConditionFlag.LO, simdLoop);

            /*
             * Process the last chunk. Move the source position back to the last chunk, 16 bytes
             * before the end of the input array. Move the destination position back twice the
             * movement of source position.
             */
            masm.cmp(64, srcChunkAddress, endOfSrcAddress);
            masm.branchConditionally(AArch64Assembler.ConditionFlag.HS, done);
            // srcChunkAddress temporarily holds the overshoot (srcChunkAddress - lastChunkAddress);
            // rewind dest by twice that amount (chars are 2 bytes), then restart at lastChunkAddress.
            masm.sub(64, srcChunkAddress, srcChunkAddress, lastChunkAddress);
            masm.sub(64, destChunkAddress, destChunkAddress, srcChunkAddress, AArch64Assembler.ShiftType.LSL, 1);
            masm.mov(64, srcChunkAddress, lastChunkAddress);
            masm.jmp(simdLoop);

            masm.bind(done);
        }
    }
}

0 commit comments

Comments
 (0)