JBR-3715: partial cherry-pick of PR "Windows AArch64 Support"

Implemented correct r18 usage; fixed the JBR-3715 patch from V. Kempik (cherry picked from commit a08c2a7)
JetBrains · Sep 20, 2021 · e9e9816 · e9e9816
1 parent ca49a70
commit e9e9816
Show file tree

Hide file tree

Showing 16 changed files with 1,388 additions and 1,250 deletions.
diff --git a/make/hotspot/gensrc/GensrcAdlc.gmk b/make/hotspot/gensrc/GensrcAdlc.gmk
@@ -99,6 +99,16 @@ ifeq ($(call check-jvm-feature, compiler2), true)
     ADLCFLAGS += -DAIX=1
   else ifeq ($(OPENJDK_TARGET_OS), macosx)
     ADLCFLAGS += -D_ALLBSD_SOURCE=1 -D_GNU_SOURCE=1
+    ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
+      ADLCFLAGS += -DR18_RESERVED=1
+    endif
+  else ifeq ($(OPENJDK_TARGET_OS), windows)
+    ifeq ($(call isTargetCpuBits, 64), true)
+      ADLCFLAGS += -D_WIN64=1
+    endif
+    ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
+      ADLCFLAGS += -DR18_RESERVED=1
+    endif
   endif
 
   ifneq ($(OPENJDK_TARGET_OS), windows)

diff --git a/src/hotspot/cpu/aarch64/aarch64-asmtest.py b/src/hotspot/cpu/aarch64/aarch64-asmtest.py
@@ -13,6 +13,8 @@ class Register(Operand):
 
     def generate(self):
         self.number = random.randint(0, 30)
+        if self.number == 18:
+            self.number = 17
         return self
 
     def astr(self, prefix):
@@ -37,6 +39,8 @@ class GeneralRegisterOrZr(Register):
 
     def generate(self):
         self.number = random.randint(0, 31)
+        if self.number == 18:
+            self.number = 16
         return self
 
     def astr(self, prefix = ""):
@@ -54,6 +58,8 @@ def __str__(self):
 class GeneralRegisterOrSp(Register):
     def generate(self):
         self.number = random.randint(0, 31)
+        if self.number == 18:
+            self.number = 15
         return self
 
     def astr(self, prefix = ""):

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -110,8 +110,8 @@ reg_def R16     ( SOC, SOC, Op_RegI, 16, r16->as_VMReg()        );
 reg_def R16_H   ( SOC, SOC, Op_RegI, 16, r16->as_VMReg()->next());
 reg_def R17     ( SOC, SOC, Op_RegI, 17, r17->as_VMReg()        );
 reg_def R17_H   ( SOC, SOC, Op_RegI, 17, r17->as_VMReg()->next());
-reg_def R18     ( SOC, SOC, Op_RegI, 18, r18->as_VMReg()        );
-reg_def R18_H   ( SOC, SOC, Op_RegI, 18, r18->as_VMReg()->next());
+reg_def R18     ( SOC, SOC, Op_RegI, 18, r18_reserved->as_VMReg()        );
+reg_def R18_H   ( SOC, SOC, Op_RegI, 18, r18_reserved->as_VMReg()->next());
 reg_def R19     ( SOC, SOE, Op_RegI, 19, r19->as_VMReg()        );
 reg_def R19_H   ( SOC, SOE, Op_RegI, 19, r19->as_VMReg()->next());
 reg_def R20     ( SOC, SOE, Op_RegI, 20, r20->as_VMReg()        ); // caller esp
@@ -352,7 +352,6 @@ alloc_class chunk0(
     R15, R15_H,
     R16, R16_H,
     R17, R17_H,
-    R18, R18_H,
 
     // arg registers
     R0, R0_H,
@@ -375,7 +374,7 @@ alloc_class chunk0(
     R26, R26_H,
 
     // non-allocatable registers
-
+    R18, R18_H, // platform
     R27, R27_H, // heapbase
     R28, R28_H, // thread
     R29, R29_H, // fp
@@ -533,7 +532,10 @@ reg_class no_special_reg32_no_fp(
     R15,
     R16,
     R17,
+#ifndef R18_RESERVED
+    // See comment in register_aarch64.hpp
     R18,
+#endif
     R19,
     R20,
     R21,
@@ -566,7 +568,10 @@ reg_class no_special_reg32_with_fp(
     R15,
     R16,
     R17,
+#ifndef R18_RESERVED
+    // See comment in register_aarch64.hpp
     R18,
+#endif
     R19,
     R20,
     R21,
@@ -602,7 +607,10 @@ reg_class no_special_reg_no_fp(
     R15, R15_H,
     R16, R16_H,
     R17, R17_H,
+#ifndef R18_RESERVED
+    // See comment in register_aarch64.hpp
     R18, R18_H,
+#endif
     R19, R19_H,
     R20, R20_H,
     R21, R21_H,
@@ -635,7 +643,10 @@ reg_class no_special_reg_with_fp(
     R15, R15_H,
     R16, R16_H,
     R17, R17_H,
+#ifndef R18_RESERVED
+    // See comment in register_aarch64.hpp
     R18, R18_H,
+#endif
     R19, R19_H,
     R20, R20_H,
     R21, R21_H,
@@ -775,7 +786,10 @@ reg_class no_special_ptr_reg(
     R15, R15_H,
     R16, R16_H,
     R17, R17_H,
+#ifndef R18_RESERVED
+    // See comment in register_aarch64.hpp
     R18, R18_H,
+#endif
     R19, R19_H,
     R20, R20_H,
     R21, R21_H,
@@ -5419,7 +5433,7 @@ pipeline %{
 attributes %{
   // ARM instructions are of fixed length
   fixed_size_instructions;        // Fixed size instructions TODO does
-  max_instructions_per_bundle = 2;   // A53 = 2, A57 = 4
+  max_instructions_per_bundle = 4;   // A53 = 2, A57 = 4
   // ARM instructions come in 32-bit word units
   instruction_unit_size = 4;         // An instruction is 4 bytes long
   instruction_fetch_unit_size = 64;  // The processor fetches one line

diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp
diff --git a/src/hotspot/cpu/aarch64/c1_Defs_aarch64.hpp b/src/hotspot/cpu/aarch64/c1_Defs_aarch64.hpp
@@ -44,13 +44,13 @@ enum {
   pd_nof_cpu_regs_frame_map = RegisterImpl::number_of_registers,       // number of registers used during code emission
   pd_nof_fpu_regs_frame_map = FloatRegisterImpl::number_of_registers,  // number of registers used during code emission
 
-  pd_nof_caller_save_cpu_regs_frame_map = 19 - 2,  // number of registers killed by calls
+  pd_nof_caller_save_cpu_regs_frame_map = 19 - 2 /* rscratch1 and rscratch2 */ R18_RESERVED_ONLY(- 1),  // number of registers killed by calls
   pd_nof_caller_save_fpu_regs_frame_map = 32,  // number of registers killed by calls
 
-  pd_first_callee_saved_reg = 19 - 2,
-  pd_last_callee_saved_reg = 26 - 2,
+  pd_first_callee_saved_reg = 19 - 2 /* rscratch1 and rscratch2 */ R18_RESERVED_ONLY(- 1),
+  pd_last_callee_saved_reg = 26 - 2 /* rscratch1 and rscratch2 */ R18_RESERVED_ONLY(- 1),
 
-  pd_last_allocatable_cpu_reg = 16,
+  pd_last_allocatable_cpu_reg = 16 R18_RESERVED_ONLY(- 1),
 
   pd_nof_cpu_regs_reg_alloc
     = pd_last_allocatable_cpu_reg + 1,  // number of registers that are visible to register allocator
@@ -60,9 +60,9 @@ enum {
   pd_nof_fpu_regs_linearscan = pd_nof_fpu_regs_frame_map, // number of registers visible to linear scan
   pd_nof_xmm_regs_linearscan = 0, // like sparc we don't have any of these
   pd_first_cpu_reg = 0,
-  pd_last_cpu_reg = 16,
+  pd_last_cpu_reg = 16 R18_RESERVED_ONLY(- 1),
   pd_first_byte_reg = 0,
-  pd_last_byte_reg = 16,
+  pd_last_byte_reg = 16 R18_RESERVED_ONLY(- 1),
   pd_first_fpu_reg = pd_nof_cpu_regs_frame_map,
   pd_last_fpu_reg =  pd_first_fpu_reg + 31,
 

diff --git a/src/hotspot/cpu/aarch64/c1_FpuStackSim_aarch64.cpp b/src/hotspot/cpu/aarch64/c1_FpuStackSim_aarch64.cpp
@@ -28,3 +28,4 @@
 //--------------------------------------------------------
 
 // No FPU stack on AARCH64
+#include "precompiled.hpp"
diff --git a/src/hotspot/cpu/aarch64/c1_FrameMap_aarch64.cpp b/src/hotspot/cpu/aarch64/c1_FrameMap_aarch64.cpp
@@ -181,7 +181,10 @@ void FrameMap::initialize() {
   map_register(i, r15); r15_opr = LIR_OprFact::single_cpu(i); i++;
   map_register(i, r16); r16_opr = LIR_OprFact::single_cpu(i); i++;
   map_register(i, r17); r17_opr = LIR_OprFact::single_cpu(i); i++;
-  map_register(i, r18); r18_opr = LIR_OprFact::single_cpu(i); i++;
+#ifndef R18_RESERVED
+  // See comment in register_aarch64.hpp
+  map_register(i, r18_reserved); r18_opr = LIR_OprFact::single_cpu(i); i++;
+#endif
   map_register(i, r19); r19_opr = LIR_OprFact::single_cpu(i); i++;
   map_register(i, r20); r20_opr = LIR_OprFact::single_cpu(i); i++;
   map_register(i, r21); r21_opr = LIR_OprFact::single_cpu(i); i++;
@@ -199,6 +202,11 @@ void FrameMap::initialize() {
   map_register(i, r8); r8_opr = LIR_OprFact::single_cpu(i); i++;   // rscratch1
   map_register(i, r9); r9_opr = LIR_OprFact::single_cpu(i); i++;   // rscratch2
 
+#ifdef R18_RESERVED
+  // See comment in register_aarch64.hpp
+  map_register(i, r18_reserved); r18_opr = LIR_OprFact::single_cpu(i); i++;
+#endif
+
   rscratch1_opr = r8_opr;
   rscratch2_opr = r9_opr;
   rscratch1_long_opr = LIR_OprFact::double_cpu(r8_opr->cpu_regnr(), r8_opr->cpu_regnr());
@@ -227,7 +235,10 @@ void FrameMap::initialize() {
   _caller_save_cpu_regs[13] = r15_opr;
   _caller_save_cpu_regs[14] = r16_opr;
   _caller_save_cpu_regs[15] = r17_opr;
+#ifndef R18_RESERVED
+  // See comment in register_aarch64.hpp
   _caller_save_cpu_regs[16] = r18_opr;
+#endif
 
   for (int i = 0; i < 8; i++) {
     _caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i);
@@ -253,7 +264,7 @@ void FrameMap::initialize() {
   r15_oop_opr = as_oop_opr(r15);
   r16_oop_opr = as_oop_opr(r16);
   r17_oop_opr = as_oop_opr(r17);
-  r18_oop_opr = as_oop_opr(r18);
+  r18_oop_opr = as_oop_opr(r18_reserved);
   r19_oop_opr = as_oop_opr(r19);
   r20_oop_opr = as_oop_opr(r20);
   r21_oop_opr = as_oop_opr(r21);

diff --git a/src/hotspot/cpu/aarch64/globalDefinitions_aarch64.hpp b/src/hotspot/cpu/aarch64/globalDefinitions_aarch64.hpp
@@ -53,4 +53,13 @@ const bool CCallingConventionRequiresIntsAsLongs = false;
 
 #define THREAD_LOCAL_POLL
 
+#if defined(__APPLE__) || defined(_WIN64)
+#define R18_RESERVED
+#define R18_RESERVED_ONLY(code) code
+#define NOT_R18_RESERVED(code)
+#else
+#define R18_RESERVED_ONLY(code)
+#define NOT_R18_RESERVED(code) code
+#endif
+
 #endif // CPU_AARCH64_VM_GLOBALDEFINITIONS_AARCH64_HPP
diff --git a/src/hotspot/cpu/aarch64/immediate_aarch64.cpp b/src/hotspot/cpu/aarch64/immediate_aarch64.cpp
@@ -23,6 +23,8 @@
  */
 
 #include <stdlib.h>
+#include "precompiled.hpp"
+#include "utilities/globalDefinitions.hpp"
 #include "immediate_aarch64.hpp"
 
 // there are at most 2^13 possible logical immediate encodings
@@ -243,7 +245,10 @@ int expandLogicalImmediate(u_int32_t immN, u_int32_t immr,
 
 // constructor to initialise the lookup tables
 
-static void initLITables() __attribute__ ((constructor));
+static void initLITables();
+// Use an empty struct with a constructor as MSVC doesn't support `__attribute__ ((constructor))`
+// See https://stackoverflow.com/questions/1113409/attribute-constructor-equivalent-in-vc
+static struct initLITables_t { initLITables_t(void) { initLITables(); } } _initLITables;
 static void initLITables()
 {
   li_table_entry_count = 0;

diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -2544,9 +2544,17 @@ void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
   }
 }
 
+RegSet MacroAssembler::call_clobbered_registers() {
+  RegSet regs = RegSet::range(r0, r17) - RegSet::of(rscratch1, rscratch2);
+#ifndef R18_RESERVED
+  regs += r18_reserved;
+#endif
+  return regs;
+}
+
 void MacroAssembler::push_call_clobbered_registers() {
   int step = 4 * wordSize;
-  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
+  push(call_clobbered_registers() - RegSet::of(rscratch1, rscratch2), sp);
   sub(sp, sp, step);
   mov(rscratch1, -step);
   // Push v0-v7, v16-v31.
@@ -2566,7 +2574,7 @@ void MacroAssembler::pop_call_clobbered_registers() {
           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
   }
 
-  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
+  pop(call_clobbered_registers() - RegSet::of(rscratch1, rscratch2), sp);
 }
 
 void MacroAssembler::push_CPU_state(bool save_vectors) {

diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -460,6 +460,8 @@ class MacroAssembler: public Assembler {
   void push(RegSet regs, Register stack) { if (regs.bits()) push(regs.bits(), stack); }
   void pop(RegSet regs, Register stack) { if (regs.bits()) pop(regs.bits(), stack); }
 
+  static RegSet call_clobbered_registers();
+
   // Push and pop everything that might be clobbered by a native
   // runtime call except rscratch1 and rscratch2.  (They are always
   // scratch, so we don't have to protect them.)  Only save the lower

diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp
@@ -689,7 +689,7 @@ void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2
       RECOMP_FOR1_CHECK;
   Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
       tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
-      jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
+      jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r19;
     // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
     // jx = nx - 1
     lea(twoOverPiBase, ExternalAddress(two_over_pi));
@@ -1421,6 +1421,12 @@ void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
   Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
   Register X = r0, absX = r1, n = r2, ix = r3;
   FloatRegister y0 = v4, y1 = v5;
+
+  enter();
+  // r19 is used in TemplateInterpreterGenerator::generate_math_entry
+  RegSet saved_regs = RegSet::of(r19);
+  push (saved_regs, sp);
+
     block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
       fmovd(X, v0);
       mov(rscratch2, 0x3e400000);
@@ -1438,14 +1444,14 @@ void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
       // Set last bit unconditionally to make it NaN
       orr(r10, r10, 1);
       fmovd(v0, r10);
-      ret(lr);
+      b(DONE);
     }
   block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
     bind(TINY_X);
       if (isCos) {
         fmovd(v0, 1.0);
       }
-      ret(lr);
+      b(DONE);
   }
   bind(ARG_REDUCTION); /* argument reduction needed */
     block_comment("n = __ieee754_rem_pio2(x,y);"); {
@@ -1465,7 +1471,7 @@ void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
         tbz(n, 1, DONE);
       }
       fnegd(v0, v0);
-      ret(lr);
+      b(DONE);
     bind(RETURN_SIN);
       generate_kernel_sin(y0, true, dsin_coef);
       if (isCos) {
@@ -1474,7 +1480,7 @@ void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
         tbz(n, 1, DONE);
       }
       fnegd(v0, v0);
-      ret(lr);
+      b(DONE);
     }
   bind(EARLY_CASE);
     eor(y1, T8B, y1, y1);
@@ -1484,5 +1490,7 @@ void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
       generate_kernel_sin(v0, false, dsin_coef);
     }
   bind(DONE);
+    pop(saved_regs, sp);
+    leave();
     ret(lr);
 }
diff --git a/src/hotspot/cpu/aarch64/register_aarch64.cpp b/src/hotspot/cpu/aarch64/register_aarch64.cpp
@@ -38,7 +38,7 @@ const char* RegisterImpl::name() const {
     "c_rarg0", "c_rarg1", "c_rarg2", "c_rarg3", "c_rarg4", "c_rarg5", "c_rarg6", "c_rarg7",
     "rscratch1", "rscratch2",
     "r10", "r11", "r12", "r13", "r14", "r15", "r16",
-    "r17", "r18", "r19",
+    "r17", "r18_tls", "r19",
     "resp", "rdispatch", "rbcp", "r23", "rlocals", "rmonitors", "rcpool", "rheapbase",
     "rthread", "rfp", "lr", "sp"
   };
Original file line number	Diff line number	Diff line change
Expand Up		@@ -28,3 +28,4 @@
		//--------------------------------------------------------

		// No FPU stack on AARCH64
		#include "precompiled.hpp"