Skip to content

Commit

Permalink
Card table as DCQ
Browse files Browse the repository at this point in the history
* gc thread cpu time tracking with gc+cpu=debug logging

* use correct young gen RS length prediction for base time calculation after finalizing young gen

* proper accounting of sweep rt continuation to base time
* merge to actual remset size calculation fixes

* fix compilation on aarch64, Windows

* fix compilation on aarch64

* remove trailing spaces from files

* initial riscv implementation
* cleanup s390/ppc
* fix missing ResizeTLABs event in jfr parallel phases test

* ppc barrier attempt (not even built)

* removed necessary C1 slow path stub generation :(

* fix riscv compilation

* fix RISCV barrier, passes javac HelloWorld and its execution

* re-add too-much-deleted stuff
* consider yield time in dirtying rate calculations
* cleanup

* clean up in refinement heuristics

* remove unused g1_young_card_val

* refactoring, renaming

* when calculating total merged cards from RS, compensate for the merge cache
* cleanup, refactoring, renaming

* refactoring of refinement/redirtying stats recording
* disable some expensive logging

* fix merge card cache compensation

* improve documentation about CardValue's LSB discriminating between clean/non-clean
* additional comments to assembly code

* fix too early clearing of refinement statistics after regular refinement completion where cards_to_cset would always be zero
* improved sizing of work for refinement table merge phase
* refactoring, removing fixmes

* aarch64 improved array post barrier

* fix aarch64 array post barrier assembler version

* fix TestGCLogMessages.java test after recent changes

* arm32 barrier

* currently yield duration only applies to sweeping

* add missing files after rebase

* regularize new_val_maybe_null
* ppc build fixes

* more ppc build fixes after bad merge

* too many fixes :(

* cleanup

* fix check for enough space to evacuate

* remove code to take expected old gen surviving words into account when determining eden length

* remove some debug code

* track safepoints in recent refinement epoch to calculate card dirtying time

* remove card_table1 member from g1barrierset

* refactoring, cleanup

* fix issues with tracking gc pauses for card dirtying

* epoch timing fixes; little cleanup

* more time accounting fixes

* some refactoring

* cleanup

* remove more debug logs
* remove parts of already pushed stuff before merge

* improve cpu time output

* synchronize accesses for prediction relevant members between refinement and young gen revise thread

* remove UseNewCode in barrier code
* some assert to check that dirtying cards is done at the right time

* comment on why the lock is needed when updating redirtying information

* remove dead code

* some cleanup in code generation

* initial version

* add store_addr == new_val check to all platforms

* remove unnecessary stuff

* fix ppc barrier code (from M. Doerr)
  * the different-registers assertion was too strong given a ppc optimization
* fix passing of new_val_may_be_null for c1 barriers

* factor out x.a = x assignments for the C1 compiler.

* remove FIXMEs

* fix s390 barrier code (from A. Kumar)

* removed empty JVMCI write_barrier_post stub because JVMCI users need more changes than that anyway
* added card table base offset constant for use with JVMCI

* add clean_card_val() for JVMCI

* fix node costs for g1 post barrier

* remove trailing whitespace in files

* update post barrier cost estimate
  • Loading branch information
Thomas Schatzl authored and tschatzl committed Dec 17, 2024
1 parent fbbc7c3 commit da1f033
Show file tree
Hide file tree
Showing 103 changed files with 2,937 additions and 4,237 deletions.
2 changes: 1 addition & 1 deletion make/autoconf/flags-cflags.m4
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,7 @@ AC_DEFUN([FLAGS_SETUP_CFLAGS_CPU_DEP],
elif test "x$FLAGS_CPU" = xarm; then
# -Wno-psabi to get rid of annoying "note: the mangling of 'va_list' has changed in GCC 4.4"
$1_CFLAGS_CPU="-fsigned-char -Wno-psabi $ARM_ARCH_TYPE_FLAGS $ARM_FLOAT_TYPE_FLAGS -DJDK_ARCH_ABI_PROP_NAME='\"\$(JDK_ARCH_ABI_PROP_NAME)\"'"
$1_CFLAGS_CPU_JVM="-DARM"
$1_CFLAGS_CPU_JVM="-DARM -Wno-attributes"
elif test "x$FLAGS_CPU_ARCH" = xppc; then
$1_CFLAGS_CPU_JVM="-minsert-sched-nops=regroup_exact -mno-multiple -mno-string"
if test "x$FLAGS_CPU" = xppc64; then
Expand Down
232 changes: 67 additions & 165 deletions src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,38 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm

void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators,
Register start, Register count, Register scratch, RegSet saved_regs) {
__ push(saved_regs, sp);
assert_different_registers(start, count, scratch);
assert_different_registers(c_rarg0, count);
__ mov(c_rarg0, start);
__ mov(c_rarg1, count);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2);
__ pop(saved_regs, sp);

Label done;
Label loop;
Label next;
const Register end = count;

__ cbz(count, done);

__ lea(end, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << LogBytesPerHeapOop
__ sub(end, end, BytesPerHeapOop); // last element address to make inclusive

__ lsr(start, start, CardTable::card_shift());
__ lsr(end, end, CardTable::card_shift());
__ sub(count, end, start); // Number of bytes to mark

__ ldr(scratch, Address(rthread, in_bytes(G1ThreadLocalData::card_table_base_offset())));
__ add(start, start, scratch);

__ bind(loop);
if (UseCondCardMark) {
__ ldrb(scratch, Address(start, count));
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards 1.
__ tbz(scratch, 0, next);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ strb(zr, Address(start, count));
__ bind(next);
__ subs(count, count, 1);
__ br(Assembler::GE, loop);

__ bind(done);
}

static void generate_queue_test_and_insertion(MacroAssembler* masm, ByteSize index_offset, ByteSize buffer_offset, Label& runtime,
Expand Down Expand Up @@ -203,45 +228,38 @@ void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm,
static void generate_post_barrier_fast_path(MacroAssembler* masm,
const Register store_addr,
const Register new_val,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
bool new_val_may_be_null) {
bool new_val_maybe_null) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2, noreg);

// Does store cross heap regions?
__ eor(tmp1, store_addr, new_val); // tmp1 := store address ^ new value
__ lsr(tmp1, tmp1, G1HeapRegion::LogOfHRGrainBytes); // tmp1 := ((store address ^ new value) >> LogOfHRGrainBytes)
__ cbz(tmp1, done);

// Crosses regions, storing null?
if (new_val_may_be_null) {
if (new_val_maybe_null) {
__ cbz(new_val, done);
}
// Storing region crossing non-null, is card young?

__ lsr(tmp1, store_addr, CardTable::card_shift()); // tmp1 := card address relative to card table base
__ load_byte_map_base(tmp2); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
__ ldrb(tmp2, Address(tmp1)); // tmp2 := card
__ cmpw(tmp2, (int)G1CardTable::g1_young_card_val()); // tmp2 := card == young_card_val?
}

static void generate_post_barrier_slow_path(MacroAssembler* masm,
const Register thread,
const Register tmp1,
const Register tmp2,
Label& done,
Label& runtime) {
__ membar(Assembler::StoreLoad); // StoreLoad membar
__ ldrb(tmp2, Address(tmp1)); // tmp2 := card
__ cbzw(tmp2, done);
// Storing a region crossing, non-null oop, card is clean.
// Dirty card and log.
STATIC_ASSERT(CardTable::dirty_card_val() == 0);
__ strb(zr, Address(tmp1)); // *(card address) := dirty_card_val
generate_queue_test_and_insertion(masm,
G1ThreadLocalData::dirty_card_queue_index_offset(),
G1ThreadLocalData::dirty_card_queue_buffer_offset(),
runtime,
thread, tmp1, tmp2, rscratch1);
__ b(done);
Address card_table_addr(thread, in_bytes(G1ThreadLocalData::card_table_base_offset()));
__ ldr(tmp2, card_table_addr); // tmp2 := card table base address
__ add(tmp1, tmp1, tmp2); // tmp1 := card address
if (UseCondCardMark) {
__ ldrb(tmp2, Address(tmp1)); // tmp2 := card
// Instead of loading clean_card_val and comparing, we exploit the fact that
// the LSB of non-clean cards is always 0, and the LSB of clean cards 1.
__ tbz(tmp2, 0, done);
}
static_assert(G1CardTable::dirty_card_val() == 0, "must be to use zr");
__ strb(zr, Address(tmp1)); // *(card address) := dirty_card_val
}

void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Expand All @@ -250,27 +268,8 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2,
rscratch1);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg
&& tmp2 != noreg, "expecting a register");

Label done;
Label runtime;

generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, done, true /* new_val_may_be_null */);
// If card is young, jump to done
__ br(Assembler::EQ, done);
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, done, runtime);

__ bind(runtime);
// save the live input values
RegSet saved = RegSet::of(store_addr);
__ push(saved, sp);
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), tmp1, thread);
__ pop(saved, sp);

generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, false /* new_val_maybe_null */);
__ bind(done);
}

Expand Down Expand Up @@ -330,38 +329,10 @@ void G1BarrierSetAssembler::g1_write_barrier_post_c2(MacroAssembler* masm,
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* stub) {
assert(thread == rthread, "must be");
assert_different_registers(store_addr, new_val, thread, tmp1, tmp2,
rscratch1);
assert(store_addr != noreg && new_val != noreg && tmp1 != noreg
&& tmp2 != noreg, "expecting a register");

stub->initialize_registers(thread, tmp1, tmp2);

bool new_val_may_be_null = (stub->barrier_data() & G1C2BarrierPostNotNull) == 0;
generate_post_barrier_fast_path(masm, store_addr, new_val, tmp1, tmp2, *stub->continuation(), new_val_may_be_null);
// If card is not young, jump to stub (slow path)
__ br(Assembler::NE, *stub->entry());

__ bind(*stub->continuation());
}

void G1BarrierSetAssembler::generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const {
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
Label runtime;
Register thread = stub->thread();
Register tmp1 = stub->tmp1(); // tmp1 holds the card address.
Register tmp2 = stub->tmp2();
assert(stub->tmp3() == noreg, "not needed in this platform");

__ bind(*stub->entry());
generate_post_barrier_slow_path(masm, thread, tmp1, tmp2, *stub->continuation(), runtime);

__ bind(runtime);
generate_c2_barrier_runtime_call(masm, stub, tmp1, CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry));
__ b(*stub->continuation());
bool new_val_maybe_null) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, new_val_maybe_null);
__ bind(done);
}

#endif // COMPILER2
Expand Down Expand Up @@ -457,20 +428,19 @@ void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrier
__ b(*stub->continuation());
}

// Emits the out-of-line C1 post-barrier stub for `stub`.
//
// The emitted code binds the stub entry, returns straight to the stub's
// continuation when the new value register is zero (null store), and otherwise
// passes the store address as parameter 0 to the C1 post-barrier runtime code
// blob, calls it, and branches back to the continuation.
//
// ce   - LIR assembler used to store the outgoing parameter.
// stub - post-barrier stub; both its addr() and new_val() operands must be
//        registers (asserted below).
void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) {
G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1();
__ bind(*stub->entry());
assert(stub->addr()->is_register(), "Precondition.");
assert(stub->new_val()->is_register(), "Precondition.");
Register new_val_reg = stub->new_val()->as_register();
// Storing null needs no post barrier: skip the runtime call entirely.
__ cbz(new_val_reg, *stub->continuation());
// Parameter slot 0 carries the store address to the runtime code blob.
ce->store_parameter(stub->addr()->as_pointer_register(), 0);
__ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin()));
__ b(*stub->continuation());
}

#undef __

// Emits the G1 post-write barrier inline for C1-compiled code.
//
// Delegates to generate_post_barrier_fast_path() with a local `done` label
// that is bound immediately after the generated barrier code. The new value
// is always treated as possibly null here (last argument is `true`).
//
// masm       - assembler to emit into.
// store_addr - register holding the address being stored to.
// new_val    - register holding the value being stored.
// thread     - register holding the current thread.
// tmp1, tmp2 - temporary registers handed to the fast-path generator.
void G1BarrierSetAssembler::g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2) {
Label done;
generate_post_barrier_fast_path(masm, store_addr, new_val, thread, tmp1, tmp2, done, true /* new_val_maybe_null */);
// The `__` shorthand is #undef'ed at this point in the file, so call masm directly.
masm->bind(done);
}

#define __ sasm->

void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) {
Expand Down Expand Up @@ -522,74 +492,6 @@ void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler*
__ epilogue();
}

// Generates the C1 runtime stub "g1_post_barrier".
//
// The stub receives the store address as parameter 0. The emitted code:
//   1. computes the card index for the store address and loads the card;
//   2. returns if the card equals g1_young_card_val();
//   3. issues a StoreLoad barrier, reloads the card and returns if it is
//      already dirty (dirty_card_val() == 0);
//   4. otherwise writes the dirty value to the card and stores the card
//      address into the thread's dirty card queue buffer;
//   5. if the loaded queue index is zero, calls
//      G1BarrierSetRuntime::write_ref_field_post_entry(card_addr, thread)
//      with the call-clobbered registers saved around the call.
//
// sasm - stub assembler to emit into (`__` expands to `sasm->` here).
void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) {
__ prologue("g1_post_barrier", false);

// arg0: store_address
Address store_addr(rfp, 2*BytesPerWord);

// NOTE(review): `ct` (and `store_addr` above) are not referenced again in
// this function as shown.
BarrierSet* bs = BarrierSet::barrier_set();
CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs);
CardTable* ct = ctbs->card_table();

Label done;
Label runtime;

// At this point we know new_value is non-null and the new_value crosses regions.
// Must check to see if card is already dirty

const Register thread = rthread;

Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()));
Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()));

const Register card_offset = rscratch2;
// LR is free here, so we can use it to hold the byte_map_base.
const Register byte_map_base = lr;

assert_different_registers(card_offset, byte_map_base, rscratch1);

// card_offset := store address >> card_shift; the card is at byte_map_base + card_offset.
__ load_parameter(0, card_offset);
__ lsr(card_offset, card_offset, CardTable::card_shift());
__ load_byte_map_base(byte_map_base);
__ ldrb(rscratch1, Address(byte_map_base, card_offset));
// Young cards need no further work.
__ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val());
__ br(Assembler::EQ, done);

assert((int)CardTable::dirty_card_val() == 0, "must be 0");

// Re-read the card after a StoreLoad barrier; zero means it is already dirty.
__ membar(Assembler::StoreLoad);
__ ldrb(rscratch1, Address(byte_map_base, card_offset));
__ cbzw(rscratch1, done);

// storing region crossing non-null, card is clean.
// dirty card and log.
__ strb(zr, Address(byte_map_base, card_offset));

// Convert card offset into an address in card_addr
Register card_addr = card_offset;
__ add(card_addr, byte_map_base, card_addr);

// A zero queue index diverts to the runtime call (presumably the buffer has
// no free slot left - confirm); otherwise step the index down by one word.
__ ldr(rscratch1, queue_index);
__ cbz(rscratch1, runtime);
__ sub(rscratch1, rscratch1, wordSize);
__ str(rscratch1, queue_index);

// Reuse LR to hold buffer_addr
const Register buffer_addr = lr;

// Store the card address at buffer + new index.
__ ldr(buffer_addr, buffer);
__ str(card_addr, Address(buffer_addr, rscratch1));
__ b(done);

__ bind(runtime);
__ push_call_clobbered_registers();
__ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread);
__ pop_call_clobbered_registers();
__ bind(done);
__ epilogue();
}

#undef __

#endif // COMPILER1
15 changes: 8 additions & 7 deletions src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@
class LIR_Assembler;
class StubAssembler;
class G1PreBarrierStub;
class G1PostBarrierStub;
class G1PreBarrierStubC2;
class G1PostBarrierStubC2;

class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
protected:
Expand Down Expand Up @@ -65,10 +63,15 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
public:
#ifdef COMPILER1
void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub);
void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub);

void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm);
void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm);

void g1_write_barrier_post_c1(MacroAssembler* masm,
Register store_addr,
Register new_val,
Register thread,
Register tmp1,
Register tmp2);
#endif

#ifdef COMPILER2
Expand All @@ -87,9 +90,7 @@ class G1BarrierSetAssembler: public ModRefBarrierSetAssembler {
Register thread,
Register tmp1,
Register tmp2,
G1PostBarrierStubC2* c2_stub);
void generate_c2_post_barrier_stub(MacroAssembler* masm,
G1PostBarrierStubC2* stub) const;
bool new_val_maybe_null);
#endif

void load_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type,
Expand Down
6 changes: 3 additions & 3 deletions src/hotspot/cpu/aarch64/gc/g1/g1_aarch64.ad
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ static void write_barrier_post(MacroAssembler* masm,
Register new_val,
Register tmp1,
Register tmp2) {
if (!G1PostBarrierStubC2::needs_barrier(node)) {
if (!G1BarrierStubC2::needs_post_barrier(node)) {
return;
}
Assembler::InlineSkippedInstructionsCounter skip_counter(masm);
G1BarrierSetAssembler* g1_asm = static_cast<G1BarrierSetAssembler*>(BarrierSet::barrier_set()->barrier_set_assembler());
G1PostBarrierStubC2* const stub = G1PostBarrierStubC2::create(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, stub);
bool new_val_maybe_null = G1BarrierStubC2::post_new_val_maybe_null(node);
g1_asm->g1_write_barrier_post_c2(masm, store_addr, new_val, rthread, tmp1, tmp2, new_val_maybe_null);
}

%}
Expand Down
Loading

0 comments on commit da1f033

Please sign in to comment.