diff --git a/src/Makefile b/src/Makefile index b49d27e05ff285..38e63181e479c0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -123,9 +123,9 @@ UV_HEADERS += uv/*.h endif PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifneq (${MMTK_PLAN},None) - PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-mmtk.h) + PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-mmtk.h gc-wb-mmtk.h) else - PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-stock.h) + PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,gc-tls-stock.h gc-wb-stock.h) endif ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) diff --git a/src/gc-wb-mmtk.h b/src/gc-wb-mmtk.h new file mode 100644 index 00000000000000..e433067b6dd787 --- /dev/null +++ b/src/gc-wb-mmtk.h @@ -0,0 +1,34 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +/* + write barriers which should be inlined by the compiler +*/ + +#ifndef JL_GC_WB_H +#define JL_GC_WB_H + +#ifdef __cplusplus +extern "C" { +#endif + +// GC write barriers + +// TODO: implement these functions for MMTk +STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ +} + +STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* +{ +} + +STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT +{ +} + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gc-wb-stock.h b/src/gc-wb-stock.h new file mode 100644 index 00000000000000..2f91dc17c660e6 --- /dev/null +++ b/src/gc-wb-stock.h @@ -0,0 +1,50 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +/* + write barriers which should be inlined by the compiler +*/ + +#ifndef JL_GC_WB_H +#define JL_GC_WB_H + +#ifdef __cplusplus +extern "C" { +#endif + +// GC write barriers + +STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + // parent and ptr isa jl_value_t* + if (__unlikely(jl_astaggedvalue(parent)->bits.gc == 3 /* GC_OLD_MARKED */ && // parent is old and not in remset + (jl_astaggedvalue(ptr)->bits.gc & 1 /* GC_MARKED */) == 0)) // ptr is young + jl_gc_queue_root((jl_value_t*)parent); +} + +STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* +{ + // if ptr is old + if (__unlikely(jl_astaggedvalue(ptr)->bits.gc == 3 /* GC_OLD_MARKED */)) { + jl_gc_queue_root((jl_value_t*)ptr); + } +} + +STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT +{ + // 3 == GC_OLD_MARKED + // ptr is an immutable object + if (__likely(jl_astaggedvalue(parent)->bits.gc != 3)) + return; // parent is young or in remset + if (__likely(jl_astaggedvalue(ptr)->bits.gc == 3)) + return; // ptr is old and not in remset (thus it does not point to young) + jl_datatype_t *dt = (jl_datatype_t*)jl_typeof(ptr); + const jl_datatype_layout_t *ly = dt->layout; + if (ly->npointers) + jl_gc_queue_multiroot((jl_value_t*)parent, ptr, dt); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/julia.h b/src/julia.h index a80a69049ccb2e..ae4214d57bd90e 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1137,37 +1137,11 @@ JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz); // thread-local allocator of the current thread. JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value); -// GC write barriers - -STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - // parent and ptr isa jl_value_t* - if (__unlikely(jl_astaggedvalue(parent)->bits.gc == 3 /* GC_OLD_MARKED */ && // parent is old and not in remset - (jl_astaggedvalue(ptr)->bits.gc & 1 /* GC_MARKED */) == 0)) // ptr is young - jl_gc_queue_root((jl_value_t*)parent); -} - -STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* -{ - // if ptr is old - if (__unlikely(jl_astaggedvalue(ptr)->bits.gc == 3 /* GC_OLD_MARKED */)) { - jl_gc_queue_root((jl_value_t*)ptr); - } -} - -STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT -{ - // 3 == GC_OLD_MARKED - // ptr is an immutable object - if (__likely(jl_astaggedvalue(parent)->bits.gc != 3)) - return; // parent is young or in remset - if (__likely(jl_astaggedvalue(ptr)->bits.gc == 3)) - return; // ptr is old and not in remset (thus it does not point to young) - jl_datatype_t *dt = (jl_datatype_t*)jl_typeof(ptr); - const jl_datatype_layout_t *ly = dt->layout; - if (ly->npointers) - jl_gc_queue_multiroot((jl_value_t*)parent, ptr, dt); -} +#ifndef MMTK_GC +#include "gc-wb-stock.h" +#else +#include "gc-wb-mmtk.h" +#endif JL_DLLEXPORT void jl_gc_safepoint(void); JL_DLLEXPORT int jl_safepoint_suspend_thread(int tid, int waitstate); diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h index 7b2a4bb033203f..c87857d3334620 100644 --- a/src/llvm-gc-interface-passes.h +++ b/src/llvm-gc-interface-passes.h @@ -413,4 +413,31 @@ struct FinalLowerGC: private JuliaPassContext { void lowerSafepoint(CallInst *target, Function &F); }; +// The functions below are common to all GC implementations + +// Enable this optimization only on LLVM 4.0+ since this cause LLVM to optimize +// constant store loop to produce a `memset_pattern16` with a global variable +// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend. +// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled +// for NI pointers. +static SmallVector *FindRefinements(Value *V, State *S) +{ + if (!S) + return nullptr; + auto it = S->AllPtrNumbering.find(V); + if (it == S->AllPtrNumbering.end()) + return nullptr; + auto rit = S->Refinements.find(it->second); + return rit != S->Refinements.end() && !rit->second.empty() ? &rit->second : nullptr; +} + +inline bool IsPermRooted(Value *V, State *S) +{ + if (isa(V)) + return true; + if (auto *RefinePtr = FindRefinements(V, S)) + return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2; + return false; +} + #endif // LLVM_GC_PASSES_H diff --git a/src/llvm-late-gc-lowering-mmtk.cpp b/src/llvm-late-gc-lowering-mmtk.cpp index 5539c8dbcf1533..e6a499c191dd1a 100644 --- a/src/llvm-late-gc-lowering-mmtk.cpp +++ b/src/llvm-late-gc-lowering-mmtk.cpp @@ -94,3 +94,48 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) } return target; } + +// FIXME: implement this function for MMTk lowering its specific write barrier +void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector &WriteBarriers, bool *CFGModified) { + auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); + for (auto CI : WriteBarriers) { + auto parent = CI->getArgOperand(0); + if (std::all_of(CI->op_begin() + 1, CI->op_end(), + [parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) { + CI->eraseFromParent(); + continue; + } + if (CFGModified) { + *CFGModified = true; + } + + IRBuilder<> builder(CI); + builder.SetCurrentDebugLocation(CI->getDebugLoc()); + auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits"); + auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked"); + auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); + builder.SetInsertPoint(mayTrigTerm); + mayTrigTerm->getParent()->setName("may_trigger_wb"); + Value *anyChldNotMarked = NULL; + for (unsigned i = 1; i < CI->arg_size(); i++) { + Value *child = CI->getArgOperand(i); + Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit"); + Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked"); + anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked; + } + assert(anyChldNotMarked); // handled by all_of test above + MDBuilder MDB(parent->getContext()); + SmallVector Weights{1, 9}; + auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false, + MDB.createBranchWeights(Weights)); + trigTerm->getParent()->setName("trigger_wb"); + builder.SetInsertPoint(trigTerm); + if (CI->getCalledOperand() == write_barrier_func) { + builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent); + } + else { + assert(false); + } + CI->eraseFromParent(); + } +} diff --git a/src/llvm-late-gc-lowering-stock.cpp b/src/llvm-late-gc-lowering-stock.cpp index 2a114877733961..d1894877cfe7cb 100644 --- a/src/llvm-late-gc-lowering-stock.cpp +++ b/src/llvm-late-gc-lowering-stock.cpp @@ -7,3 +7,47 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) // Do nothing for the stock GC return target; } + +void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector &WriteBarriers, bool *CFGModified) { + auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); + for (auto CI : WriteBarriers) { + auto parent = CI->getArgOperand(0); + if (std::all_of(CI->op_begin() + 1, CI->op_end(), + [parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) { + CI->eraseFromParent(); + continue; + } + if (CFGModified) { + *CFGModified = true; + } + + IRBuilder<> builder(CI); + builder.SetCurrentDebugLocation(CI->getDebugLoc()); + auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits"); + auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked"); + auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); + builder.SetInsertPoint(mayTrigTerm); + mayTrigTerm->getParent()->setName("may_trigger_wb"); + Value *anyChldNotMarked = NULL; + for (unsigned i = 1; i < CI->arg_size(); i++) { + Value *child = CI->getArgOperand(i); + Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit"); + Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked"); + anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked; + } + assert(anyChldNotMarked); // handled by all_of test above + MDBuilder MDB(parent->getContext()); + SmallVector Weights{1, 9}; + auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false, + MDB.createBranchWeights(Weights)); + trigTerm->getParent()->setName("trigger_wb"); + builder.SetInsertPoint(trigTerm); + if (CI->getCalledOperand() == write_barrier_func) { + builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent); + } + else { + assert(false); + } + CI->eraseFromParent(); + } +} diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 7d6fba65a79e74..96a8281ae6bdee 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -1868,31 +1868,6 @@ Value *LateLowerGCFrame::EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value * return load; } -// Enable this optimization only on LLVM 4.0+ since this cause LLVM to optimize -// constant store loop to produce a `memset_pattern16` with a global variable -// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend. -// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled -// for NI pointers. -static SmallVector *FindRefinements(Value *V, State *S) -{ - if (!S) - return nullptr; - auto it = S->AllPtrNumbering.find(V); - if (it == S->AllPtrNumbering.end()) - return nullptr; - auto rit = S->Refinements.find(it->second); - return rit != S->Refinements.end() && !rit->second.empty() ? &rit->second : nullptr; -} - -static bool IsPermRooted(Value *V, State *S) -{ - if (isa(V)) - return true; - if (auto *RefinePtr = FindRefinements(V, S)) - return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2; - return false; -} - static inline void UpdatePtrNumbering(Value *From, Value *To, State *S) { if (!S) @@ -1911,50 +1886,6 @@ MDNode *createMutableTBAAAccessTag(MDNode *Tag) { return MDBuilder(Tag->getContext()).createMutableTBAAAccessTag(Tag); } -void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector &WriteBarriers, bool *CFGModified) { - auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); - for (auto CI : WriteBarriers) { - auto parent = CI->getArgOperand(0); - if (std::all_of(CI->op_begin() + 1, CI->op_end(), - [parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) { - CI->eraseFromParent(); - continue; - } - if (CFGModified) { - *CFGModified = true; - } - - IRBuilder<> builder(CI); - builder.SetCurrentDebugLocation(CI->getDebugLoc()); - auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits"); - auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked"); - auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); - builder.SetInsertPoint(mayTrigTerm); - mayTrigTerm->getParent()->setName("may_trigger_wb"); - Value *anyChldNotMarked = NULL; - for (unsigned i = 1; i < CI->arg_size(); i++) { - Value *child = CI->getArgOperand(i); - Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit"); - Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked"); - anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked; - } - assert(anyChldNotMarked); // handled by all_of test above - MDBuilder MDB(parent->getContext()); - SmallVector Weights{1, 9}; - auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false, - MDB.createBranchWeights(Weights)); - trigTerm->getParent()->setName("trigger_wb"); - builder.SetInsertPoint(trigTerm); - if (CI->getCalledOperand() == write_barrier_func) { - builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent); - } - else { - assert(false); - } - CI->eraseFromParent(); - } -} - bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { auto T_int32 = Type::getInt32Ty(F.getContext()); auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());