Commit 0090789

Handle atomic accesses in ConstantFieldPropagation (WebAssembly#7159)
Sequentially consistent gets that are optimized out need seqcst fences inserted in their place to preserve their effect on the global ordering of sequentially consistent operations. In principle, acquire gets could be optimized similarly, with an acquire fence in their place, but acquire fences synchronize more strongly than acquire gets, so this could have a negative performance impact. For now, inhibit the optimization of acquire gets.
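
To make the transformation concrete, here is a hand-written sketch in WebAssembly text format (not taken from this commit; the type $T, field 0, and local $ref are placeholder names), assuming field 0 of $T is known to always hold the constant 0:

;; Before: a sequentially consistent atomic get (seqcst is the default order
;; when none is written).
(drop
  (struct.atomic.get $T 0
    (local.get $ref)
  )
)

;; After: the load itself is gone. The ref.as_non_null preserves a possible
;; trap on a null reference, and the seqcst fence preserves the get's place
;; in the global order of sequentially consistent operations.
(drop
  (block (result i32)
    (drop
      (ref.as_non_null
        (local.get $ref)
      )
    )
    (atomic.fence)
    (i32.const 0)
  )
)

An acquire get at the same spot, written (struct.atomic.get acqrel $T 0 ...), is left alone: the only replacement that would preserve its ordering is an acquire fence, which synchronizes more strongly than the get it replaces.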
1 parent c744bd1 commit 0090789

2 files changed: +150 −8 lines changed

src/passes/ConstantFieldPropagation.cpp

Lines changed: 27 additions & 8 deletions
@@ -139,18 +139,28 @@ struct FunctionOptimizer : public WalkerPass<PostWalker<FunctionOptimizer>> {
     if (!info.hasNoted()) {
       // This field is never written at all. That means that we do not even
       // construct any data of this type, and so it is a logic error to reach
-      // this location in the code. (Unless we are in an open-world
-      // situation, which we assume we are not in.) Replace this get with a
-      // trap. Note that we do not need to care about the nullability of the
-      // reference, as if it should have trapped, we are replacing it with
-      // another trap, which we allow to reorder (but we do need to care about
-      // side effects in the reference, so keep it around).
+      // this location in the code. (Unless we are in an open-world situation,
+      // which we assume we are not in.) Replace this get with a trap. Note that
+      // we do not need to care about the nullability of the reference, as if it
+      // should have trapped, we are replacing it with another trap, which we
+      // allow to reorder (but we do need to care about side effects in the
+      // reference, so keep it around). We also do not need to care about
+      // synchronization since trapping accesses do not synchronize with other
+      // accesses.
       replaceCurrent(builder.makeSequence(builder.makeDrop(curr->ref),
                                           builder.makeUnreachable()));
       changed = true;
       return;
     }
 
+    if (curr->order == MemoryOrder::AcqRel) {
+      // Removing an acquire get and preserving its synchronization properties
+      // would require inserting an acquire fence, but the fence would have
+      // stronger synchronization properties so might be more expensive.
+      // Instead, just skip the optimization.
+      return;
+    }
+
     // If the value is not a constant, then it is unknown and we must give up
     // on simply applying a constant. However, we can try to use a ref.test, if
     // that is allowed.
@@ -166,8 +176,17 @@ struct FunctionOptimizer : public WalkerPass<PostWalker<FunctionOptimizer>> {
     // constant value. (Leave it to further optimizations to get rid of the
     // ref.)
     auto* value = makeExpression(info, heapType, curr);
-    replaceCurrent(builder.makeSequence(
-      builder.makeDrop(builder.makeRefAs(RefAsNonNull, curr->ref)), value));
+    auto* replacement = builder.blockify(
+      builder.makeDrop(builder.makeRefAs(RefAsNonNull, curr->ref)));
+    // If this get is sequentially consistent, then it synchronizes with other
+    // threads at least by participating in the global order of sequentially
+    // consistent operations. Preserve that effect by replacing the access with
+    // a fence.
+    assert(curr->order != MemoryOrder::AcqRel);
+    if (curr->order == MemoryOrder::SeqCst) {
+      replacement = builder.blockify(replacement, builder.makeAtomicFence());
+    }
+    replaceCurrent(builder.blockify(replacement, value));
     changed = true;
   }
 
test/lit/passes/cfp.wast

Lines changed: 123 additions & 0 deletions
@@ -2830,3 +2830,126 @@
   )
  )
 )
+
+;; Atomic accesses require special handling.
+(module
+  (rec
+    ;; CHECK:      (rec
+    ;; CHECK-NEXT:  (type $shared (shared (struct (field (mut i32)))))
+    (type $shared (shared (struct (mut i32))))
+    ;; CHECK:       (type $unwritten (shared (struct (field (mut i32)))))
+    (type $unwritten (shared (struct (mut i32))))
+  )
+
+  ;; CHECK:      (type $2 (func))
+
+  ;; CHECK:      (type $3 (func (param (ref $shared))))
+
+  ;; CHECK:      (type $4 (func (param (ref $unwritten))))
+
+  ;; CHECK:      (func $init (type $2)
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (struct.new_default $shared)
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT: )
+  (func $init
+    (drop
+      (struct.new_default $shared)
+    )
+  )
+
+  ;; CHECK:      (func $gets (type $3) (param $0 (ref $shared))
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (block (result i32)
+  ;; CHECK-NEXT:    (drop
+  ;; CHECK-NEXT:     (ref.as_non_null
+  ;; CHECK-NEXT:      (local.get $0)
+  ;; CHECK-NEXT:     )
+  ;; CHECK-NEXT:    )
+  ;; CHECK-NEXT:    (i32.const 0)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (struct.atomic.get acqrel $shared 0
+  ;; CHECK-NEXT:    (local.get $0)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (block (result i32)
+  ;; CHECK-NEXT:    (drop
+  ;; CHECK-NEXT:     (ref.as_non_null
+  ;; CHECK-NEXT:      (local.get $0)
+  ;; CHECK-NEXT:     )
+  ;; CHECK-NEXT:    )
+  ;; CHECK-NEXT:    (atomic.fence)
+  ;; CHECK-NEXT:    (i32.const 0)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT: )
+  (func $gets (param (ref $shared))
+    (drop
+      (struct.get $shared 0
+        (local.get 0)
+      )
+    )
+    (drop
+      ;; This is not optimized because we wouldn't want to replace it with a
+      ;; stronger acquire fence.
+      (struct.atomic.get acqrel $shared 0
+        (local.get 0)
+      )
+    )
+    (drop
+      ;; This can be optimized, but requires a seqcst fence.
+      (struct.atomic.get $shared 0
+        (local.get 0)
+      )
+    )
+  )
+
+  ;; CHECK:      (func $traps (type $4) (param $0 (ref $unwritten))
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (block
+  ;; CHECK-NEXT:    (drop
+  ;; CHECK-NEXT:     (local.get $0)
+  ;; CHECK-NEXT:    )
+  ;; CHECK-NEXT:    (unreachable)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (block
+  ;; CHECK-NEXT:    (drop
+  ;; CHECK-NEXT:     (local.get $0)
+  ;; CHECK-NEXT:    )
+  ;; CHECK-NEXT:    (unreachable)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (block
+  ;; CHECK-NEXT:    (drop
+  ;; CHECK-NEXT:     (local.get $0)
+  ;; CHECK-NEXT:    )
+  ;; CHECK-NEXT:    (unreachable)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT: )
+  (func $traps (param (ref $unwritten))
+    ;; These are all optimizable because they are known to trap. No fences are
+    ;; necessary.
+    (drop
+      (struct.get $unwritten 0
+        (local.get 0)
+      )
+    )
+    (drop
+      (struct.atomic.get acqrel $unwritten 0
+        (local.get 0)
+      )
+    )
+    (drop
+      (struct.atomic.get $unwritten 0
+        (local.get 0)
+      )
+    )
+  )
+)
