[Codegen][GPU] Sink out shared memory and barriers in vector distribution (#15496)
This handles shared memory allocations as well as barriers in vector
distribution patterns by hoisting workgroup-scoped allocations out of
warp_execute_on_lane_0 regions and sinking trailing barriers below
them. Because the upstream patterns take no position on the
synchronization primitives used alongside them, we handle this in
IREE.
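For illustration, a minimal before/after sketch of the intermediate IR
once the pass has wrapped the body in a warp op (the names %laneid,
%src, %c0, and %f0 are assumed to be defined elsewhere; this is a
sketch, not actual pass output):

  // Before: the workgroup alloc and the trailing barrier sit inside
  // the warp region, which the upstream distribution patterns do not
  // handle.
  vector.warp_execute_on_lane_0(%laneid)[32] {
    %alloc = memref.alloc() : memref<32xf32, #gpu.address_space<workgroup>>
    %v = vector.transfer_read %src[%c0], %f0 {in_bounds = [true]}
        : memref<32xf32>, vector<32xf32>
    vector.transfer_write %v, %alloc[%c0] {in_bounds = [true]}
        : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
    gpu.barrier
    vector.yield
  }

  // After: the alloc (already workgroup scoped) is hoisted above the
  // warp op and the barrier is sunk below it.
  %alloc = memref.alloc() : memref<32xf32, #gpu.address_space<workgroup>>
  vector.warp_execute_on_lane_0(%laneid)[32] {
    %v = vector.transfer_read %src[%c0], %f0 {in_bounds = [true]}
        : memref<32xf32>, vector<32xf32>
    vector.transfer_write %v, %alloc[%c0] {in_bounds = [true]}
        : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
    vector.yield
  }
  gpu.barrier

The remaining body can then be distributed to per-lane vector<1xf32>
accesses, as checked in the updated test below.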
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
index 0c61ae5..f6ad92d 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/VectorReductionToGPU.cpp
@@ -95,6 +95,13 @@
return true;
if (isUniformLoad(op))
return true;
+ // Shared memory is already scoped to the workgroup and can safely be
+ // hoisted out of the warp op.
+ if (auto allocOp = dyn_cast<memref::AllocOp>(op)) {
+ if (hasSharedMemoryAddressSpace(allocOp.getType())) {
+ return true;
+ }
+ }
return false;
};
@@ -146,6 +153,26 @@
}
};
+/// Pattern to sink `gpu.barrier` ops out of a `warp_execute_on_lane_0` op.
+class WarpOpBarrier : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
+public:
+ using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override {
+ auto yield = cast<vector::YieldOp>(
+ warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+ Operation *lastNode = yield->getPrevNode();
+ auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
+ if (!barrierOp)
+ return failure();
+
+ rewriter.setInsertionPointAfter(warpOp);
+ (void)rewriter.create<gpu::BarrierOp>(barrierOp.getLoc());
+ rewriter.eraseOp(barrierOp);
+ return success();
+ }
+};
+
static Value simpleWarpShuffleFunction(Location loc, OpBuilder &builder,
Value val, Value srcIdx,
int64_t warpSz) {
@@ -251,6 +278,7 @@
vector::populateDistributeReduction(patterns, groupReductionFn);
vector::populateDistributeTransferWriteOpPatterns(patterns,
distributionFn);
+ patterns.add<WarpOpBarrier>(patterns.getContext(), 3);
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
index ee47cc7..51ff15a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir
@@ -213,3 +213,47 @@
// CHECK: vector.reduction
// CHECK-COUNT-5: gpu.shuffle
// CHECK: scf.yield
+
+// -----
+
+#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
+hal.executable private @shared_memory_copy {
+ hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
+ hal.executable.export @shared_memory_copy layout(#pipeline_layout) attributes {
+ workgroup_size = [32 : index, 1 : index, 1 : index]
+ }
+ builtin.module {
+ func.func @shared_memory_copy() {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %c32 = arith.constant 32 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>>
+ %2 = vector.transfer_read %0[%workgroup_id_x, %c0], %cst_0 {in_bounds = [true]} : memref<128x32xf32>, vector<32xf32>
+ vector.transfer_write %2, %alloc[%c0] {in_bounds = [true]} : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
+ gpu.barrier
+ %3 = vector.transfer_read %alloc[%c0], %cst_0 {in_bounds = [true]} : memref<32xf32, #gpu.address_space<workgroup>>, vector<32xf32>
+ vector.transfer_write %3, %1[%workgroup_id_x, %c0] {in_bounds = [true]} : vector<32xf32>, memref<128x32xf32>
+ return
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: func.func @shared_memory_copy() {
+// CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>>
+// CHECK: vector.transfer_read {{.*}} : memref<128x32xf32>, vector<1xf32>
+// CHECK: vector.transfer_write {{.*}} %[[ALLOC]]{{.*}} : vector<1xf32>, memref<32xf32, #gpu.address_space<workgroup>>
+// CHECK: gpu.barrier
+// CHECK: vector.transfer_read %[[ALLOC]]{{.*}} : memref<32xf32, #gpu.address_space<workgroup>>, vector<1xf32>
+// CHECK: vector.transfer_write {{.*}} : vector<1xf32>, memref<128x32xf32>
+// CHECK: return