[Codegen][GPU] Add a pass for basic distribution verification (#18236)
After various levels of tiling to warps/threads and then bufferization
in GPU codegen pipelines, but before resolving distributed loops like
`scf.forall` ops, we have an opportunity for additional verification
that all operations were properly mapped to threads. In particular, any
operation that vectorized/bufferized to an operation with a write effect
must now be within a *thread* distributed context or else there is
almost certainly a write race. Such a case means something went wrong in
earlier passes and indicates a compiler failure.
Note: this is only added for the LLVMGPUTileAndFuse pipeline because
other pipelines allow for write effecting ops like
`memref.copy` to persist past `scf.forall` resolution.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
index da45408..4a0b879 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -72,6 +72,7 @@
"GPUTileReduction.cpp",
"GPUVectorAlloc.cpp",
"GPUVectorDistribution.cpp",
+ "GPUVerifyDistribution.cpp",
"Passes.cpp",
"VectorReductionToGPU.cpp",
"WorkgroupReordering.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
index e22fa03..eb51b3e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -70,6 +70,7 @@
"GPUTileReduction.cpp"
"GPUVectorAlloc.cpp"
"GPUVectorDistribution.cpp"
+ "GPUVerifyDistribution.cpp"
"Passes.cpp"
"VectorReductionToGPU.cpp"
"WorkgroupReordering.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp
new file mode 100644
index 0000000..273cadf
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp
@@ -0,0 +1,91 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_GPUVERIFYDISTRIBUTIONPASS
+#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
+
+namespace {
+
+template <typename... Type>
+bool forallOpHasMappingType(scf::ForallOp forallOp) {
+ std::optional<ArrayAttr> mapping = forallOp.getMapping();
+ if (!mapping || mapping.value().empty()) {
+ return false;
+ }
+
+ return isa<Type...>(*mapping.value().begin());
+}
+
+template <typename... Type>
+bool operationHasParentForallOfMappingType(Operation *op) {
+ auto parentForallOp = op->getParentOfType<scf::ForallOp>();
+ while (parentForallOp) {
+ if (forallOpHasMappingType<Type...>(parentForallOp)) {
+ return true;
+ }
+ parentForallOp = parentForallOp->getParentOfType<scf::ForallOp>();
+ }
+ return false;
+}
+
+/// Pass to verify that writes only happen in distributed contexts. Code in
+/// shared contexts is executed uniformly across all threads after resolution
+/// of distributed contexts (i.e. scf.forall), thus operations with write
+/// memory effects outside of such contexts are inherently racy.
+struct GPUVerifyDistributionPass final
+ : impl::GPUVerifyDistributionPassBase<GPUVerifyDistributionPass> {
+
+ void runOnOperation() override {
+ FunctionOpInterface funcOp = getOperation();
+
+ WalkResult res = funcOp.walk([](Operation *op) {
+ if (auto forallOp = dyn_cast<scf::ForallOp>(op)) {
+ std::optional<ArrayAttr> mapping = forallOp.getMapping();
+ if (!mapping || mapping.value().empty()) {
+ forallOp->emitOpError("requires a mapping attribute.");
+ return WalkResult::interrupt();
+ }
+
+ if (isa<IREE::GPU::LaneIdAttr>(*mapping.value().begin()) &&
+ !operationHasParentForallOfMappingType<
+ mlir::gpu::GPUWarpMappingAttr>(forallOp)) {
+ forallOp->emitOpError("lane distributed scf.forall must have a "
+ "parent subgroup distributed loop.");
+ return WalkResult::interrupt();
+ }
+ return WalkResult::advance();
+ }
+ if (auto memoryEffectOp = dyn_cast<MemoryEffectOpInterface>(op)) {
+ if (memoryEffectOp.hasEffect<MemoryEffects::Write>() &&
+ !operationHasParentForallOfMappingType<
+ mlir::gpu::GPUThreadMappingAttr, IREE::GPU::LaneIdAttr>(op)) {
+ op->emitOpError("write affecting operations are restricted to lane "
+ "or thread distributed contexts.");
+ return WalkResult::interrupt();
+ }
+ }
+ return WalkResult::advance();
+ });
+
+ if (res.wasInterrupted()) {
+ return signalPassFailure();
+ }
+ }
+};
+
+} // namespace
+
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
index 36507fd..cec8ba4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -184,6 +184,11 @@
let dependentDialects = ["::mlir::scf::SCFDialect"];
}
+def GPUVerifyDistributionPass :
+ InterfacePass<"iree-codegen-gpu-verify-distribution", "mlir::FunctionOpInterface"> {
+ let summary = "Pass to verify writes before resolving distributed contexts.";
+}
+
def GPUVectorAllocPass :
InterfacePass<"iree-codegen-gpu-vector-alloc", "mlir::FunctionOpInterface"> {
let summary = "Pass to create allocations for contraction inputs to copy "
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index 14dbfda..5854bd5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -38,6 +38,7 @@
"gpu_tile_reduction.mlir",
"gpu_vector_alloc.mlir",
"gpu_vector_distribution.mlir",
+ "gpu_verify_distribution.mlir",
"reduce_bank_conflicts.mlir",
"transform_gpu_distribute_shared_memory.mlir",
"transform_gpu_reduce_bank_conflicts.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
index 9ccc268..a61138b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
@@ -34,6 +34,7 @@
"gpu_tile_reduction.mlir"
"gpu_vector_alloc.mlir"
"gpu_vector_distribution.mlir"
+ "gpu_verify_distribution.mlir"
"reduce_bank_conflicts.mlir"
"transform_gpu_distribute_shared_memory.mlir"
"transform_gpu_reduce_bank_conflicts.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir
new file mode 100644
index 0000000..cf65a02
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir
@@ -0,0 +1,29 @@
+// RUN: iree-opt %s --split-input-file --verify-diagnostics \
+// RUN: --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-verify-distribution))"
+
+func.func @unmapped_forall(%out : memref<32xi32>) {
+ // expected-error @+1 {{requires a mapping attribute}}
+ scf.forall (%arg0) in (32) {
+ }
+ return
+}
+
+// -----
+
+func.func @write_in_warp_forall(%out : memref<32xi32>) {
+ %c0 = arith.constant 0 : i32
+ scf.forall (%arg0) in (32) {
+ // expected-error@+1 {{write affecting operations are restricted to lane or thread distributed contexts}}
+ memref.store %c0, %out[%arg0] : memref<32xi32>
+ } {mapping = [#gpu.warp<x>]}
+ return
+}
+
+// -----
+
+func.func @lane_forall_no_warp_parent(%out : memref<32xi32>) {
+ // expected-error@+1 {{lane distributed scf.forall must have a parent subgroup distributed loop}}
+ scf.forall (%arg0) in (32) {
+ } {mapping = [#iree_gpu.lane_id<0>]}
+ return
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 02d0342..0fd672b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -382,6 +382,7 @@
addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false);
// Step 8. Resolve remaining parallel loops.
+ funcPassManager.addPass(createGPUVerifyDistributionPass());
funcPassManager.addPass(createGPUDistributePass());
// Vectorize copies that came out of bufferization.