[Codegen][GPU] Add a pass for basic distribution verification (#18236)

After various levels of tiling to warps/threads and then bufferization
in GPU codegen pipelines, but before resolving distributed loops like
`scf.forall` ops, we have an opportunity for additional verification
that all operations were properly mapped to threads. In particular, any
operation that vectorized/bufferized to an operation with a write effect
must now be within a *thread* distributed context or else there is
almost certainly a write race. Such a case means something went wrong in
earlier passes and is a compiler failure.

Note: this is only added for the LLVMGPUTileAndFuse pipeline because
other pipelines allow for write effecting ops like
`memref.copy` to persist past `scf.forall` resolution.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
index da45408..4a0b879 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -72,6 +72,7 @@
         "GPUTileReduction.cpp",
         "GPUVectorAlloc.cpp",
         "GPUVectorDistribution.cpp",
+        "GPUVerifyDistribution.cpp",
         "Passes.cpp",
         "VectorReductionToGPU.cpp",
         "WorkgroupReordering.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
index e22fa03..eb51b3e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -70,6 +70,7 @@
     "GPUTileReduction.cpp"
     "GPUVectorAlloc.cpp"
     "GPUVectorDistribution.cpp"
+    "GPUVerifyDistribution.cpp"
     "Passes.cpp"
     "VectorReductionToGPU.cpp"
     "WorkgroupReordering.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp
new file mode 100644
index 0000000..273cadf
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp
@@ -0,0 +1,91 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_GPUVERIFYDISTRIBUTIONPASS
+#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
+
+namespace {
+
+/// Returns true when `forallOp` has a non-empty mapping whose first attribute
+/// is one of `Type...`. Only that leading attribute is inspected; an absent
+/// or empty mapping yields false.
+template <typename... Type>
+bool forallOpHasMappingType(scf::ForallOp forallOp) {
+  std::optional<ArrayAttr> mappingAttrs = forallOp.getMapping();
+  const bool isMapped = mappingAttrs && !mappingAttrs->empty();
+  return isMapped && isa<Type...>(*mappingAttrs->begin());
+}
+
+/// True if some scf.forall enclosing `op` is mapped to one of `Type...`.
+template <typename... Type>
+bool operationHasParentForallOfMappingType(Operation *op) {
+  for (auto ancestor = op->getParentOfType<scf::ForallOp>(); ancestor;
+       ancestor = ancestor->getParentOfType<scf::ForallOp>()) {
+    if (forallOpHasMappingType<Type...>(ancestor)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Verifies that every op with a write memory effect sits inside a thread or
+/// lane distributed `scf.forall`. Code outside of such loops runs uniformly
+/// on all threads once the foralls are resolved, so a write there would be
+/// repeated by every thread (almost certainly a race).
+struct GPUVerifyDistributionPass final
+    : impl::GPUVerifyDistributionPassBase<GPUVerifyDistributionPass> {
+  void runOnOperation() override {
+    FunctionOpInterface funcOp = getOperation();
+    WalkResult walkStatus = funcOp.walk([](Operation *op) {
+      auto forallOp = dyn_cast<scf::ForallOp>(op);
+      if (!forallOp) {
+        // Writes must be nested in a thread or lane distributed loop.
+        auto effects = dyn_cast<MemoryEffectOpInterface>(op);
+        if (!effects || !effects.hasEffect<MemoryEffects::Write>() ||
+            operationHasParentForallOfMappingType<
+                mlir::gpu::GPUThreadMappingAttr, IREE::GPU::LaneIdAttr>(op)) {
+          return WalkResult::advance();
+        }
+        op->emitOpError("write affecting operations are restricted to lane "
+                        "or thread distributed contexts.");
+        return WalkResult::interrupt();
+      }
+
+      // Every scf.forall must state what it is distributed over.
+      std::optional<ArrayAttr> mapping = forallOp.getMapping();
+      if (!mapping || mapping->empty()) {
+        forallOp->emitOpError("requires a mapping attribute.");
+        return WalkResult::interrupt();
+      }
+      // Lane loops only make sense nested inside a warp distributed loop.
+      if (!isa<IREE::GPU::LaneIdAttr>(*mapping->begin()) ||
+          operationHasParentForallOfMappingType<
+              mlir::gpu::GPUWarpMappingAttr>(forallOp)) {
+        return WalkResult::advance();
+      }
+      forallOp->emitOpError("lane distributed scf.forall must have a "
+                            "parent subgroup distributed loop.");
+      return WalkResult::interrupt();
+    });
+    if (walkStatus.wasInterrupted()) {
+      signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
index 36507fd..cec8ba4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -184,6 +184,11 @@
   let dependentDialects = ["::mlir::scf::SCFDialect"];
 }
 
+def GPUVerifyDistributionPass :
+    InterfacePass<"iree-codegen-gpu-verify-distribution", "mlir::FunctionOpInterface"> {
+  let summary = "Pass to verify writes before resolving distributed contexts.";
+}
+
 def GPUVectorAllocPass :
     InterfacePass<"iree-codegen-gpu-vector-alloc", "mlir::FunctionOpInterface"> {
   let summary = "Pass to create allocations for contraction inputs to copy "
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index 14dbfda..5854bd5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -38,6 +38,7 @@
             "gpu_tile_reduction.mlir",
             "gpu_vector_alloc.mlir",
             "gpu_vector_distribution.mlir",
+            "gpu_verify_distribution.mlir",
             "reduce_bank_conflicts.mlir",
             "transform_gpu_distribute_shared_memory.mlir",
             "transform_gpu_reduce_bank_conflicts.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
index 9ccc268..a61138b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
@@ -34,6 +34,7 @@
     "gpu_tile_reduction.mlir"
     "gpu_vector_alloc.mlir"
     "gpu_vector_distribution.mlir"
+    "gpu_verify_distribution.mlir"
     "reduce_bank_conflicts.mlir"
     "transform_gpu_distribute_shared_memory.mlir"
     "transform_gpu_reduce_bank_conflicts.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir
new file mode 100644
index 0000000..cf65a02
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir
@@ -0,0 +1,29 @@
+// RUN: iree-opt %s --split-input-file --verify-diagnostics \
+// RUN:   --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-verify-distribution))"
+
+func.func @unmapped_forall(%out : memref<32xi32>) {
+  // expected-error @+1 {{requires a mapping attribute}}
+  scf.forall (%arg0) in (32) {
+  }
+  return
+}
+
+// -----
+
+func.func @write_in_warp_forall(%out : memref<32xi32>) {
+  %c0 = arith.constant 0 : i32
+  scf.forall (%arg0) in (32) {
+    // expected-error@+1 {{write affecting operations are restricted to lane or thread distributed contexts}}
+    memref.store %c0, %out[%arg0] : memref<32xi32>
+  } {mapping = [#gpu.warp<x>]}
+  return
+}
+
+// -----
+
+func.func @lane_forall_no_warp_parent(%out : memref<32xi32>) {
+  // expected-error@+1 {{lane distributed scf.forall must have a parent subgroup distributed loop}}
+  scf.forall (%arg0) in (32) {
+  } {mapping = [#iree_gpu.lane_id<0>]}
+  return
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 02d0342..0fd672b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -382,6 +382,7 @@
   addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false);
 
   // Step 8. Resolve remaining parallel loops.
+  funcPassManager.addPass(createGPUVerifyDistributionPass());
   funcPassManager.addPass(createGPUDistributePass());
 
   // Vectorize copies that came out of bufferization.