Adds LLVMGPUCheckIRBeforeLLVMConversionPass to check shared memory allocation before lowering to LLVM (#12743)

Applies a static shared memory check before LLVM conversion. The current
limit is set to 163KB based on `sm_86` (the maximum we expect to see at any
given point). Will follow up with target-specific checks based on the `sm`
architecture.

Similar to LLVMCPUCheckIRBeforeLLVMConversionPass, but that pass cannot be
reused because the allocation types differ.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index 57fd5ed..7b66eb5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -20,6 +20,7 @@
         "ConvertToROCDL.cpp",
         "ExtractAddressComputationGPUPass.cpp",
         "KernelConfig.cpp",
+        "LLVMGPUCheckIRBeforeLLVMConversion.cpp",
         "LLVMGPUDistribute.cpp",
         "LLVMGPULowerExecutableTarget.cpp",
         "LLVMGPUPackSharedMemoryAlloc.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index 24f489d..3fc98ef 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -24,6 +24,7 @@
     "ConvertToROCDL.cpp"
     "ExtractAddressComputationGPUPass.cpp"
     "KernelConfig.cpp"
+    "LLVMGPUCheckIRBeforeLLVMConversion.cpp"
     "LLVMGPUDistribute.cpp"
     "LLVMGPULowerExecutableTarget.cpp"
     "LLVMGPUPackSharedMemoryAlloc.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCheckIRBeforeLLVMConversion.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCheckIRBeforeLLVMConversion.cpp
new file mode 100644
index 0000000..3bd3c21
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCheckIRBeforeLLVMConversion.cpp
@@ -0,0 +1,118 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/PassDetail.h"
+#include "iree/compiler/Codegen/Passes.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace iree_compiler {
+
+/// Upper bound, in bytes, on the shared memory a single function may
+/// statically allocate. Defaults to 163 KiB (163 * 1024 = 166912 bytes).
+static llvm::cl::opt<int> clMaxGPUSharedMemSize(
+    "iree-llvmgpu-shared-mem-allocation-limit",
+    llvm::cl::desc("maximum allowed shared memory size in bytes"),
+    llvm::cl::init(163 * 1024));
+
+namespace {
+/// Pass that runs checkGPUAllocationSize on every func.func directly nested in
+/// the module and signals pass failure as soon as one check fails.
+struct LLVMGPUCheckIRBeforeLLVMConversionPass
+    : LLVMGPUCheckIRBeforeLLVMConversionBase<
+          LLVMGPUCheckIRBeforeLLVMConversionPass> {
+  void runOnOperation() override;
+};
+}  // namespace
+
+/// Returns the size in bits of the statically shaped part of `shapedType`.
+///
+/// Static dimensions are multiplied together; dynamic dimensions are skipped
+/// (i.e. counted as 1). A shaped element type (e.g. a vector) is sized
+/// recursively, otherwise the element's integer/float bit width is used.
+/// Sizes are accumulated in 64 bits since a bit count overflows a 32-bit
+/// `int` once a buffer reaches 256 MiB.
+///
+/// NOTE(review): a non-shaped element type is passed straight to
+/// getIntOrFloatBitWidth(); confirm that other element types (e.g. `index`)
+/// cannot reach this point.
+static int64_t shapedTypeStaticSize(ShapedType shapedType) {
+  int64_t sizeInBits = 1;
+  for (int64_t dimSize : shapedType.getShape()) {
+    if (ShapedType::isDynamic(dimSize)) continue;
+    sizeInBits *= dimSize;
+  }
+  Type elementType = shapedType.getElementType();
+  if (auto nestedType = elementType.dyn_cast<ShapedType>()) {
+    return sizeInBits * shapedTypeStaticSize(nestedType);
+  }
+  return sizeInBits * elementType.getIntOrFloatBitWidth();
+}
+
+/// Returns success if the shared memory statically allocated by `funcOp` does
+/// not exceed the limit set by clMaxGPUSharedMemSize.
+///
+/// Only memref.alloc ops whose type is in the shared memory address space are
+/// counted. An error is emitted on the first such alloc that has dynamic
+/// sizes, or on `funcOp` when the summed size is over the limit.
+static LogicalResult checkGPUAllocationSize(func::FuncOp funcOp) {
+  if (funcOp.getBody().empty()) return success();
+
+  int64_t totalSizeInBytes = 0;
+  WalkResult walkResult = funcOp.walk([&](memref::AllocOp allocOp) {
+    auto allocType = allocOp.getType().cast<MemRefType>();
+    if (!hasSharedMemoryAddressSpace(allocType)) {
+      return WalkResult::advance();
+    }
+    if (!allocOp.getDynamicSizes().empty()) {
+      allocOp.emitOpError("dynamic shared memory allocations unsupported.");
+      return WalkResult::interrupt();
+    }
+    int64_t sizeInBits = shapedTypeStaticSize(allocType);
+    // Round the size up to a multiple of the requested alignment. A zero
+    // alignment is ignored so that it cannot cause a division by zero.
+    auto alignment = allocOp.getAlignment();
+    if (alignment && *alignment > 0) {
+      int64_t alignmentInBits = static_cast<int64_t>(*alignment) * 8;
+      sizeInBits = ((sizeInBits + alignmentInBits - 1) / alignmentInBits) *
+                   alignmentInBits;
+    }
+    // Convert to bytes, rounding up so a trailing partial byte is counted.
+    totalSizeInBytes += (sizeInBits + 7) / 8;
+    return WalkResult::advance();
+  });
+  if (walkResult.wasInterrupted()) return failure();
+
+  if (totalSizeInBytes > clMaxGPUSharedMemSize.getValue()) {
+    return funcOp.emitOpError("exceeded GPU memory limit of ")
+           << clMaxGPUSharedMemSize.getValue() << " bytes for function. Got "
+           << totalSizeInBytes << " bytes";
+  }
+  return success();
+}
+
+/// Checks every func.func directly inside the module; the first failing
+/// function fails the pass.
+void LLVMGPUCheckIRBeforeLLVMConversionPass::runOnOperation() {
+  auto moduleOp = getOperation();
+  for (auto funcOp : moduleOp.getOps<func::FuncOp>()) {
+    if (failed(checkGPUAllocationSize(funcOp))) {
+      return signalPassFailure();
+    }
+  }
+}
+
+/// Creates the pass that checks shared memory usage before LLVM conversion.
+std::unique_ptr<OperationPass<ModuleOp>>
+createLLVMGPUCheckIRBeforeLLVMConversionPass() {
+  return std::make_unique<LLVMGPUCheckIRBeforeLLVMConversionPass>();
+}
+
+}  // namespace iree_compiler
+}  // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index f75cf6a..5566877 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -504,6 +504,9 @@
   addLowerAndOptimzeAddressComputation(pm);
   // THIS NEEDS TO RUN BEFORE SCF ->CF OFF
 
+  // Run checks on shared memory usage.
+  pm.addPass(createLLVMGPUCheckIRBeforeLLVMConversionPass());
+
   // SCF -> STD
   pm.addNestedPass<func::FuncOp>(createConvertSCFToCFPass());
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
index 831ced7..cc52276 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
@@ -167,9 +167,6 @@
            << pipelineName;
   }
 
-  // Verify shared memory usage is within the limit.
-  // TODO(KoolJBlack): working on adding check shared memory usage.
-
   // Return success for SIMT/CUDA cores.
   if (pipeline.getValue() ==
       IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index fd3b1cf..1609fc8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -19,6 +19,7 @@
     srcs = enforce_glob(
         [
             "attention.mlir",
+            "check_ir_before_llvm_conversion.mlir",
             "conv_pipeline_test.mlir",
             "convert_to_nvvm.mlir",
             "convert_to_rocdl.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index e78caf7..5b16012 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -15,6 +15,7 @@
     lit
   SRCS
     "attention.mlir"
+    "check_ir_before_llvm_conversion.mlir"
     "conv_pipeline_test.mlir"
     "convert_to_nvvm.mlir"
     "convert_to_rocdl.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/check_ir_before_llvm_conversion.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/check_ir_before_llvm_conversion.mlir
new file mode 100644
index 0000000..73e9a68
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/check_ir_before_llvm_conversion.mlir
@@ -0,0 +1,21 @@
+// RUN: iree-opt --iree-llvmgpu-check-ir-before-llvm-conversion %s --verify-diagnostics -split-input-file
+
+// A static shared memory allocation larger than the default limit is rejected.
+module {
+  // expected-error @+1 {{'func.func' op exceeded GPU memory limit of 166912 bytes for function. Got 274432 bytes}}
+  func.func @shared_mem_alloc(%arg0: index) {
+    %alloc = memref.alloc() : memref<274432xi8, #gpu.address_space<workgroup>>
+    return
+  }
+}
+
+// -----
+
+// A shared memory allocation with dynamic sizes is rejected.
+module {
+  func.func @dynamic_shared_mem_alloc(%arg0: index) {
+    // expected-error @+1 {{dynamic shared memory allocations unsupported}}
+    %alloc = memref.alloc(%arg0) : memref<?xi8, #gpu.address_space<workgroup>>
+    return
+  }
+}
diff --git a/compiler/src/iree/compiler/Codegen/Passes.h b/compiler/src/iree/compiler/Codegen/Passes.h
index 49eb4b5..2e07a02 100644
--- a/compiler/src/iree/compiler/Codegen/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Passes.h
@@ -601,6 +601,10 @@
 std::unique_ptr<OperationPass<func::FuncOp>>
 createLLVMGPUPackSharedMemoryAlloc();
 
+/// Checks GPU backend specific IR constraints such as shared memory limits.
+std::unique_ptr<OperationPass<ModuleOp>>
+createLLVMGPUCheckIRBeforeLLVMConversionPass();
+
 //------------------------------------------------------------------------------
 // SPIR-V Passes
 //------------------------------------------------------------------------------
diff --git a/compiler/src/iree/compiler/Codegen/Passes.td b/compiler/src/iree/compiler/Codegen/Passes.td
index 3fa680c..2b7e11b 100644
--- a/compiler/src/iree/compiler/Codegen/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Passes.td
@@ -733,6 +733,12 @@
   let constructor = "mlir::iree_compiler::createLLVMGPUTensorPadPass()";
 }
 
+def LLVMGPUCheckIRBeforeLLVMConversion :
+    Pass<"iree-llvmgpu-check-ir-before-llvm-conversion", "ModuleOp"> {
+  let summary = "Checks GPU backend specific IR constraints such as shared memory limits";
+  let constructor = "mlir::iree_compiler::createLLVMGPUCheckIRBeforeLLVMConversionPass()";
+}
+
 //------------------------------------------------------------------------------
 // SPIR-V
 //------------------------------------------------------------------------------