Add gpu::BarrierOp conversion for ROCDL (#15570)
Added the gpu::BarrierOp conversion for ROCm: gpu.barrier is first rewritten
to amdgpu.lds_barrier, which is then lowered from amdgpu to rocdl and from
rocdl to llvm.
Co-authored-by: Harsh Menon <harsh@nod-labs.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index f1746e4..60e518b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -93,6 +93,10 @@
"//llvm-external-projects/iree-dialects:IREELinalgTransformDialect",
"//llvm-external-projects/iree-dialects:IREELinalgTransformDialectPasses",
"@llvm-project//llvm:Support",
+ "@llvm-project//mlir:AMDGPUDialect",
+ "@llvm-project//mlir:AMDGPUToROCDL",
+ "@llvm-project//mlir:AMDGPUTransforms",
+ "@llvm-project//mlir:AMDGPUUtils",
"@llvm-project//mlir:AffineDialect",
"@llvm-project//mlir:AffineToStandard",
"@llvm-project//mlir:Analysis",
@@ -131,6 +135,8 @@
"@llvm-project//mlir:PDLInterpDialect",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:ROCDLDialect",
+ "@llvm-project//mlir:ROCDLTarget",
+ "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
"@llvm-project//mlir:SCFDialect",
"@llvm-project//mlir:SCFToControlFlow",
"@llvm-project//mlir:SCFTransforms",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index 8aa547d..025ed5d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -72,6 +72,10 @@
IREELinalgTransformDialect
IREELinalgTransformDialectPasses
LLVMSupport
+ MLIRAMDGPUDialect
+ MLIRAMDGPUToROCDL
+ MLIRAMDGPUTransforms
+ MLIRAMDGPUUtils
MLIRAffineDialect
MLIRAffineToStandard
MLIRAnalysis
@@ -109,6 +113,8 @@
MLIRPDLInterpDialect
MLIRPass
MLIRROCDLDialect
+ MLIRROCDLTarget
+ MLIRROCDLToLLVMIRTranslation
MLIRSCFDialect
MLIRSCFToControlFlow
MLIRSCFTransforms
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index a3ab761..f7d33f8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -11,6 +11,7 @@
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
@@ -22,7 +23,9 @@
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
@@ -36,6 +39,49 @@
namespace {
+/// Looks up the `target_arch` string for the executable variant that encloses
+/// `entryPoint`.
+///
+/// Returns the value of the `target_arch` StringAttr found in the variant
+/// target's configuration. Returns an empty string when `entryPoint` has no
+/// enclosing IREE::HAL::ExecutableVariantOp, when the target has no
+/// configuration, or when the configuration has no string-typed `target_arch`
+/// entry.
+static StringRef getTargetArch(func::FuncOp entryPoint) {
+  auto variantOp =
+      entryPoint->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+  if (!variantOp) {
+    return "";
+  }
+  IREE::HAL::ExecutableTargetAttr target = variantOp.getTarget();
+  auto configuration = target.getConfiguration();
+  if (!configuration) {
+    return "";
+  }
+  auto archAttr = configuration.getAs<StringAttr>("target_arch");
+  if (!archAttr) {
+    return "";
+  }
+  return archAttr.getValue();
+}
+
+/// Returns the target architecture string for module `m`.
+///
+/// Iterates `m.getOps<func::FuncOp>()` and reports the `getTargetArch` result
+/// for the first function that `isEntryPoint` accepts; any later entry points
+/// are not consulted. Yields an empty string when no function qualifies.
+static StringRef getChipset(ModuleOp m) {
+  for (func::FuncOp candidate : m.getOps<func::FuncOp>()) {
+    if (!isEntryPoint(candidate)) {
+      continue;
+    }
+    return getTargetArch(candidate);
+  }
+  return "";
+}
+
+/// Rewrites `gpu.barrier` into `amdgpu.lds_barrier`.
+///
+/// IREE code generation currently only ever needs to synchronize LDS
+/// operations, whereas `gpu.barrier` also covers global memory operations.
+/// Swapping the op makes the barrier LDS specific.
+struct ReplaceGPUBarrierWithLDSBarrier : OpRewritePattern<gpu::BarrierOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(gpu::BarrierOp barrierOp,
+                                PatternRewriter &rewriter) const override {
+    // Unconditional match: the barrier is replaced by an LDS barrier built
+    // without any arguments.
+    OpBuilder::InsertionGuard restoreInsertionPoint(rewriter);
+    rewriter.replaceOpWithNewOp<amdgpu::LDSBarrierOp>(barrierOp);
+    return success();
+  }
+};
+
+/// Adds the `ReplaceGPUBarrierWithLDSBarrier` rewrite to `patterns`, using the
+/// context that `patterns` was created with.
+void populateConvertGPUToAMDGPUPatterns(RewritePatternSet &patterns) {
+  auto *context = patterns.getContext();
+  patterns.add<ReplaceGPUBarrierWithLDSBarrier>(context);
+}
+
/// A pass that replaces all occurrences of GPU device operations with their
/// corresponding ROCDL equivalent.
///
@@ -43,7 +89,8 @@
/// code.
struct ConvertToROCDLPass : public ConvertToROCDLBase<ConvertToROCDLPass> {
void getDependentDialects(DialectRegistry ®istry) const override {
- registry.insert<LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
+ registry.insert<LLVM::LLVMDialect, ROCDL::ROCDLDialect,
+ amdgpu::AMDGPUDialect, gpu::GPUDialect>();
}
void runOnOperation() override {
ModuleOp m = getOperation();
@@ -71,6 +118,7 @@
// Run Vector -> Vector transformations ahead of conversion to LLVM.
{
RewritePatternSet patterns(&getContext());
+ populateConvertGPUToAMDGPUPatterns(patterns);
populateDropSharedMemoryDeallocOpPatterns(patterns);
populateScalarizeMathOps(patterns);
populateConvertSharedMemoryAllocOps(patterns);
@@ -120,6 +168,10 @@
populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
arith::populateArithToLLVMConversionPatterns(converter, llvmPatterns);
+ StringRef chipset = getChipset(m);
+ FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
+ populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
+ *maybeChipset);
populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
populateGpuToROCDLConversionPatterns(converter, llvmPatterns,
gpu::amd::Runtime::Unknown);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
index 6273a8d..fd52a45 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
@@ -73,3 +73,28 @@
}
// CHECK-LABEL: llvm.func @reduction_maximum
// CHECK: llvm.intr.vector.reduce.fmax({{.*}}) : (vector<2xf32>) -> f32
+
+// -----
+// Test that gpu barriers are lowered to `s_waitcnt lgkmcnt(0)\0As_barrier` on rocm
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<4, storage_buffer>
+ ]>,
+ #hal.descriptor_set.layout<1, bindings = [
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+hal.executable @matmul_dispatch_0 {
+ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
+ hal.executable.export @matmul_dispatch_0 layout(#pipeline_layout)
+ builtin.module {
+ func.func @matmul_dispatch_0() {
+ gpu.barrier
+ return
+ }
+ }
+ }
+}
+// CHECK-LABEL: llvm.func @matmul_dispatch_0
+// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier", "" : () -> ()
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel
index 5309016..8818485 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/BUILD.bazel
@@ -43,11 +43,16 @@
"@llvm-project//llvm:MC",
"@llvm-project//llvm:Support",
"@llvm-project//llvm:Target",
+ "@llvm-project//mlir:AMDGPUDialect",
+ "@llvm-project//mlir:AMDGPUToROCDL",
+ "@llvm-project//mlir:AMDGPUTransforms",
+ "@llvm-project//mlir:AMDGPUUtils",
"@llvm-project//mlir:BuiltinToLLVMIRTranslation",
"@llvm-project//mlir:LLVMDialect",
"@llvm-project//mlir:LLVMToLLVMIRTranslation",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:ROCDLDialect",
+ "@llvm-project//mlir:ROCDLTarget",
"@llvm-project//mlir:ROCDLToLLVMIRTranslation",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:ToLLVMIRTranslation",
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt
index b0b2e6a..e64b04a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/CMakeLists.txt
@@ -31,11 +31,16 @@
LLVMSupport
LLVMTarget
LLVMipo
+ MLIRAMDGPUDialect
+ MLIRAMDGPUToROCDL
+ MLIRAMDGPUTransforms
+ MLIRAMDGPUUtils
MLIRBuiltinToLLVMIRTranslation
MLIRLLVMDialect
MLIRLLVMToLLVMIRTranslation
MLIRPass
MLIRROCDLDialect
+ MLIRROCDLTarget
MLIRROCDLToLLVMIRTranslation
MLIRSupport
MLIRTargetLLVMIRExport
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp
index db6a7fe..3aa6fe8 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Target/ROCM/ROCMTarget.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
@@ -85,6 +86,7 @@
mlir::registerLLVMDialectTranslation(registry);
mlir::registerROCDLDialectTranslation(registry);
registry.insert<IREE::Codegen::IREECodegenDialect>();
+ registry.insert<amdgpu::AMDGPUDialect>();
}
IREE::HAL::DeviceTargetAttr
@@ -320,6 +322,7 @@
LLVMInitializeAMDGPUTarget();
LLVMInitializeAMDGPUTargetMC();
LLVMInitializeAMDGPUTargetInfo();
+ LLVMInitializeAMDGPUAsmParser();
LLVMInitializeAMDGPUAsmPrinter();
return std::make_shared<ROCMTargetBackend>();
});