[Transform] Add transform.iree.pack_shared_memory_alloc (#14503)
This patch adds the `transform.iree.pack_shared_memory_alloc` op. This
op simply takes in a funcOp and applies the LLVMGPUPackSharedMemoryAlloc
pass to it.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp
index 7b2e880..79152a9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp
@@ -9,54 +9,11 @@
#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
-#include "iree/compiler/Codegen/Transforms/Transforms.h"
-#include "iree/compiler/Codegen/Utils/GPUUtils.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
-#include "mlir/IR/Dominance.h"
namespace mlir {
namespace iree_compiler {
-/// Insert barriers and wait operations if there are allocs of a different alias
-/// group before the given alloc.
-static void addBarrier(func::FuncOp funcOp, Operation *alloc,
- ArrayRef<Operation *> aliasGroup) {
- Block *entryBlock = &(*funcOp.getBlocks().begin());
- bool needBarrier = false;
- if (alloc->getBlock() != entryBlock) {
- needBarrier = true;
- } else {
- for (Operation &op : entryBlock->getOperations()) {
- if (&op == alloc)
- break;
- if (op.getNumRegions() != 0) {
- needBarrier = true;
- break;
- }
- if (isa<memref::AllocOp>(&op) && !llvm::is_contained(aliasGroup, &op)) {
- needBarrier = true;
- break;
- }
- }
- }
- if (!needBarrier)
- return;
- OpBuilder builder(alloc);
- // TODO: make it a option if needed.
- bool hasAsyncCopies = true;
- if (hasAsyncCopies) {
- Value groupToken = builder.create<nvgpu::DeviceAsyncCreateGroupOp>(
- funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()),
- SmallVector<Value>());
- builder.create<nvgpu::DeviceAsyncWaitOp>(funcOp.getLoc(), groupToken,
- builder.getI32IntegerAttr(0));
- }
- builder.create<gpu::BarrierOp>(alloc->getLoc());
-}
-
namespace {
struct LLVMGPUPackSharedMemoryAllocPass
@@ -67,35 +24,7 @@
registry.insert<nvgpu::NVGPUDialect>();
}
- void runOnOperation() override {
- func::FuncOp funcOp = getOperation();
- DominanceInfo dominators(funcOp);
- SmallVector<Operation *> allocs;
- funcOp.walk([&](memref::AllocOp alloc) {
- if (hasSharedMemoryAddressSpace(alloc.getType())) {
- allocs.push_back(alloc);
- }
- });
- // First sink the alloc as low as possible in the CFG.
- sinkOpsInCFG(allocs, dominators);
- SmallVector<AliasGroup> aliasGroups;
- analyseAllocsForPacking(funcOp, allocs, aliasGroups);
- // If there is 1 or less alias group there is nothing to do.
- if (aliasGroups.size() <= 1)
- return;
-
- // Pack all the allocations into one i8 alloc.
- // We may need to add extra barriers to make sure we are done writting or
- // reading from the previous alias group before starting a new one.
- for (size_t i = 0; i < aliasGroups.size(); i++) {
- for (Operation *alloc : aliasGroups[i]) {
- addBarrier(funcOp, alloc, aliasGroups[i]);
- }
- }
-
- OpBuilder builder(funcOp.getContext());
- packAllocs(builder, funcOp, aliasGroups);
- }
+ void runOnOperation() override { packSharedMemoryAlloc(getOperation()); }
};
} // namespace
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 1d3e79e..c2c5245 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -18,6 +18,10 @@
namespace mlir {
namespace iree_compiler {
+//===----------------------------------------------------------------------===//
+// Passes
+//===----------------------------------------------------------------------===//
+
/// Lowering using SIMT CUDA core operations.
void addGPUMatmulSimtPassPipeline(OpPassManager &pm);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index 8a84318..83b3844 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -48,6 +48,7 @@
#define DBGS_VECTOR_TO_MMA() (dbgs() << '[' << DEBUG_VECTOR_TO_MMA << "] ")
using namespace mlir;
+using namespace mlir::iree_compiler;
using namespace mlir::iree_compiler::IREE;
iree_compiler::IREE::transform_dialect::LLVMGPUExtensions::LLVMGPUExtensions() {
@@ -1478,5 +1479,20 @@
return DiagnosedSilenceableFailure::success();
}
+DiagnosedSilenceableFailure
+transform_dialect::PackSharedMemoryAllocOp::applyToOne(
+ transform::TransformRewriter &rewriter, func::FuncOp target,
+ transform::ApplyToEachResultList &results,
+ transform::TransformState &state) {
+ packSharedMemoryAlloc(target);
+ return DiagnosedSilenceableFailure::success();
+}
+
+void transform_dialect::PackSharedMemoryAllocOp::getEffects(
+ SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+ transform::onlyReadsHandle(getTarget(), effects);
+ transform::modifiesPayload(effects);
+}
+
#define GET_OP_CLASSES
#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.cpp.inc"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
index 718a82d..9c2f6ee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -648,4 +648,38 @@
}];
}
+def PackSharedMemoryAllocOp : Op<Transform_Dialect, "iree.pack_shared_memory_alloc",
+ [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+ TransformEachOpTrait,
+ TransformOpInterface,
+ ReportTrackingListenerFailuresOpTrait]> {
+ let summary = "Pack shared memory allocation to reduce memory usage";
+ let description = [{
+ Looks for allocs in shared memory space with overlapping liveness and
+ groups them, then packs all the allocations in each group into one i8
+ alloc. Also adds barriers to make sure we are done writing/reading
+ from the previous alias group before starting a new one.
+
+ #### Return modes
+
+ It does not consume the target handle and always returns success.
+ }];
+
+ let arguments = (
+ ins TransformHandleTypeInterface:$target
+ );
+ let results = (outs);
+
+ let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
+ let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
+
+ let extraClassDeclaration = [{
+ ::mlir::DiagnosedSilenceableFailure applyToOne(
+ ::mlir::transform::TransformRewriter &rewriter,
+ ::mlir::func::FuncOp funcOp,
+ ::mlir::transform::ApplyToEachResultList &results,
+ ::mlir::transform::TransformState &state);
+ }];
+}
+
#endif // IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel
index c139d79..62e6766 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel
@@ -24,6 +24,7 @@
"LLVMGPUUtils.h",
],
deps = [
+ "//compiler/src/iree/compiler/Codegen/Transforms",
"//compiler/src/iree/compiler/Codegen/Utils",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:AffineDialect",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt
index dfb1223..8609095 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt
@@ -29,6 +29,7 @@
MLIRMemRefDialect
MLIRNVGPUDialect
MLIRVectorDialect
+ iree::compiler::Codegen::Transforms
iree::compiler::Codegen::Utils
PUBLIC
)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp
index 824251b..997f150 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp
@@ -6,6 +6,7 @@
#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
+#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
@@ -348,5 +349,71 @@
}
}
+/// Insert barriers and wait operations if there are allocs of a different alias
+/// group before the given alloc.
+static void addBarrier(func::FuncOp funcOp, Operation *alloc,
+ ArrayRef<Operation *> aliasGroup) {
+ Block *entryBlock = &(*funcOp.getBlocks().begin());
+ bool needBarrier = false;
+ if (alloc->getBlock() != entryBlock) {
+ needBarrier = true;
+ } else {
+ for (Operation &op : entryBlock->getOperations()) {
+ if (&op == alloc)
+ break;
+ if (op.getNumRegions() != 0) {
+ needBarrier = true;
+ break;
+ }
+ if (isa<memref::AllocOp>(&op) && !llvm::is_contained(aliasGroup, &op)) {
+ needBarrier = true;
+ break;
+ }
+ }
+ }
+ if (!needBarrier)
+ return;
+ OpBuilder builder(alloc);
+ // TODO: make it an option if needed.
+ bool hasAsyncCopies = true;
+ if (hasAsyncCopies) {
+ Value groupToken = builder.create<nvgpu::DeviceAsyncCreateGroupOp>(
+ funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()),
+ SmallVector<Value>());
+ builder.create<nvgpu::DeviceAsyncWaitOp>(funcOp.getLoc(), groupToken,
+ builder.getI32IntegerAttr(0));
+ }
+ builder.create<gpu::BarrierOp>(alloc->getLoc());
+}
+
+void packSharedMemoryAlloc(func::FuncOp funcOp) {
+ DominanceInfo dominators(funcOp);
+ SmallVector<Operation *> allocs;
+ funcOp.walk([&](memref::AllocOp alloc) {
+ if (hasSharedMemoryAddressSpace(alloc.getType())) {
+ allocs.push_back(alloc);
+ }
+ });
+ // First sink the alloc as low as possible in the CFG.
+ sinkOpsInCFG(allocs, dominators);
+ SmallVector<AliasGroup> aliasGroups;
+ analyseAllocsForPacking(funcOp, allocs, aliasGroups);
+ // If there is 1 or less alias group there is nothing to do.
+ if (aliasGroups.size() <= 1)
+ return;
+
+ // Pack all the allocations into one i8 alloc.
+ // We may need to add extra barriers to make sure we are done writing or
+ // reading from the previous alias group before starting a new one.
+ for (size_t i = 0; i < aliasGroups.size(); i++) {
+ for (Operation *alloc : aliasGroups[i]) {
+ addBarrier(funcOp, alloc, aliasGroups[i]);
+ }
+ }
+
+ OpBuilder builder(funcOp.getContext());
+ packAllocs(builder, funcOp, aliasGroups);
+}
+
} // namespace iree_compiler
} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h
index 2c96986..ff6d3f8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h
@@ -25,6 +25,14 @@
/// Function to reorder transposes and elementwise ops.
void reorderTranspose(RewriterBase &rewriter, func::FuncOp funcOp);
+/// Look for allocs in shared memory space with overlapping liveness,
+/// group them, and then pack all the allocations in each group into one i8
+/// alloc.
+///
+/// Also adds barriers to make sure we are done writing/reading
+/// from the previous alias group before starting a new one.
+void packSharedMemoryAlloc(func::FuncOp funcOp);
+
} // namespace iree_compiler
} // namespace mlir
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 86fbf96..6fd9833 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -50,6 +50,7 @@
"transform_dialect_vector_distribution.mlir",
"transform_dialect_bufferize.mlir",
"transform_dialect_eliminate_gpu_barriers.mlir",
+ "transform_dialect_pack_shared_memory_alloc.mlir",
"transform_dialect_promote_operands.mlir",
"transform_distribute_forall.mlir",
"transform_gpu_pipelining.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index b3c7270..13430df 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -45,6 +45,7 @@
"transform_dialect_bufferize.mlir"
"transform_dialect_eliminate_gpu_barriers.mlir"
"transform_dialect_hoist_allocs.mlir"
+ "transform_dialect_pack_shared_memory_alloc.mlir"
"transform_dialect_promote_operands.mlir"
"transform_dialect_vector_distribution.mlir"
"transform_distribute_forall.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
new file mode 100644
index 0000000..da0e7bc
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_pack_shared_memory_alloc.mlir
@@ -0,0 +1,33 @@
+// RUN: iree-opt %s --iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s
+
+// CHECK-LABEL: shared_memory_disjoint
+// CHECK-NOT: gpu.barrier
+// CHECK-DAG: %[[PACKED:.+]] = memref.alloc() : memref<1024xi8, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C512:.+]] = arith.constant 512 : index
+// CHECK: memref.view %[[PACKED]][%[[C512]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<128xf32, #gpu.address_space<workgroup>>
+// CHECK: nvgpu.device_async_create_group
+// CHECK: nvgpu.device_async_wait %0 {numGroups = 0 : i32}
+// CHECK: gpu.barrier
+// CHECK: memref.view %[[PACKED]][%[[C0]]][] : memref<1024xi8, #gpu.address_space<workgroup>> to memref<32xf32, #gpu.address_space<workgroup>>
+func.func @shared_memory_disjoint() {
+ %c0 = arith.constant 0 : index
+ %cst_f32 = arith.constant 0.000000e+00 : f32
+ %cst_i8 = arith.constant 0 : i8
+ %0 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
+ %1 = memref.alloc() : memref<128xf32, #gpu.address_space<workgroup>>
+ %2 = memref.alloc() : memref<32xf32, #gpu.address_space<workgroup>>
+ memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
+ memref.store %cst_f32, %1[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
+ memref.store %cst_f32, %0[%c0] : memref<128xf32, #gpu.address_space<workgroup>>
+ memref.store %cst_f32, %2[%c0] : memref<32xf32, #gpu.address_space<workgroup>>
+ return
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+ %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.iree.pack_shared_memory_alloc %0 : (!transform.any_op) -> ()
+ transform.iree.apply_cse %0 : !transform.any_op
+}