[GPU] Bail out in GPUReduceBankConflicts if we have collapse_shape user (#18863)
Padding an allocation that is consumed by a collapse_shape is unsupported upstream and can lead to a compiler error.
https://github.com/llvm/llvm-project/issues/112994
Progress towards: https://github.com/iree-org/iree/issues/18858
---------
Signed-off-by: Nirvedh <nirvedh@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp
index 807ab9d..51898ad 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUReduceBankConflicts.cpp
@@ -18,6 +18,23 @@
namespace {
+/// Check if `allocOp` has a memref.CollapseShapeOp user, looking through view-like ops (e.g. memref.subview).
+static bool hasCollapseShapeUser(memref::AllocOp allocOp) {
+  SmallVector<Operation *> users(allocOp->getUsers()); // worklist of (transitive) users
+  while (!users.empty()) {
+    auto user = users.pop_back_val();
+    if (isa<memref::CollapseShapeOp>(user)) {
+      return true;
+    }
+    if (isa<ViewLikeOpInterface>(user)) { // op aliases the alloc; also scan its users
+      for (auto u : user->getUsers()) {
+        users.push_back(u);
+      }
+    }
+  }
+  return false; // no collapse_shape reachable through any view chain
+}
+
/// Pad out the inner dimension of the `memref.alloc` op in order reduce the
/// chances to have bank conflicts when reading 2D shapes within shared memory.
static void padAlloc(MLIRContext *context, memref::AllocOp allocOp,
@@ -28,6 +45,12 @@
int64_t innerDim = allocOpShape.back();
if (ShapedType::isDynamic(innerDim))
return;
+
+  // Bail out if the alloc is (transitively) consumed by a CollapseShapeOp,
+  // since padding such an allocation is unsupported.
+ if (hasCollapseShapeUser(allocOp))
+ return;
+
Type elType = allocOp.getType().getElementType();
unsigned bitwidth =
mlir::DataLayout::closest(allocOp).getTypeSizeInBits(elType);
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir
index befb244..b934772 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/reduce_bank_conflicts.mlir
@@ -48,6 +48,66 @@
}
// -----
+// CHECK-LABEL: func.func @no_pad_alloc_collapse_shape
+// CHECK: %[[A:.*]] = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C:.*]] = memref.collapse_shape %[[A]] {{\[}}[0], [1, 2], [3, 4]]
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into
+// CHECK-SAME: memref<4x32x64xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VEC_READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST_0]] {in_bounds = [true]} :
+// CHECK-SAME: memref<1024x1024xf32>, vector<4xf32>
+// CHECK: vector.transfer_write %[[VEC_READ]], %[[C]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} :
+// CHECK-SAME: vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+
+
+func.func @no_pad_alloc_collapse_shape(%a: memref<1024x1024xf32>) {
+ %0 = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+ %1 = memref.collapse_shape %0 [[0], [1, 2], [3, 4]]
+ : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into memref<4x32x64xf32, #gpu.address_space<workgroup>>
+ %c0 = arith.constant 0 : index
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %3 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} :
+ memref<1024x1024xf32>, vector<4xf32>
+ vector.transfer_write %3, %1[%c0, %c0, %c0] {in_bounds = [true]} :
+ vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @no_pad_alloc_collapse_shape_throughsubview
+// CHECK: %[[A:.*]] = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[S:.*]] = memref.subview %[[A]][0, 0, 0, 0, 0] [4, 2, 16, 8, 8] [1, 1, 1, 1, 1] :
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> to
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C:.*]] = memref.collapse_shape %[[S]] {{\[}}[0], [1, 2], [3, 4]]
+// CHECK-SAME: memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into
+// CHECK-SAME: memref<4x32x64xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VEC_READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]] {in_bounds = [true]} :
+// CHECK-SAME: memref<1024x1024xf32>, vector<4xf32>
+// CHECK: vector.transfer_write %[[VEC_READ]], %[[C]][%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true]} :
+// CHECK-SAME: vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+
+
+func.func @no_pad_alloc_collapse_shape_throughsubview(%a: memref<1024x1024xf32>) {
+ %0 = memref.alloc() : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+ %subview = memref.subview %0[0, 0, 0, 0, 0] [4, 2, 16, 8, 8] [1, 1, 1, 1, 1]
+ : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> to memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>>
+ %1 = memref.collapse_shape %subview [[0], [1, 2], [3, 4]]
+ : memref<4x2x16x8x8xf32, #gpu.address_space<workgroup>> into memref<4x32x64xf32, #gpu.address_space<workgroup>>
+ %c0 = arith.constant 0 : index
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %3 = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true]} :
+ memref<1024x1024xf32>, vector<4xf32>
+ vector.transfer_write %3, %1[%c0, %c0, %c0] {in_bounds = [true]} :
+ vector<4xf32>, memref<4x32x64xf32, #gpu.address_space<workgroup>>
+ return
+}
+
+// -----
// CHECK-LABEL: func.func @pad_alloc_negative
// CHECK: memref.alloc(%{{.*}}) : memref<?x32x64xf32, #gpu.address_space<workgroup>