[GPU] Disable prefetching for loops with no computation (#19695)
There is no point in prefetching if you don't have any compute ops in the
loop.
Currently, attempting this leads to the bug described in
https://github.com/iree-org/iree/issues/19612
The solution implemented in this PR is to bail out when the loop has no
compute ops.
Fixes: https://github.com/iree-org/iree/issues/19612
---------
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp
index ed96a13..f91ec6b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp
@@ -205,6 +205,12 @@
getValueDependencies(compute, computeDependencies);
}
}
+ // If `scf.yield` is the only compute op then there is no value in doing
+ // prefetching.
+ if (computeDependencies.size() == 1) {
+ LDBG("Loop does not have compute so not doing prefetching." << forOp);
+ return failure();
+ }
// Restore the original order.
for (auto &op : forOp.getBody()->getOperations()) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir
index 87cfeb5..e5d585b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir
@@ -134,3 +134,19 @@
vector.transfer_write %0, %arg0[%c0] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
return
}
+
+// -----
+// CHECK-LABEL: @noprefetch_copyback
+func.func @noprefetch_copyback(%arg0: memref<128xf32>, %arg1: memref<128xf32>) {
+ %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %c128 = arith.constant 128 : index
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ scf.for %arg2 = %c0 to %c128 step %c1{
+ %1 = vector.transfer_read %arg0[%arg2], %cst_0 : memref<128xf32>, vector<1xf32>
+ vector.transfer_write %1, %arg1[%arg2] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
+ }
+ return
+}
+// CHECK-NOT: gpu.barrier