[GPU] Disable prefetching for loops with no computation (#19695)

There is no point in prefetching if you don't have any compute ops in the
loop.
Currently, attempting this leads to the bug described in
https://github.com/iree-org/iree/issues/19612
The solution implemented in this PR is to bail out when the loop has no
compute ops.
Fixes: https://github.com/iree-org/iree/issues/19612

---------

Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp
index ed96a13..f91ec6b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/PrefetchSharedMemoryCopy.cpp
@@ -205,6 +205,12 @@
         getValueDependencies(compute, computeDependencies);
       }
     }
+    // If `scf.yield` is the only compute op then there is no value in doing
+    // prefetching.
+    if (computeDependencies.size() == 1) {
+      LDBG("Loop does not have compute so not doing prefetching." << forOp);
+      return failure();
+    }
 
     // Restore the original order.
     for (auto &op : forOp.getBody()->getOperations()) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir
index 87cfeb5..e5d585b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/prefetch_shared_memory.mlir
@@ -134,3 +134,19 @@
   vector.transfer_write %0, %arg0[%c0] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
   return
 }
+
+// -----
+// CHECK-LABEL: @noprefetch_copyback
+func.func @noprefetch_copyback(%arg0: memref<128xf32>, %arg1: memref<128xf32>) {
+  %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  scf.for %arg2 = %c0 to %c128 step %c1{
+    %1 = vector.transfer_read %arg0[%arg2], %cst_0 : memref<128xf32>, vector<1xf32>
+    vector.transfer_write %1, %arg1[%arg2] {in_bounds = [true]} : vector<1xf32>, memref<128xf32>
+  }
+  return
+}
+// CHECK-NOT: gpu.barrier