[Codegen] Use safer hoisting in OptimizeTensorInsertExtractSlices (#23280)

Use the `moveLoopInvariantCodeFromGuaranteedLoops` transform instead of
the `moveLoopInvariantCode` transform in the
OptimizeTensorInsertExtractSlices pass. This transform is safer, because
it validates that loops will be executed at least once before hoisting
loop invariant code. Hoisting from loops that may not execute is not an
optimization, so this is a better version of the transformation.

The new safer transform also hoists from linalg.generic ops, so the
`moveLoopInvariantCodeFromGenericOps` is removed, since it is no longer
used.

This PR also removes the
`_batch_matmul_narrow_n_2_dispatch_4_unpack_i32` test, which was doing
nothing but checking that a tensor.empty op gets hoisted from an scf.for
loop (which cannot be guaranteed to execute). Hoisting empty tensors is
not the job of this pass, and the test is verbose, so the test is simply
removed.

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp b/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp
index ef4fbe5..0eabebe 100644
--- a/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/OptimizeTensorInsertExtractSlices.cpp
@@ -225,23 +225,6 @@
   }
 }
 
-void moveLoopInvariantCodeFromGenericOps(Operation *op) {
-  // linalg.generic operations are also loop-like, but they don't have
-  // LoopLikeOpInterface implemented for them.
-  op->walk([&](linalg::GenericOp genericOp) {
-    moveLoopInvariantCode(
-        &genericOp.getBodyRegion(),
-        [&](Value value, Region *) {
-          return !genericOp->isAncestor(value.getParentRegion()->getParentOp());
-        },
-        [&](Operation *op, Region *) {
-          return !isa<linalg::IndexOp>(op) && isMemoryEffectFree(op) &&
-                 isSpeculatable(op);
-        },
-        [&](Operation *op, Region *) { op->moveBefore(genericOp); });
-  });
-}
-
 namespace {
 struct CastLikeExtractSliceOpFolder final
     : OpRewritePattern<tensor::ExtractSliceOp> {
@@ -382,11 +365,8 @@
     extractSliceOp->moveAfter(latestInsertionPoint);
   });
 
-  funcOp.walk([&](scf::ForOp forOp) { moveLoopInvariantCode(forOp); });
-  LDBG() << "after hoisting loop invariant code\n" << funcOp;
-
-  moveLoopInvariantCodeFromGenericOps(funcOp);
-  LDBG() << "after hoisting loop invariant code out of generic ops\n" << funcOp;
+  moveLoopInvariantCodeFromGuaranteedLoops(funcOp);
+  LDBG() << "after hoisting loop invariant code\n" << funcOp << "\n";
 
   // TODO: walking in some reverse / inside-out order would be more efficient
   // and would capture more cases.
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir b/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir
index d17dcd0..a3d36fb 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir
@@ -229,42 +229,6 @@
 
 // -----
 
-#pipeline_layout = #hal.pipeline.layout<bindings = [
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>
-]>
-func.func @_batch_matmul_narrow_n_2_dispatch_4_unpack_i32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDataTiling>} {
-  %c0_i32 = arith.constant 0 : i32
-  %c2 = arith.constant 2 : index
-  %c128 = arith.constant 128 : index
-  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c128) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x1x1x2x8xi32>>
-  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>>
-  %workgroup_id_x = hal.interface.workgroup.id[0] : index
-  %workgroup_count_x = hal.interface.workgroup.count[0] : index
-  scf.for %arg0 = %workgroup_id_x to %c2 step %workgroup_count_x {
-    %2 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [%arg0, 0, 0], sizes = [1, 3, 2], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>> -> tensor<1x3x2xi32>
-    %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [%arg0, 0, 0, 0, 0], sizes = [1, 1, 1, 2, 8], strides = [1, 1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x1x1x2x8xi32>> -> tensor<1x1x1x2x8xi32>
-    %4 = vector.transfer_read %3[%c0, %c0, %c0, %c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<1x1x1x2x8xi32>, vector<2x8xi32>
-    %5 = vector.transpose %4, [1, 0] : vector<2x8xi32> to vector<8x2xi32>
-    %6 = tensor.empty() : tensor<3x2xi32>
-    %7 = vector.transfer_write %5, %6[%c0, %c0] {in_bounds = [false, true]} : vector<8x2xi32>, tensor<3x2xi32>
-    %inserted_slice = tensor.insert_slice %7 into %2[0, 0, 0] [1, 3, 2] [1, 1, 1] : tensor<3x2xi32> into tensor<1x3x2xi32>
-    iree_tensor_ext.dispatch.tensor.store %inserted_slice, %1, offsets = [%arg0, 0, 0], sizes = [1, 3, 2], strides = [1, 1, 1] : tensor<1x3x2xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x3x2xi32>>
-  }
-  return
-}
-
-// CHECK-LABEL: func.func @_batch_matmul_narrow_n_2_dispatch_4_unpack_i32
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK:       %[[EMPTY:.+]] = tensor.empty() : tensor<3x2xi32>
-// CHECK:       scf.for
-// CHECK:         %[[READ:.+]] = vector.transfer_read
-// CHECK:         %[[TRANS:.+]] = vector.transpose %[[READ]], [1, 0] : vector<2x8xi32> to vector<8x2xi32>
-// CHECK:         vector.transfer_write %[[TRANS]], %[[EMPTY]][%[[C0]], %[[C0]]] {in_bounds = [false, true]} : vector<8x2xi32>, tensor<3x2xi32>
-
-// -----
-
 func.func @subset_hoisting_invariant_tensor(%init: tensor<64x64xf32>, %t: tensor<64x64xf32>) -> tensor<64x64xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -373,3 +337,30 @@
 // CHECK: linalg.generic
 // CHECK-NOT: tensor.extract
 // CHECK: return
+
+// -----
+
+// Verify that loop invariant ops are not hoisted from regions that may not be
+// executed.
+func.func @no_hoist_from_possibly_unexecuted_region(%arg0: tensor<4x8xi32>) -> tensor<8x4xi32> {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %0 = tensor.empty() : tensor<8x4xi32>
+  %1 = scf.for %arg1 = %workgroup_id_x to %c1 step %c100 iter_args(%arg2 = %0) -> tensor<8x4xi32> {
+    %2 = vector.transfer_read %arg0[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : tensor<4x8xi32>, vector<2x8xi32>
+    %3 = vector.transpose %2, [1, 0] : vector<2x8xi32> to vector<8x2xi32>
+    %4 = vector.transfer_write %3, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<8x2xi32>, tensor<8x4xi32>
+    scf.yield %4 : tensor<8x4xi32>
+  }
+  return %1 : tensor<8x4xi32>
+}
+
+// CHECK-LABEL: func.func @no_hoist_from_possibly_unexecuted_region
+// CHECK:       scf.for {{.*}} {
+// CHECK:         vector.transfer_read
+// CHECK:         vector.transpose
+// CHECK:         vector.transfer_write
+// CHECK:       }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir
index bd7a5f7..1523a6b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir
@@ -43,10 +43,10 @@
 }
 //   CHECK-LABEL:  func.func @winograd_input_transform
 //     CHECK-NOT:      memref.alloc
-//         CHECK:      vector.transfer_read
-//         CHECK:      vector.transfer_read
 //         CHECK:      scf.for
 //         CHECK:        scf.for
+//         CHECK:          vector.transfer_read
+//         CHECK:          vector.transfer_read
 //         CHECK:          scf.for
 //         CHECK:          vector.transfer_read
 //         CHECK:          vector.contract
@@ -71,10 +71,10 @@
 }
 //   CHECK-LABEL:  func.func @winograd_output_transform
 //     CHECK-NOT:      memref.alloc
-//         CHECK:      vector.transfer_read
-//         CHECK:      vector.transfer_read
 //         CHECK:      scf.for
 //         CHECK:        scf.for
+//         CHECK:          vector.transfer_read
+//         CHECK:          vector.transfer_read
 //         CHECK:          scf.for
 //         CHECK:          vector.transfer_read
 //         CHECK:          vector.contract