Revert commits following a bad test deactivation (#12147)

There seems to be a legitimate error lurking here that I do not understand at the moment, so this reverts the offending commits wholesale and re-enables the attention.mlir e2e tests that had been deactivated.
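
Among other things, this revert moves the MMA unroll-order heuristic (gpuMmaUnrollOrder) out of the shared GPUUtils and back inline into LLVMGPUTensorCoreVectorization.cpp. For reviewers, a minimal self-contained sketch of that heuristic follows, in plain C++ with no MLIR dependency; the names mmaUnrollOrder, IteratorKind, and lhsDims are illustrative only, not IREE or MLIR APIs. The order it produces is: reduction dimensions first, then parallel dimensions that appear in the LHS indexing map (so the tensor core LHS register can be reused), then the remaining parallel dimensions.

    #include <cstdint>
    #include <iostream>
    #include <set>
    #include <vector>

    enum class IteratorKind { Parallel, Reduction };

    // Sketch of the heuristic this revert inlines back into
    // LLVMGPUTensorCoreVectorization.cpp: reductions outermost, then
    // parallel dims used by the LHS operand, then the rest.
    std::vector<int64_t> mmaUnrollOrder(const std::vector<IteratorKind> &iters,
                                        const std::set<int64_t> &lhsDims) {
      std::vector<int64_t> order;
      for (int64_t i = 0, e = iters.size(); i < e; ++i)
        if (iters[i] == IteratorKind::Reduction) order.push_back(i);
      for (int64_t i = 0, e = iters.size(); i < e; ++i)
        if (iters[i] == IteratorKind::Parallel && lhsDims.count(i))
          order.push_back(i);
      for (int64_t i = 0, e = iters.size(); i < e; ++i)
        if (iters[i] == IteratorKind::Parallel && !lhsDims.count(i))
          order.push_back(i);
      return order;
    }

    int main() {
      // Matmul-like contraction: d0 = M (parallel), d1 = N (parallel),
      // d2 = K (reduction); the LHS indexing map reads (d0, d2).
      std::vector<IteratorKind> iters = {IteratorKind::Parallel,
                                         IteratorKind::Parallel,
                                         IteratorKind::Reduction};
      for (int64_t d : mmaUnrollOrder(iters, /*lhsDims=*/{0, 2}))
        std::cout << d << ' ';  // prints: 2 0 1
      std::cout << '\n';
      return 0;
    }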
diff --git a/compiler/src/iree/compiler/Codegen/Common/BUILD b/compiler/src/iree/compiler/Codegen/Common/BUILD
index d6ba6c8..0d483a0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/BUILD
+++ b/compiler/src/iree/compiler/Codegen/Common/BUILD
@@ -97,7 +97,6 @@
         "//compiler/src/iree/compiler/Codegen/LLVMCPU/TransformExtensions:LLVMCPUExtensions",
         "//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions",
         "@llvm-project//mlir:AffineTransformOps",
-        "@llvm-project//mlir:BufferizationTransformOps",
         "@llvm-project//mlir:GPUTransformOps",
         "@llvm-project//mlir:LinalgTransformOps",
         "@llvm-project//mlir:MemRefTransformOps",
diff --git a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
index ddf494a..d722594 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -39,7 +39,6 @@
     MLIRArithUtils
     MLIRAsyncDialect
     MLIRBufferizationDialect
-    MLIRBufferizationTransformOps
     MLIRBufferizationTransforms
     MLIRFuncDialect
     MLIRGPUOps
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
index d89e6d1..a6207ac 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp
@@ -21,7 +21,6 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/BufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
-#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
 #include "mlir/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -100,7 +99,6 @@
     iree_compiler::registerTransformDialectLLVMCPUExtension(registry);
     iree_compiler::registerTransformDialectLLVMGPUExtension(registry);
     affine::registerTransformDialectExtension(registry);
-    bufferization::registerTransformDialectExtension(registry);
     gpu::registerTransformDialectExtension(registry);
     linalg::registerTransformDialectExtension(registry);
     memref::registerTransformDialectExtension(registry);
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
index b281ed8..62b7a42 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -14,7 +14,6 @@
 #include "iree/compiler/Codegen/Common/Transforms.h"
 #include "iree/compiler/Codegen/Interfaces/BufferizationInterfaces.h"
 #include "iree/compiler/Codegen/Passes.h"
-#include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
@@ -87,7 +86,6 @@
   ADD_PATTERN(swapPaddingElideConditional,
               getSwapPaddingElideConditionalAttrName)
   ADD_PATTERN(swappingPatterns, getSwappingPatternsAttrName)
-  ADD_PATTERN(unrollVectorsGpuMma, getUnrollVectorsGpuMmaAttrName)
 #undef ADD_PATTERN
   result.addTypes({pdl::OperationType::get(ctx)});
 }
@@ -204,23 +202,6 @@
       });
 }
 
-static Optional<SmallVector<int64_t>> getGPUTensorCoreNativeVectorSize(
-    Operation *op) {
-  return getWmmaNativeVectorSize(op);
-}
-
-static void addUnrollVectorsGpuMmaPatterns(RewritePatternSet &patterns) {
-  auto unrollOrder = [](Operation *op) -> Optional<SmallVector<int64_t>> {
-    auto contract = dyn_cast<vector::ContractionOp>(op);
-    if (!contract) return std::nullopt;
-    return mlir::iree_compiler::gpuMmaUnrollOrder(contract);
-  };
-  vector::populateVectorUnrollPatterns(
-      patterns, vector::UnrollVectorOptions()
-                    .setNativeShapeFn(getGPUTensorCoreNativeVectorSize)
-                    .setUnrollTraversalOrderFn(unrollOrder));
-}
-
 static void addAdditionalIreePatterns(RewritePatternSet &patterns) {
   patterns.add<GenerateToConstant>(patterns.getContext());
 }
@@ -265,7 +246,6 @@
     linalg::populateFoldReshapeOpsByExpansionPatterns(
         patterns, [](OpOperand *) { return true; });
   }
-  if (getUnrollVectorsGpuMma()) addUnrollVectorsGpuMmaPatterns(patterns);
 
   TrackingListener listener(state);
   GreedyRewriteConfig config;
@@ -790,22 +770,12 @@
     transform::TransformResults &transformResults,
     transform::TransformState &state) {
   ArrayRef<Operation *> targetOps = state.getPayloadOps(getTarget());
-  if (targetOps.empty()) {
-    transformResults.set(getForeachThreadOp().cast<OpResult>(), {});
-    transformResults.set(getTiledOp().cast<OpResult>(), {});
-    return DiagnosedSilenceableFailure::success();
-  }
-  if (targetOps.size() != 1) {
-    return mlir::emitDefiniteFailure(
-               state.getTopLevel(),
-               "expected single target op in payload, got: ")
-           << targetOps.size();
-  }
+  assert(targetOps.size() == 1 && "expected single target op in payload");
   auto funcOp = targetOps.front()->getParentOfType<func::FuncOp>();
   FailureOr<IREE::HAL::ExecutableExportOp> exportOp = getEntryPoint(funcOp);
   if (failed(exportOp)) {
-    return mlir::emitDefiniteFailure(state.getTopLevel(),
-                                     "couldn't find export op for func");
+    return mlir::emitDefiniteFailure(
+        state.getTopLevel(), "couldn't find top level HAL export op for func");
   }
 
   /// Lower the workgroup count region in keeping with the way dispatch
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h
index e268c4e..87cea4f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h
@@ -50,7 +50,6 @@
   bool rewritePackOps = false;
   bool swapPaddingElideConditional = false;
   bool swappingPatterns = false;
-  bool unrollVectorsGpuMma = false;
 };
 }  // namespace transform_dialect
 }  // namespace IREE
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
index 52f535d..1dcd352 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td
@@ -68,10 +68,8 @@
       tensor.extract_slice swapping pattern. This injects static information
       that guarantees padding is smaller than the window size which guarantees
       we never see a tile comprised of padding-only.
-      - unroll_vectors_gpu_mma: adds patterns that unroll vectors to a native tile
-      size for GPUs with mma operations. The size is currently hardcoded but 
-      should be refactored upstream and made pluggable.
-
+      This allows dropping the generation of an annoying internal scf.if but
+      may yield incorrect code in pathological cases.
 
     #### Return modes:
 
@@ -100,8 +98,7 @@
                        UnitAttr:$rank_reducing_vector,
                        UnitAttr:$rewrite_pack_ops,
                        UnitAttr:$swap_padding_elide_conditional,
-                       UnitAttr:$swapping_patterns,
-                       UnitAttr:$unroll_vectors_gpu_mma);
+                       UnitAttr:$swapping_patterns);
   let results = (outs PDL_Operation:$result);
 
   let assemblyFormat = "$target attr-dict";
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD
index 40c37ba..a6b366f 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD
@@ -55,7 +55,6 @@
         "//compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions:LLVMGPUExtensions",
         "//compiler/src/iree/compiler/Dialect/Flow/TransformExtensions:FlowExtensions",
         "@llvm-project//mlir:AffineTransformOps",
-        "@llvm-project//mlir:BufferizationTransformOps",
         "@llvm-project//mlir:GPUTransformOps",
         "@llvm-project//mlir:LinalgTransformOps",
         "@llvm-project//mlir:MemRefTransformOps",
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
index 8179315..05701b4 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
@@ -24,7 +24,6 @@
     IREELinalgExtTransformOps
     IREELinalgTransformDialect
     MLIRAffineTransformOps
-    MLIRBufferizationTransformOps
     MLIRGPUTransformOps
     MLIRIR
     MLIRLinalgTransformOps
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp
index ea299b6..dae53e8 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/Interfaces.cpp
@@ -18,7 +18,6 @@
 #include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h"
 #include "iree/compiler/Dialect/Flow/TransformExtensions/FlowExtensions.h"
 #include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
-#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 #include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h"
 #include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"
@@ -45,7 +44,6 @@
   linalg::registerTilingInterfaceExternalModels(registry);
 
   affine::registerTransformDialectExtension(registry);
-  bufferization::registerTransformDialectExtension(registry);
   gpu::registerTransformDialectExtension(registry);
   linalg::registerTransformDialectExtension(registry);
   memref::registerTransformDialectExtension(registry);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
index 73b167e..91c1b33 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
@@ -46,6 +46,38 @@
   vector::populateVectorReductionToContractPatterns(patterns);
 }
 
+static Optional<SmallVector<int64_t>> unrollOrder(Operation *op) {
+  auto contract = dyn_cast<vector::ContractionOp>(op);
+  if (!contract) return std::nullopt;
+  SmallVector<int64_t> order;
+  // Pick an unrolling order that lets tensor core ops reuse the LHS register.
+  // This is needed to get good performance on the sm_80 target.
+  // First make the reductions the outer dimensions.
+  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
+    if (vector::isReductionIterator(iter)) {
+      order.push_back(index);
+    }
+  }
+
+  llvm::SmallDenseSet<int64_t> dims;
+  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
+    dims.insert(expr.cast<AffineDimExpr>().getPosition());
+  }
+  // Then the parallel dimensions that feed the LHS, since we want to reuse it.
+  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
+    if (vector::isParallelIterator(iter) && dims.count(index)) {
+      order.push_back(index);
+    }
+  }
+  // Then the remaining parallel loops.
+  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
+    if (vector::isParallelIterator(iter) && !dims.count(index)) {
+      order.push_back(index);
+    }
+  }
+  return order;
+}
+
 /// Returns vector::ContractionOp operand's index where the result is used.
 static Optional<int> getVectorContractOpOperandId(
     vector::ContractionOp contractOp, OpResult result) {
@@ -198,11 +230,6 @@
 }
 
 static void populateVectorUnrollPatterns(RewritePatternSet &patterns) {
-  auto unrollOrder = [](Operation *op) -> Optional<SmallVector<int64_t>> {
-    auto contract = dyn_cast<vector::ContractionOp>(op);
-    if (!contract) return std::nullopt;
-    return gpuMmaUnrollOrder(contract);
-  };
   vector::populateVectorUnrollPatterns(
       patterns, vector::UnrollVectorOptions()
                     .setNativeShapeFn(getGPUTensorCoreNativeVectorSize)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 9e5efa6..f04a064 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -104,12 +104,12 @@
 
   auto &nestedModulePM = pm.nest<ModuleOp>();
   nestedModulePM.addNestedPass<func::FuncOp>(
+      createConvertToDestinationPassingStylePass(
+          useWARForCooperativeMatrixCodegen));
+  nestedModulePM.addNestedPass<func::FuncOp>(
       IREE::LinalgExt::createTileAndDecomposeAttentionPass());
   nestedModulePM.addNestedPass<func::FuncOp>(
       IREE::LinalgExt::createDecomposeSoftmaxPass());
-  nestedModulePM.addNestedPass<func::FuncOp>(
-      createConvertToDestinationPassingStylePass(
-          useWARForCooperativeMatrixCodegen));
   nestedModulePM.addPass(createCanonicalizerPass());
   nestedModulePM.addPass(createCSEPass());
 }
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
index eb87340..4cb6d99 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -112,37 +112,6 @@
   return threadsAvailable == 1;
 }
 
-/// Pick an unrolling order that will allow tensorcore operation to reuse LHS
-/// register. This is needed to get good performance on sm_80 target.
-Optional<SmallVector<int64_t>> gpuMmaUnrollOrder(
-    vector::ContractionOp contract) {
-  SmallVector<int64_t> order;
-  // First make reduction the outer dimensions.
-  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
-    if (vector::isReductionIterator(iter)) {
-      order.push_back(index);
-    }
-  }
-
-  llvm::SmallDenseSet<int64_t> dims;
-  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
-    dims.insert(expr.cast<AffineDimExpr>().getPosition());
-  }
-  // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
-  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
-    if (vector::isParallelIterator(iter) && dims.count(index)) {
-      order.push_back(index);
-    }
-  }
-  // Then the remaining parallel loops.
-  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
-    if (vector::isParallelIterator(iter) && !dims.count(index)) {
-      order.push_back(index);
-    }
-  }
-  return order;
-}
-
 //===----------------------------------------------------------------------===//
 // GPU workgroup memory
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
index ace232c..10e4838 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -48,11 +48,6 @@
                                            int64_t threadCount,
                                            int64_t vectorSize);
 
-/// Pick an unrolling order that will allow tensorcore operation to reuse LHS
-/// register. This is needed to get good performance on sm_80 target.
-Optional<SmallVector<int64_t>> gpuMmaUnrollOrder(
-    vector::ContractionOp contract);
-
 //===----------------------------------------------------------------------===//
 // GPU workgroup memory
 //===----------------------------------------------------------------------===//
diff --git a/tests/e2e/linalg_ext_ops/BUILD b/tests/e2e/linalg_ext_ops/BUILD
index ae8bd96..9966915 100644
--- a/tests/e2e/linalg_ext_ops/BUILD
+++ b/tests/e2e/linalg_ext_ops/BUILD
@@ -18,10 +18,7 @@
     srcs = enforce_glob(
         # keep sorted
         [
-            # attention.mlir is broken at HEAD and blocks commits.
-            # https://github.com/iree-org/iree/issues/12129
-            # reactivate when truly fixed.
-            # "attention.mlir",
+            "attention.mlir",
             "reverse.mlir",
             "scan.mlir",
             "scatter.mlir",
@@ -31,7 +28,6 @@
         ],
         include = ["*.mlir"],
         exclude = [
-            "attention.mlir",
             "pack.mlir",
             "unpack.mlir",
             "winograd_input.mlir",
@@ -99,10 +95,7 @@
     srcs = enforce_glob(
         # keep sorted
         [
-            # attention.mlir is broken at HEAD and blocks commits.
-            # https://github.com/iree-org/iree/issues/12129
-            # reactivate when truly fixed.
-            # "attention.mlir",
+            "attention.mlir",
             "pack.mlir",
             "reverse.mlir",
             "scan.mlir",
@@ -115,9 +108,6 @@
             "winograd_output.mlir",
         ],
         include = ["*.mlir"],
-        exclude = [
-            "attention.mlir",
-        ],
     ),
     driver = "local-task",
     target_backend = "llvm-cpu",
diff --git a/tests/e2e/linalg_ext_ops/CMakeLists.txt b/tests/e2e/linalg_ext_ops/CMakeLists.txt
index d209465..46a8b04 100644
--- a/tests/e2e/linalg_ext_ops/CMakeLists.txt
+++ b/tests/e2e/linalg_ext_ops/CMakeLists.txt
@@ -14,6 +14,7 @@
   NAME
     check_cuda
   SRCS
+    "attention.mlir"
     "reverse.mlir"
     "scan.mlir"
     "scatter.mlir"
@@ -77,6 +78,7 @@
   NAME
     check_llvm-cpu_local-task
   SRCS
+    "attention.mlir"
     "pack.mlir"
     "reverse.mlir"
     "scan.mlir"