Integrate LLVM at https://github.com/llvm/llvm-project/commit/161255eb059e36f40136e26c7eb2ab92ac98ef9d (#12822)

MHLO: https://github.com/tensorflow/mlir-hlo/commit/926e62f88435b5098054f9cb0b0eecb900e71e96
TensorFlow: https://github.com/tensorflow/tensorflow/commit/0a8fd11e73559d9bf0e547343f08f648981e2ff7

Co-authored-by: Lei Zhang <antiagainst@google.com>
Co-authored-by: Mahesh Ravishankar <ravishankarm@google.com>
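
Most of the changes below are mechanical updates for upstream MLIR API
moves: the vector lowering entry points now live in
mlir/Dialect/Vector/Transforms/LoweringPatterns.h, transferOpflowOpt
threads an explicit rewriter, and the contract/transpose lowering
populate functions take explicit VectorTransformsOptions. A minimal
sketch of the before/after call pattern (illustrative only; the helper
function is hypothetical and the signatures are as exercised by the
hunks below):

    // Hypothetical helper showing the call-site updates this integrate
    // applies; only the vector:: entry points come from the patch itself.
    #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
    #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
    #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
    #include "mlir/IR/PatternMatch.h"

    using namespace mlir;

    static void lowerVectorOps(Operation *funcOp, RewritePatternSet &patterns) {
      // Was: vector::transferOpflowOpt(funcOp);
      // An explicit rewriter is now required so listeners can observe erasures.
      IRRewriter rewriter(funcOp->getContext());
      vector::transferOpflowOpt(rewriter, funcOp);

      // Was: populateVectorContractLoweringPatterns(patterns);
      // The lowering strategy is now passed via VectorTransformsOptions
      // (defaults used here, matching the patch).
      vector::populateVectorContractLoweringPatterns(
          patterns, vector::VectorTransformsOptions());
      vector::populateVectorTransposeLoweringPatterns(
          patterns, vector::VectorTransformsOptions());
    }
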
diff --git a/compiler/src/iree/compiler/API/api_exports.c b/compiler/src/iree/compiler/API/api_exports.c
index 7cff00b..15ada99 100644
--- a/compiler/src/iree/compiler/API/api_exports.c
+++ b/compiler/src/iree/compiler/API/api_exports.c
@@ -305,6 +305,7 @@
 extern void mlirF64TypeGet();
 extern void mlirFlatSymbolRefAttrGet();
 extern void mlirFlatSymbolRefAttrGetValue();
+extern void mlirFloat8E4M3B11FNUZTypeGet();
 extern void mlirFloat8E4M3FNTypeGet();
 extern void mlirFloat8E4M3FNUZTypeGet();
 extern void mlirFloat8E5M2FNUZTypeGet();
@@ -540,6 +541,7 @@
 extern void mlirTypeIsAF16();
 extern void mlirTypeIsAF32();
 extern void mlirTypeIsAF64();
+extern void mlirTypeIsAFloat8E4M3B11FNUZ();
 extern void mlirTypeIsAFloat8E4M3FN();
 extern void mlirTypeIsAFloat8E4M3FNUZ();
 extern void mlirTypeIsAFloat8E5M2();
@@ -893,6 +895,7 @@
   x += (uintptr_t)&mlirF64TypeGet;
   x += (uintptr_t)&mlirFlatSymbolRefAttrGet;
   x += (uintptr_t)&mlirFlatSymbolRefAttrGetValue;
+  x += (uintptr_t)&mlirFloat8E4M3B11FNUZTypeGet;
   x += (uintptr_t)&mlirFloat8E4M3FNTypeGet;
   x += (uintptr_t)&mlirFloat8E4M3FNUZTypeGet;
   x += (uintptr_t)&mlirFloat8E5M2FNUZTypeGet;
@@ -1128,6 +1131,7 @@
   x += (uintptr_t)&mlirTypeIsAF16;
   x += (uintptr_t)&mlirTypeIsAF32;
   x += (uintptr_t)&mlirTypeIsAF64;
+  x += (uintptr_t)&mlirTypeIsAFloat8E4M3B11FNUZ;
   x += (uintptr_t)&mlirTypeIsAFloat8E4M3FN;
   x += (uintptr_t)&mlirTypeIsAFloat8E4M3FNUZ;
   x += (uintptr_t)&mlirTypeIsAFloat8E5M2;
diff --git a/compiler/src/iree/compiler/API/api_exports.def b/compiler/src/iree/compiler/API/api_exports.def
index 06e7d0f..95429a1 100644
--- a/compiler/src/iree/compiler/API/api_exports.def
+++ b/compiler/src/iree/compiler/API/api_exports.def
@@ -297,6 +297,7 @@
   mlirF64TypeGet
   mlirFlatSymbolRefAttrGet
   mlirFlatSymbolRefAttrGetValue
+  mlirFloat8E4M3B11FNUZTypeGet
   mlirFloat8E4M3FNTypeGet
   mlirFloat8E4M3FNUZTypeGet
   mlirFloat8E5M2FNUZTypeGet
@@ -532,6 +533,7 @@
   mlirTypeIsAF16
   mlirTypeIsAF32
   mlirTypeIsAF64
+  mlirTypeIsAFloat8E4M3B11FNUZ
   mlirTypeIsAFloat8E4M3FN
   mlirTypeIsAFloat8E4M3FNUZ
   mlirTypeIsAFloat8E5M2
diff --git a/compiler/src/iree/compiler/API/api_exports.ld b/compiler/src/iree/compiler/API/api_exports.ld
index d96c51a..a1766ef 100644
--- a/compiler/src/iree/compiler/API/api_exports.ld
+++ b/compiler/src/iree/compiler/API/api_exports.ld
@@ -298,6 +298,7 @@
     mlirF64TypeGet;
     mlirFlatSymbolRefAttrGet;
     mlirFlatSymbolRefAttrGetValue;
+    mlirFloat8E4M3B11FNUZTypeGet;
     mlirFloat8E4M3FNTypeGet;
     mlirFloat8E4M3FNUZTypeGet;
     mlirFloat8E5M2FNUZTypeGet;
@@ -533,6 +534,7 @@
     mlirTypeIsAF16;
     mlirTypeIsAF32;
     mlirTypeIsAF64;
+    mlirTypeIsAFloat8E4M3B11FNUZ;
     mlirTypeIsAFloat8E4M3FN;
     mlirTypeIsAFloat8E4M3FNUZ;
     mlirTypeIsAFloat8E5M2;
diff --git a/compiler/src/iree/compiler/API/api_exports.macos.lst b/compiler/src/iree/compiler/API/api_exports.macos.lst
index 4f0b1f7..81d1668 100644
--- a/compiler/src/iree/compiler/API/api_exports.macos.lst
+++ b/compiler/src/iree/compiler/API/api_exports.macos.lst
@@ -296,6 +296,7 @@
 _mlirF64TypeGet
 _mlirFlatSymbolRefAttrGet
 _mlirFlatSymbolRefAttrGetValue
+_mlirFloat8E4M3B11FNUZTypeGet
 _mlirFloat8E4M3FNTypeGet
 _mlirFloat8E4M3FNUZTypeGet
 _mlirFloat8E5M2FNUZTypeGet
@@ -531,6 +532,7 @@
 _mlirTypeIsAF16
 _mlirTypeIsAF32
 _mlirTypeIsAF64
+_mlirTypeIsAFloat8E4M3B11FNUZ
 _mlirTypeIsAFloat8E4M3FN
 _mlirTypeIsAFloat8E4M3FNUZ
 _mlirTypeIsAFloat8E5M2
diff --git a/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp b/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp
index 3f44d4f..eb040ac 100644
--- a/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/FoldAffineMinInDistributedLoops.cpp
@@ -143,8 +143,8 @@
       unsigned index = idOp.getDimension().getZExtValue();
       if (index >= numWorkgroup.size()) return failure();
       constraints.appendDimVar({idOp});
-      constraints.addBound(FlatAffineRelation::BoundType::LB, idOp, 0);
-      constraints.addBound(FlatAffineRelation::BoundType::UB, idOp,
+      constraints.addBound(presburger::BoundType::LB, idOp, 0);
+      constraints.addBound(presburger::BoundType::UB, idOp,
                            numWorkgroup[index] - 1);
     }
     return canonicalizeMinMaxOp(rewriter, minOp, constraints);
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp
index fda438f..05f59da 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPUVectorization.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
index 1897a6a..b73a182 100644
--- a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
@@ -112,7 +112,8 @@
     // TODO(thomasraoux): Remove it once the fix is merged.
     loopInvariantCodeMotion(funcOp);
     linalg::hoistRedundantVectorTransfers(funcOp);
-    vector::transferOpflowOpt(funcOp);
+    IRRewriter rewriter(funcOp->getContext());
+    vector::transferOpflowOpt(rewriter, funcOp);
 
     // Move bitcast inwards from loop region boundaries to increase chances to
     // cancel them.
diff --git a/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp b/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp
index bc3b741..177d268 100644
--- a/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/SplitFullPartialTransferPass.cpp
@@ -8,6 +8,7 @@
 
 #include "iree/compiler/Codegen/PassDetail.h"
 #include "iree/compiler/Codegen/Passes.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Pass/Pass.h"
@@ -36,7 +37,7 @@
             .Case("vector-transfers",
                   vector::VectorTransferSplit::VectorTransfer)
             .Default(vector::VectorTransferSplit::None));
-    patterns.add<vector::VectorTransferFullPartialRewriter>(ctx, options);
+    populateVectorTransferFullPartialPatterns(patterns, options);
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(patterns)))) {
       return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
index e0c347c..d38ce0f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -39,6 +39,7 @@
 #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
 #include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/Passes.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/Pass/PassManager.h"
@@ -85,14 +86,15 @@
 
 // Track temporary allocations that are never read from. If this is the case
 // it means both the allocations and associated stores can be removed.
-static void eraseDeadAllocAndStores(Operation *parentOp) {
+static void eraseDeadAllocAndStores(RewriterBase &rewriter,
+                                    Operation *parentOp) {
   std::vector<Operation *> opToErase;
   parentOp->walk([&](memref::AllocOp op) {
     if (allUsesAreStores(op, opToErase)) {
       opToErase.push_back(op.getOperation());
     }
   });
-  for (Operation *op : opToErase) op->erase();
+  for (Operation *op : opToErase) rewriter.eraseOp(op);
 }
 
 //===---------------------------------------------------------------------===//
@@ -103,9 +105,12 @@
     Operation *target, transform::ApplyToEachResultList &results,
     transform::TransformState &state) {
   // Apply store to load forwarding and dead store elimination.
-  vector::transferOpflowOpt(target);
-  eraseDeadAllocAndStores(target);
-  return DiagnosedSilenceableFailure::success();
+  IRRewriter rewriter(target->getContext());
+  TrackingListener listener(state);
+  rewriter.setListener(&listener);
+  vector::transferOpflowOpt(rewriter, target);
+  eraseDeadAllocAndStores(rewriter, target);
+  return listener.check(target->getLoc());
 }
 
 void transform_dialect::ApplyBufferOptimizationsOp::getEffects(
diff --git a/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp b/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp
index 32ba18d..fe6e242 100644
--- a/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/VectorReductionToGPU.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
diff --git a/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp b/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp
index 4c7c9d7..624fcc3 100644
--- a/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/VectorizePackUnPackOps.cpp
@@ -15,6 +15,8 @@
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir b/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
index bb21463..2cc5291 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
@@ -7,13 +7,13 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 336)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @load_subspan_with_offset
 // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 //  CHECK-DAG:   %[[ZERO:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
 //      CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 //      CHECK:   %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]]
 //      CHECK:   return %[[LOAD]]
 
@@ -26,13 +26,13 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 24)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @store_subspan_with_offset
 // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 //  CHECK-DAG:   %[[ZERO:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]
 //      CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 //      CHECK:   memref.store %[[VALUE]], %[[SUBSPAN]][%[[INDEX]]] : memref<?xf32>
 
 // -----
@@ -43,7 +43,7 @@
   return %val: vector<4xf32>
 }
 
-//      CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 16)>
+//      CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 16)>
 //CHECK-LABEL: func.func @load_subspan_with_vector_element
 //      CHECK:   affine.apply #[[$MAP]]()
 
@@ -55,7 +55,7 @@
   return %val: f16
 }
 
-//      CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 2)>
+//      CHECK: #[[$MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 2)>
 //CHECK-LABEL: func.func @load_subspan_with_16bit_element
 //      CHECK:   affine.apply #[[$MAP]]()
 
@@ -69,14 +69,14 @@
 }
 
 //      CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 12 + s1 floordiv 4)
-//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @store_subspan_with_leading_dynamic_dim
 // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 //      CHECK:   %[[C0:.+]] = arith.constant 0 : index
 //      CHECK:   %[[DIM:.+]] = hal.interface.constant.load[0] : index
 //      CHECK:   %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM]], %[[OFFSET]]]
 //      CHECK:   %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 //      CHECK:   memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] : memref<?xf32>
 
 // -----
@@ -93,7 +93,7 @@
 }
 
 //      CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (((s0 * s1) * s2) * s3 + s4 floordiv 4)>
-//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7] -> (s1 + s2 floordiv 4 + (s4 + (s7 + s5 * s6) * s3) * s0)>
+//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5, s6, s7] -> (s0 floordiv 4 + s2 + (s4 + (s7 + s5 * s6) * s3) * s1)>
 //CHECK-LABEL: func.func @store_subspan_with_all_dynamic_dim
 // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index)
 //      CHECK:   %[[C0:.+]] = arith.constant 0 : index
@@ -103,7 +103,7 @@
 //      CHECK:   %[[DIM3:.+]] = hal.interface.constant.load[3] : index
 //      CHECK:   %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM0]], %[[DIM1]], %[[DIM2]], %[[DIM3]], %[[OFFSET]]]
 //      CHECK:   %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[DIM3]], %[[I3]], %[[OFFSET]], %[[DIM2]], %[[I2]], %[[I0]], %[[DIM1]], %[[I1]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[DIM3]], %[[I3]], %[[DIM2]], %[[I2]], %[[I0]], %[[DIM1]], %[[I1]]]
 //      CHECK:   memref.store %[[VALUE]], %[[DST]][%[[INDEX]]]
 
 // -----
@@ -117,7 +117,7 @@
 }
 
 //      CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1, s2] -> ((s0 * s1) * 32 + s2 floordiv 4)>
-//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0 + s1 floordiv 4 + s3 * 8 + ((s4 * 4 + s5) * s2) * 8)>
+//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0 floordiv 4 + s1 + s3 * 8 + ((s4 * 4 + s5) * s2) * 8)>
 //CHECK-LABEL: func.func @store_subspan_with_mixed_dynamic_dim
 // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index)
 //      CHECK:   %[[C0:.+]] = arith.constant 0 : index
@@ -125,7 +125,7 @@
 //      CHECK:   %[[DIM2:.+]] = hal.interface.constant.load[1] : index
 //      CHECK:   %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM0]], %[[DIM2]], %[[OFFSET]]]
 //      CHECK:   %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[I3]], %[[OFFSET]], %[[DIM2]], %[[I2]], %[[I0]], %[[I1]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I3]], %[[DIM2]], %[[I2]], %[[I0]], %[[I1]]]
 //      CHECK:   memref.store %[[VALUE]], %[[DST]][%[[INDEX]]]
 
 // -----
@@ -140,7 +140,7 @@
 }
 
 //      CHECK: #[[$SIZE_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 12 + s1 floordiv 4)
-//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+//      CHECK: #[[$OFFSET_MAP:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @store_subspan_with_flow_control
 // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 //      CHECK:   %[[C0:.+]] = arith.constant 0 : index
@@ -148,7 +148,7 @@
 //      CHECK:   %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM]], %[[OFFSET]]]
 //      CHECK:   %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
 //      CHECK: scf.for
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 //      CHECK:   memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] : memref<?xf32>
 
 // -----
@@ -221,7 +221,7 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] =  affine_map<()[s0] -> (s0 floordiv 4 + 336)>
-//  CHECK-DAG: #[[$MAP1:.+]] =  affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] =  affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @transfer_read_subspan_with_offset
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
@@ -230,7 +230,7 @@
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]]
 //      CHECK:   %[[MEMREF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG0]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]]
 //      CHECK:   %[[VEC:.+]] = vector.transfer_read %[[MEMREF]][%[[INDEX]]]
 //      CHECK:   return %[[VEC]]
 
@@ -244,7 +244,7 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] =  affine_map<()[s0] -> (s0 floordiv 4 + 336)>
-//  CHECK-DAG: #[[$MAP1:.+]] =  affine_map<()[s0, s1, s2, s3] -> (s0 * 56 + s1 * 8 + s2 + s3 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] =  affine_map<()[s0, s1, s2, s3] -> (s1 * 56 + s2 * 8 + s3 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @transfer_write_subspan_with_offset
 // CHECK-SAME:   %[[ARG0:[a-zA-Z0-9_]+]]: index
 // CHECK-SAME:   %[[ARG1:[a-zA-Z0-9_]+]]: index
@@ -254,7 +254,7 @@
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]]
 //      CHECK:   %[[MEMREF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG0]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]]
 //      CHECK:   vector.transfer_write %[[ARG4]], %[[MEMREF]][%[[INDEX]]]
 
 // -----
@@ -333,13 +333,13 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 840)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 42 + s1 + s2 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 42 + s2 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @collapse_shape
 // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
 //      CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
 //      CHECK:   memref.load %[[SUBSPAN]][%[[INDEX]]]
 
 // -----
@@ -352,13 +352,13 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 840)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s0 * 210 + s1 * 42 + s2 * 7 + s3 + s4 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3, s4] -> (s1 * 210 + s2 * 42 + s3 * 7 + s4 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @expand_shape
 // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index)
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
 //      CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[I3]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]], %[[I3]]]
 //      CHECK:   memref.load %[[SUBSPAN]][%[[INDEX]]]
 
 // -----
@@ -371,13 +371,13 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 128)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 128 + s1 + s2 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 128 + s2 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @expand_shape2
 // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
 //      CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
 //      CHECK:   memref.load %[[SUBSPAN]][%[[INDEX]]]
 
 // -----
@@ -413,13 +413,13 @@
 }
 
 //  CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 4096)>
-//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 128 + s1 + s2 floordiv 4)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 128 + s2 + s0 floordiv 4)>
 //CHECK-LABEL: func.func @subview
 // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
 //  CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 //  CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
 //      CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref<?xf32>{%[[SIZE]]}
-//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
 //      CHECK:   memref.load %[[SUBSPAN]][%[[INDEX]]]
 
 // -----
@@ -462,13 +462,13 @@
 }
 
 //   CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 floordiv 2 + 1024)>
-//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 32 + s1 + s2 floordiv 2)>
+//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 32 + s2 + s0 floordiv 2)>
 // CHECK-LABEL: func.func @subgroup_mma_load_with_offset
 //  CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
 //   CHECK-DAG:   %[[ZERO:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]]
 //       CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf16, 3>{%[[SIZE]]}
-//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
 //       CHECK:   %[[LD:.+]] = gpu.subgroup_mma_load_matrix %[[SUBSPAN]][%[[INDEX]]] {leadDimension = 32 : index}
 //       CHECK:   return %[[LD]]
 
@@ -481,13 +481,13 @@
 }
 
 //   CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 floordiv 2 + 1024)>
-//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s0 * 32 + s1 + s2 floordiv 2)>
+//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 32 + s2 + s0 floordiv 2)>
 // CHECK-LABEL: func.func @subgroup_mma_store_with_offset
 //  CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[VAL:.+]]: !gpu.mma_matrix<16x16xf16, "COp">
 //   CHECK-DAG:   %[[ZERO:.+]] = arith.constant 0 : index
 //   CHECK-DAG:   %[[SIZE:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]]
 //       CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf16, 3>{%[[SIZE]]}
-//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[I0]], %[[I1]], %[[OFFSET]]]
+//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
 //       CHECK:   gpu.subgroup_mma_store_matrix %[[VAL]], %[[SUBSPAN]][%[[INDEX]]] {leadDimension = 128 : index}
 
 // -----
@@ -498,12 +498,12 @@
   return %val: i32
 }
 
-//       CHECK: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+//       CHECK: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
 // CHECK-LABEL: func.func @load_uniform_buffer
 //  CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 //       CHECK:   %[[C0:.+]] = arith.constant 0 : index
 //       CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%[[C0]]) : memref<?xi32, #hal.descriptor_type<uniform_buffer>>
-//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 //       CHECK:   %[[LD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]] : memref<?xi32, #hal.descriptor_type<uniform_buffer>>
 //       CHECK:   return %[[LD]] : i32
 
@@ -517,11 +517,11 @@
 }
 
 //   CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 floordiv 4 + 24)>
-//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s0 * 12 + s1 * 4 + s2 + s3 floordiv 4)>
+//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2, s3] -> (s1 * 12 + s2 * 4 + s3 + s0 floordiv 4)>
 // CHECK-LABEL: func.func @store_uniform_buffer
 //  CHECK-SAME: (%[[VAL:.+]]: i32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 //       CHECK:   %[[C0:.+]] = arith.constant 0 : index
 //       CHECK:   %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
 //       CHECK:   %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%[[C0]]) : memref<?xi32, #hal.descriptor_type<uniform_buffer>>{%[[SIZE]]}
-//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[I0]], %[[I1]], %[[I2]], %[[OFFSET]]]
+//       CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 //       CHECK:   memref.store %[[VAL]], %[[SUBSPAN]][%[[INDEX]]] : memref<?xi32, #hal.descriptor_type<uniform_buffer>>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
index fbd1f2d..fd029f3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -48,6 +48,8 @@
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
@@ -821,12 +823,18 @@
     RewritePatternSet patterns(&getContext());
     vector::populateVectorToVectorCanonicalizationPatterns(patterns);
     vector::populateVectorBroadcastLoweringPatterns(patterns);
-    vector::populateVectorContractLoweringPatterns(patterns);
+    // TODO: doubtful that the "default" does what one wants here; it is
+    // likely better to use outerproduct.
+    vector::populateVectorContractLoweringPatterns(
+        patterns, vector::VectorTransformsOptions());
     vector::populateVectorMaskMaterializationPatterns(
         patterns, /*force32BitVectorIndices=*/false);
     vector::populateVectorMaskOpLoweringPatterns(patterns);
     vector::populateVectorShapeCastLoweringPatterns(patterns);
-    vector::populateVectorTransposeLoweringPatterns(patterns);
+    // TODO: doubtful that the "default" does what one wants here; it is
+    // likely better to use shuffle.
+    vector::populateVectorTransposeLoweringPatterns(
+        patterns, vector::VectorTransformsOptions());
     populateConvertArmNeon2dToIntrPatterns(patterns);
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(patterns)))) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp
index 3cfbef6..ac2d324 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUMmt4dVectorLowering.cpp
@@ -10,6 +10,8 @@
 #include "iree/compiler/Codegen/Utils/MarkerUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -134,10 +136,9 @@
         vector::VectorTransformsOptions().setVectorTransformsOptions(
             vector::VectorContractLowering::OuterProduct);
     RewritePatternSet vectorContractLoweringPatterns(&getContext());
-    vectorContractLoweringPatterns.insert<
-        vector::ContractionOpToOuterProductOpLowering,
-        vector::ContractionOpToMatmulOpLowering, vector::ContractionOpLowering>(
-        vectorTransformsOptions, context);
+    vector::populateVectorContractLoweringPatterns(
+        vectorContractLoweringPatterns, vectorTransformsOptions, /*benefit=*/1,
+        /*disableOuterProductLowering=*/true);
     vector::populateVectorTransferPermutationMapLoweringPatterns(
         vectorContractLoweringPatterns);
     if (failed(applyPatternsAndFoldGreedily(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
index e4921de..8ae73b2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
@@ -27,6 +27,7 @@
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 
@@ -112,7 +113,10 @@
               vector::VectorContractLowering::OuterProduct));
       vector::populateVectorMaskOpLoweringPatterns(patterns);
       vector::populateVectorShapeCastLoweringPatterns(patterns);
-      vector::populateVectorTransposeLoweringPatterns(patterns);
+      // TODO: doubtful that the "default" does what one wants here; it is
+      // likely better to use something else.
+      vector::populateVectorTransposeLoweringPatterns(
+          patterns, vector::VectorTransformsOptions());
       vector::populateVectorTransferLoweringPatterns(patterns);
       if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns)))) {
         return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index 5ffe476..abec204 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -25,6 +25,7 @@
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
 
@@ -78,7 +79,10 @@
               vector::VectorContractLowering::OuterProduct));
       vector::populateVectorMaskOpLoweringPatterns(patterns);
       vector::populateVectorShapeCastLoweringPatterns(patterns);
-      mlir::vector::populateVectorTransposeLoweringPatterns(patterns);
+      // TODO: doubtful that the "default" does what one wants here; it is
+      // likely better to use something else.
+      vector::populateVectorTransposeLoweringPatterns(
+          patterns, vector::VectorTransformsOptions());
       vector::populateVectorTransferLoweringPatterns(patterns);
       if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns)))) {
         return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
index edfff5e..461cb2a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Conversion/VectorToGPU/VectorToGPU.h"
 #include "mlir/Dialect/NVGPU/Utils/MMAUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
index 9800cd8..615f3c6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
@@ -8,6 +8,7 @@
 #include "iree/compiler/Codegen/Passes.h"
 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index f26fc59..295c97d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -28,14 +28,9 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
-#include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Region.h"
-#include "mlir/IR/Visitors.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 using llvm::dbgs;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
index 24ffbf1..1883352 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir
@@ -42,20 +42,20 @@
 //       CHECK:  %[[D5:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32>
 //       CHECK:  memref.assume_alignment %[[D5]], 64 : memref<4096x4096xf32>
 //       CHECK:  gpu.barrier
-//       CHECK:  %[[D6:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-//       CHECK:  %[[D7:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:  %[[D8:.*]] = vector.transfer_read %[[D4]]{{\[}}%[[D6]], %[[D7]]], %[[CST]] {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<1x4xf32>
-//       CHECK:  %[[D9:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-//       CHECK:  %[[D10:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:  vector.transfer_write %[[D8]], %[[D3]]{{\[}}%[[D9]], %[[D10]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
+//       CHECK:  %[[D6:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:  %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:  %[[D8:.*]] = vector.transfer_read %[[D4]][%[[D6]], %[[D7]]], %[[CST]] {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<1x4xf32>
+//       CHECK:  %[[D9:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:  %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:  vector.transfer_write %[[D8]], %[[D3]][%[[D9]], %[[D10]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
 //       CHECK:  gpu.barrier
-//       CHECK:  %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:  %[[D12:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[D11]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+//       CHECK:  %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:  %[[D12:.*]] = vector.transfer_read %[[D3]][%[[D11]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
 //       CHECK:  %[[D13:.*]] = vector.shape_cast %[[D12]] : vector<4x1xf32> to vector<1x4xf32>
 //       CHECK:  %[[D14:.*]] = vector.extract %[[D13]][0] : vector<1x4xf32>
-//       CHECK:  %[[D15:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-//       CHECK:  %[[D16:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:  vector.transfer_write %[[D14]], %[[D5]]{{\[}}%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<4096x4096xf32>
+//       CHECK:  %[[D15:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+//       CHECK:  %[[D16:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:  vector.transfer_write %[[D14]], %[[D5]][%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<4096x4096xf32>
 
 // -----
 
@@ -107,22 +107,22 @@
 //       CHECK:  %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<768x2048xf32>
 //       CHECK:  memref.assume_alignment %[[D6]], 64 : memref<768x2048xf32>
 //       CHECK:  gpu.barrier
-//       CHECK:  %[[D7:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-//       CHECK:  %[[D8:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:  %[[D9:.*]] = vector.transfer_read %[[D4]]{{\[}}%[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true]} : memref<2048x768xf32>, vector<1x4xf32>
-//       CHECK:  %[[D10:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-//       CHECK:  %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:  vector.transfer_write %[[D9]], %[[D3]]{{\[}}%[[D10]], %[[D11]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
+//       CHECK:  %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:  %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:  %[[D9:.*]] = vector.transfer_read %[[D4]][%[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true]} : memref<2048x768xf32>, vector<1x4xf32>
+//       CHECK:  %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:  %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:  vector.transfer_write %[[D9]], %[[D3]][%[[D10]], %[[D11]]] {in_bounds = [true, true]} : vector<1x4xf32>, memref<32x33xf32, #gpu.address_space<workgroup>>
 //       CHECK:  gpu.barrier
-//       CHECK:  %[[D12:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:  %[[D13:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+//       CHECK:  %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:  %[[D13:.*]] = vector.transfer_read %[[D3]][%[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
 //       CHECK:  %[[D14:.*]] = vector.shape_cast %[[D13]] : vector<4x1xf32> to vector<1x4xf32>
-//       CHECK:  %[[D15:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-//       CHECK:  %[[D16:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:  %[[D17:.*]] = vector.transfer_read %[[D5]]{{\[}}%[[D15]], %[[D16]]], %[[CST]] {in_bounds = [true]} : memref<768x2048xf32>, vector<4xf32>
+//       CHECK:  %[[D15:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+//       CHECK:  %[[D16:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:  %[[D17:.*]] = vector.transfer_read %[[D5]][%[[D15]], %[[D16]]], %[[CST]] {in_bounds = [true]} : memref<768x2048xf32>, vector<4xf32>
 //       CHECK:  %[[D18:.*]] = vector.extract %[[D14]][0] : vector<1x4xf32>
 //       CHECK:  %[[D19:.*]] = arith.addf %[[D18]], %[[D17]] : vector<4xf32>
-//       CHECK:  vector.transfer_write %[[D19]], %[[D6]]{{\[}}%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<768x2048xf32>
+//       CHECK:  vector.transfer_write %[[D19]], %[[D6]][%[[D15]], %[[D16]]] {in_bounds = [true]} : vector<4xf32>, memref<768x2048xf32>
 
 // -----
 
@@ -213,23 +213,23 @@
 //       CHECK:   %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32>
 //       CHECK:   memref.assume_alignment %[[D6]], 64 : memref<10x768x2048xf32>
 //       CHECK:   gpu.barrier
-//       CHECK:   %[[D7:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-//       CHECK:   %[[D8:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:   %[[D9:.*]] = vector.transfer_read %[[D4]]{{\[}}%{{.*}}, %[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x2048x768xf32>, vector<1x1x4xf32>
-//       CHECK:   %[[D10:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-//       CHECK:   %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:   vector.transfer_write %[[D9]], %[[D3]]{{\[}}%[[C0]], %[[D10]], %[[D11]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
+//       CHECK:   %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:   %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:   %[[D9:.*]] = vector.transfer_read %[[D4]][%{{.*}}, %[[D7]], %[[D8]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x2048x768xf32>, vector<1x1x4xf32>
+//       CHECK:   %[[D10:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:   %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:   vector.transfer_write %[[D9]], %[[D3]][%[[C0]], %[[D10]], %[[D11]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
 //       CHECK:   gpu.barrier
-//       CHECK:   %[[D12:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:   %[[D13:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[C0]], %[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+//       CHECK:   %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:   %[[D13:.*]] = vector.transfer_read %[[D3]][%[[C0]], %[[D12]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
 //       CHECK:   %[[D14:.*]] = vector.broadcast %[[D13]] : vector<4x1xf32> to vector<1x4x1xf32>
 //       CHECK:   %[[D15:.*]] = vector.shape_cast %[[D14]] : vector<1x4x1xf32> to vector<1x1x4xf32>
-//       CHECK:   %[[D16:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-//       CHECK:   %[[D17:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:   %[[D18:.*]] = vector.transfer_read %[[D5]]{{\[}}%{{.*}}, %[[D16]], %[[D17]]], %[[CST]] {in_bounds = [true]} : memref<10x768x2048xf32>, vector<4xf32>
+//       CHECK:   %[[D16:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+//       CHECK:   %[[D17:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:   %[[D18:.*]] = vector.transfer_read %[[D5]][%{{.*}}, %[[D16]], %[[D17]]], %[[CST]] {in_bounds = [true]} : memref<10x768x2048xf32>, vector<4xf32>
 //       CHECK:   %[[D19:.*]] = vector.extract %[[D15]][0, 0] : vector<1x1x4xf32>
 //       CHECK:   %[[D20:.*]] = arith.addf %[[D19]], %[[D18]] : vector<4xf32>
-//       CHECK:   vector.transfer_write %[[D20]], %[[D6]]{{\[}}%{{.*}}, %[[D16]], %[[D17]]] {in_bounds = [true]} : vector<4xf32>, memref<10x768x2048xf32>
+//       CHECK:   vector.transfer_write %[[D20]], %[[D6]][%{{.*}}, %[[D16]], %[[D17]]] {in_bounds = [true]} : vector<4xf32>, memref<10x768x2048xf32>
 
 // -----
 
@@ -281,25 +281,25 @@
 //       CHECK:   %[[D7:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32>
 //       CHECK:   memref.assume_alignment %[[D7]], 64 : memref<10x2048x768xf32>
 //       CHECK:   gpu.barrier
-//       CHECK:   %[[D8:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]], %{{.*}}]
-//       CHECK:   %[[D9:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:   %[[D10:.*]] = vector.transfer_read %[[D5]]{{\[}}%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
-//       CHECK:   %[[D11:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %[[D1]], %[[D2]]]
-//       CHECK:   %[[D12:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:   vector.transfer_write %[[D10]], %[[D4]]{{\[}}%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
-//       CHECK:   %[[D13:.*]] = vector.transfer_read %[[D6]]{{\[}}%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
-//       CHECK:   vector.transfer_write %[[D13]], %[[D3]]{{\[}}%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
+//       CHECK:   %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:   %[[D9:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:   %[[D10:.*]] = vector.transfer_read %[[D5]][%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
+//       CHECK:   %[[D11:.*]] = affine.apply #{{.*}}()[%[[D0]], %[[D1]], %[[D2]]]
+//       CHECK:   %[[D12:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:   vector.transfer_write %[[D10]], %[[D4]][%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
+//       CHECK:   %[[D13:.*]] = vector.transfer_read %[[D6]][%{{.*}}, %[[D8]], %[[D9]]], %[[CST]] {in_bounds = [true, true, true]} : memref<10x768x2048xf32>, vector<1x1x4xf32>
+//       CHECK:   vector.transfer_write %[[D13]], %[[D3]][%[[C0]], %[[D11]], %[[D12]]] {in_bounds = [true, true, true]} : vector<1x1x4xf32>, memref<1x32x33xf32, #gpu.address_space<workgroup>>
 //       CHECK:   gpu.barrier
-//       CHECK:   %[[D14:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]]]
-//       CHECK:   %[[D15:.*]] = vector.transfer_read %[[D4]]{{\[}}%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
-//       CHECK:   %[[D16:.*]] = vector.transfer_read %[[D3]]{{\[}}%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+//       CHECK:   %[[D14:.*]] = affine.apply #{{.*}}()[%[[D0]]]
+//       CHECK:   %[[D15:.*]] = vector.transfer_read %[[D4]][%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
+//       CHECK:   %[[D16:.*]] = vector.transfer_read %[[D3]][%[[C0]], %[[D14]], %[[D1]]], %[[CST]] {in_bounds = [true, true]} : memref<1x32x33xf32, #gpu.address_space<workgroup>>, vector<4x1xf32>
 //       CHECK:   %[[D17:.*]] = arith.addf %[[D15]], %[[D16]] : vector<4x1xf32>
 //       CHECK:   %[[D18:.*]] = vector.broadcast %[[D17]] : vector<4x1xf32> to vector<1x4x1xf32>
 //       CHECK:   %[[D19:.*]] = vector.shape_cast %[[D18]] : vector<1x4x1xf32> to vector<1x1x4xf32>
 //       CHECK:   %[[D20:.*]] = vector.extract %[[D19]][0, 0] : vector<1x1x4xf32>
-//       CHECK:   %[[D21:.*]] = affine.apply #{{.*}}(){{\[}}%{{.*}}, %[[D1]]]
-//       CHECK:   %[[D22:.*]] = affine.apply #{{.*}}(){{\[}}%[[D0]], %{{.*}}]
-//       CHECK:   vector.transfer_write %[[D20]], %[[D7]]{{\[}}%{{.*}}, %[[D21]], %[[D22]]] {in_bounds = [true]} : vector<4xf32>, memref<10x2048x768xf32>
+//       CHECK:   %[[D21:.*]] = affine.apply #{{.*}}()[%[[D1]], %{{.*}}]
+//       CHECK:   %[[D22:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]]]
+//       CHECK:   vector.transfer_write %[[D20]], %[[D7]][%{{.*}}, %[[D21]], %[[D22]]] {in_bounds = [true]} : vector<4xf32>, memref<10x2048x768xf32>
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
index 70497b6..4c7755a 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVTileAndVectorizeToCooperativeOps.cpp
@@ -37,6 +37,7 @@
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Interfaces/VectorInterfaces.h"
 #include "mlir/Pass/Pass.h"
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
index 66997da..7a16383 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
@@ -27,6 +27,7 @@
 #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/IR/Matchers.h"
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
index bbf82d7..42c00a2 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
@@ -64,8 +64,8 @@
           outs(%17 : tensor<256x1024xf16>) {
         ^bb0(%arg2: f16, %arg3: f16, %arg4: f16, %arg5: f16):
           %28 = arith.divf %arg2, %arg3 : f16
-          // spirv.GL.Exp is not permitted to use cooperative matrix types per the spec.
-          %29 = math.exp %28 : f16
+          // spirv.GL.FAbs is not permitted to use cooperative matrix types per the spec.
+          %29 = math.absf %28 : f16
           linalg.yield %29 : f16
         } -> tensor<256x1024xf16>
         flow.dispatch.tensor.store %27, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
@@ -189,19 +189,19 @@
 //         CHECK:     spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
 //         CHECK:     spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 //         CHECK:     spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
 //         CHECK:     spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 //         CHECK:     spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
 //         CHECK:     spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 //         CHECK:     spirv.Load "StorageBuffer" %{{.+}} : vector<4xf32>
 //         CHECK:     spirv.Load "Workgroup" %{{.+}} : vector<4xf32>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 //         CHECK:     spirv.ControlBarrier <Workgroup>, <Workgroup>, <AcquireRelease|WorkgroupMemory>
 
 // -----
@@ -577,8 +577,8 @@
           outs(%17 : tensor<256x1024xf16>) {
         ^bb0(%arg2: f16, %arg3: f16, %arg4: f16, %arg5: f16):
           %28 = arith.divf %arg2, %arg3 : f16
-          // spirv.GL.Exp is not permitted to use cooperative matrix types per the spec.
-          %29 = math.exp %28 : f16
+          // spirv.GL.FAbs is not permitted to use cooperative matrix types per the spec.
+          %29 = math.absf %28 : f16
           linalg.yield %29 : f16
         } -> tensor<256x1024xf16>
         flow.dispatch.tensor.store %27, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x1024xf16>>
@@ -639,13 +639,13 @@
 
 //         CHECK:     spirv.ControlBarrier <Workgroup>, <Workgroup>, <AcquireRelease|WorkgroupMemory>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 // CHECK-COUNT-2:     spirv.FDiv %{{.+}}, %{{.+}} : vector<4xf16>
-// CHECK-COUNT-2:     spirv.GL.Exp %{{.+}} : vector<4xf16>
+// CHECK-COUNT-2:     spirv.GL.FAbs %{{.+}} : vector<4xf16>
 //         CHECK:     spirv.ControlBarrier <Workgroup>, <Workgroup>, <AcquireRelease|WorkgroupMemory>
 
 // -----
diff --git a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp
index 3689722..9fb1220 100644
--- a/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformDialectStrategies/CPU/Common.cpp
@@ -19,6 +19,8 @@
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 
@@ -33,20 +35,56 @@
 using iree_compiler::cpu::ReductionStrategy;
 using iree_compiler::IREE::transform_dialect::ApplyPatternsOpPatterns;
 using iree_compiler::IREE::transform_dialect::ForallToWorkgroupOp;
-using transform::LowerVectorsOp;
+using transform::ApplyTransferPermutationPatternsOp;
+using transform::LowerContractionOp;
+using transform::LowerMultiReductionOp;
+using transform::LowerShapeCastOp;
+using transform::LowerTransferOp;
+using transform::LowerTransposeOp;
 using transform::MatchOp;
 using transform::SplitHandlesOp;
+using transform::SplitTransferFullPartialOp;
+using transform::TransferToScfOp;
 using transform_ext::AllDims;
 using transform_ext::m_StructuredOp;
 using transform_ext::NumEqualsTo;
 using transform_ext::RegisterMatchCallbacksOp;
 using transform_ext::ShapeKind;
 using transform_ext::StructuredOpMatcher;
+using vector::VectorContractLoweringAttr;
 
 //===----------------------------------------------------------------------===//
 // Mid-level problem-specific strategy builder APIs, follow MLIR-style builders.
 //===----------------------------------------------------------------------===//
 
+// TODO: Provide better builders for composing these vector lowering steps.
+static Value buildDefaultVectorLoweringStrategy(
+    ImplicitLocOpBuilder &b, Value funcH,
+    const vector::LowerVectorsOptions &lowerVectorsOpts) {
+  auto pdlOperation = pdl::OperationType::get(b.getContext());
+  funcH = b.create<LowerContractionOp>(
+      pdlOperation, funcH,
+      /*loweringStrategy=*/lowerVectorsOpts.vectorContractLowering);
+  funcH = b.create<ApplyTransferPermutationPatternsOp>(pdlOperation, funcH);
+  funcH = b.create<LowerMultiReductionOp>(
+      pdlOperation, funcH,
+      /*loweringStrategy=*/lowerVectorsOpts.vectorMultiReductionLowering);
+  funcH = b.create<SplitTransferFullPartialOp>(
+      pdlOperation, funcH,
+      /*splitTransferStrategy=*/lowerVectorsOpts.vectorTransferSplit);
+  funcH = b.create<TransferToScfOp>(
+      pdlOperation, funcH,
+      /*maxTransferRank=*/1,
+      /*fullUnroll=*/lowerVectorsOpts.unrollVectorTransfers);
+  funcH = b.create<LowerTransferOp>(pdlOperation, funcH, /*maxTransferRank=*/1);
+  funcH = b.create<LowerShapeCastOp>(pdlOperation, funcH);
+  funcH = b.create<LowerTransposeOp>(
+      pdlOperation, funcH,
+      /*loweringStrategy=*/lowerVectorsOpts.vectorTransposeLowering,
+      /*avx2LoweringStrategy=*/lowerVectorsOpts.transposeAVX2Lowering);
+  return funcH;
+}
+
 /// Take care of the last common steps in a CPU strategy (i.e. vectorize,
 /// bufferize and map to blocks).
 /// Return the handles to the updated variant and the func::FuncOp ops under
@@ -81,9 +119,7 @@
   b.create<ForallToWorkgroupOp>(funcH);
 
   // Step N. Lower vectors.
-  // TODO: Control the lowering to vectors.
-  auto pdlOperation = pdl::OperationType::get(b.getContext());
-  funcH = b.create<LowerVectorsOp>(pdlOperation, funcH, lowerVectorsOpts);
+  funcH = buildDefaultVectorLoweringStrategy(b, funcH, lowerVectorsOpts);
   return std::make_pair(variantH, funcH);
 }
 
diff --git a/integrations/tensorflow/WORKSPACE b/integrations/tensorflow/WORKSPACE
index f97b607..6dd236e 100644
--- a/integrations/tensorflow/WORKSPACE
+++ b/integrations/tensorflow/WORKSPACE
@@ -7,7 +7,7 @@
 
 load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
 
-TENSORFLOW_COMMIT = "9104c104fe2422b1034be276f94f9b53b61751eb"
+TENSORFLOW_COMMIT = "0a8fd11e73559d9bf0e547343f08f648981e2ff7"
 
 git_repository(
     name = "org_tensorflow",
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
index c348284..33a8682 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
@@ -22,6 +22,7 @@
 #include "mlir/Dialect/Tensor/Utils/Utils.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/Passes.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/PassManager.h"
@@ -259,10 +260,10 @@
     vector::populateVectorToVectorCanonicalizationPatterns(patterns);
     // In a progressive lowering of vectors, this would be the 1st step.
     if (options.contractionLowering) {
-      patterns.add<vector::ContractionOpToOuterProductOpLowering,
-                   vector::ContractionOpToMatmulOpLowering,
-                   vector::ContractionOpLowering>(
-          options.vectorTransformOptions, context);
+      vector::populateVectorContractLoweringPatterns(
+          patterns, options.vectorTransformOptions,
+          /*benefit=*/1,
+          /*disableOuterProductLowering=*/true);
       vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
     }
     // In a progressive lowering of vectors, this would be the 2nd step.
@@ -273,8 +274,8 @@
     }
     // In a progressive lowering of vectors, this would be the 3rd step.
     if (options.transferPartialRewrite) {
-      patterns.add<vector::VectorTransferFullPartialRewriter>(
-          context, options.vectorTransformOptions);
+      populateVectorTransferFullPartialPatterns(patterns,
+                                                options.vectorTransformOptions);
     }
     // In a progressive lowering of vectors, this would be the 4th step.
     if (options.transferLowering) {
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
index 3926f9c..17a3e45 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/roundtrip.mlir
@@ -20,9 +20,11 @@
   // CHECK: transform.structured.vectorize %[[OPS2]]
   transform.structured.vectorize %5
   // CHECK: %[[FUNC:.*]] = transform.structured.match ops{["func.func"]} in %arg0
-  // CHECK: lower_vectors %[[FUNC]] {{.*}} multireduction_lowering = innerreduction
+  // CHECK: vector.lower_contraction %[[FUNC]] {{.*}}
   %6 = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  transform.vector.lower_vectors %6 multireduction_lowering = "innerreduction"
+  transform.vector.lower_contraction %6
+    lowering_strategy = "outerproduct" 
+      : (!pdl.operation) -> !pdl.operation
   // CHECK: lower_to_llvm
   lower_to_llvm
 }
diff --git a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
index f5b7a52..888a4c0 100644
--- a/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
+++ b/llvm-external-projects/iree-dialects/test/Dialect/linalg_transform/single-tiling-full-script.mlir
@@ -25,9 +25,16 @@
     {bufferize_function_boundaries = true}
   %3 = transform.structured.match ops{["func.func"]} in %module_op 
     : (!pdl.operation) -> !pdl.operation
-  transform.vector.lower_vectors %3 multireduction_lowering = "innerreduction"
 
-  // lower_to_llvm is the only remaining op not upstreamed, at the same time we
-  // upstreamed --test-lower-to-llvm.
+
+  %func = transform.structured.match ops{["func.func"]} in %module_op
+    : (!pdl.operation) -> !pdl.operation
+  %func_e_2 = transform.vector.lower_contraction %func
+    lowering_strategy = "outerproduct" 
+      : (!pdl.operation) -> !pdl.operation
+  %func_e_3 = transform.vector.lower_transpose %func_e_2
+    lowering_strategy = "shuffle" 
+      : (!pdl.operation) -> !pdl.operation
+
   lower_to_llvm
 }
diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
index 6e08a3f..139417c 100644
--- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
@@ -64,6 +64,7 @@
     : (!pdl.operation) -> (!pdl.operation)
 
   transform.structured.pack_greedily %matmul
-      gemm_packed_sizes = [8, 16, 32] gemm_inner_dims_order = [0, 1, 2]
+      matmul_packed_sizes = [8, 16, 32]
+      matmul_inner_dims_order = [0, 1, 2]
     : (!pdl.operation) -> !transform.op<"linalg.generic">
 }
diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir
index 828f2c6..8848abe 100644
--- a/tests/transform_dialect/cpu/contraction-packing.mlir
+++ b/tests/transform_dialect/cpu/contraction-packing.mlir
@@ -146,6 +146,6 @@
   // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
   // dimensions.
   transform.structured.pack_greedily %matmul
-      gemm_packed_sizes = [8, 16, 32] gemm_inner_dims_order = [0, 1, 2]
+      matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
     : (!pdl.operation) -> !transform.op<"linalg.generic">
 }
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 18f5bc5..8759c3b 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 18f5bc58159389fb1aaaa50577be4ff5397a65ae
+Subproject commit 8759c3bea5defd7c9c382904c1809ed0aa800eef
diff --git a/third_party/mlir-hlo b/third_party/mlir-hlo
index ab58786..29e975a 160000
--- a/third_party/mlir-hlo
+++ b/third_party/mlir-hlo
@@ -1 +1 @@
-Subproject commit ab587867e4aa4307c4b573d7926c3f869563eaa2
+Subproject commit 29e975a3c0a469c41fc564c36766217aef9d07a4