Integrate LLVM@5c35af8f1e6ebc7c32 (#23252)

Reverts carried forward:
* Local revert of https://github.com/llvm/llvm-project/pull/169614 due
to https://github.com/iree-org/iree/issues/22649

Other changes:
* Fixes lit tests to account for
https://github.com/llvm/llvm-project/pull/174452
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
index ac0aea5..2aea851 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
@@ -68,20 +68,16 @@
 
 // CHECK-LABEL: func @contract_to_mfma_32x32x8_mm
 // CHECK-SAME: (%[[A:.+]]: vector<32x8xf16>, %[[B:.+]]: vector<8x32xf16>, %[[C:.+]]: vector<32x32xf32>)
-// CHECK:       %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<32x32xf32> -> vector<1x1x4x1x4x1xf32>
-// CHECK:       %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<32x8xf16>  -> vector<1x1x1x1x1x4xf16>
-// CHECK:       %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<8x32xf16>  -> vector<1x1x1x1x4x1xf16>
-// CHECK:       %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<4x1x4x1xf32> from vector<1x1x4x1x4x1xf32>
-// CHECK:       %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x4xf16> from vector<1x1x1x1x1x4xf16>
-// CHECK:       %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x4x1xf16> from vector<1x1x1x1x4x1xf16>
-// CHECK:       %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
-// CHECK:       %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
-// CHECK:       %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<4x1x4x1xf32> to vector<16xf32>
+// CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<32x32xf32> -> vector<1x1x4x1x4x1xf32>
+// CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<32x8xf16>  -> vector<1x1x1x1x1x4xf16>
+// CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<8x32xf16>  -> vector<1x1x1x1x4x1xf16>
+// CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x4xf16> to vector<4xf16>
+// CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x4x1xf16> to vector<4xf16>
+// CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x4x1x4x1xf32> to vector<16xf32>
 // CHECK:       %[[MFMA:.+]] = amdgpu.mfma 32x32x8 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp =  none
 // CHECK-SAME:     : vector<4xf16>, vector<4xf16>, vector<16xf32>
-// CHECK:       %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<4x1x4x1xf32>
-// CHECK:       %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
-// CHECK:       %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
+// CHECK:       %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<1x1x4x1x4x1xf32>
+// CHECK:       %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
 // CHECK:       return %[[R_SIMD]]
 
 // -----
@@ -144,21 +140,17 @@
 
 // CHECK-LABEL: func @contract_to_mfma_16x16x16_mm
 //  CHECK-SAME: (%[[A:.+]]: vector<16x16xf16>, %[[B:.+]]: vector<16x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-//       CHECK:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x4x1xf32>
-//       CHECK:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x4xf16>
-//       CHECK:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x4x1xf16>
-//       CHECK:   %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x4x1xf32> from vector<1x1x1x1x4x1xf32>
-//       CHECK:   %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x4xf16> from vector<1x1x1x1x1x4xf16>
-//       CHECK:   %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x4x1xf16> from vector<1x1x1x1x4x1xf16>
-//       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
-//       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
-//       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x4x1xf32> to vector<4xf32>
+//   CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x4x1xf32>
+//   CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x4xf16>
+//   CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x4x1xf16>
+//   CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x4xf16> to vector<4xf16>
+//   CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x4x1xf16> to vector<4xf16>
+//   CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x4x1xf32> to vector<4xf32>
 //       CHECK:   %[[MFMA:.+]] = amdgpu.mfma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp =  none
 //  CHECK-SAME:     : vector<4xf16>, vector<4xf16>, vector<4xf32>
 
-//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]]  : vector<4xf32> to vector<1x1x4x1xf32>
-//       CHECK:   %[[B_OUT:.*]]  = vector.broadcast %[[R_CAST]] : vector<1x1x4x1xf32> to vector<1x1x1x1x4x1xf32>
-//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x4x1xf32> -> vector<16x16xf32>
+//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]]  : vector<4xf32> to vector<1x1x1x1x4x1xf32>
+//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x4x1xf32> -> vector<16x16xf32>
 //       CHECK:   return %[[R_SIMD]]
 
 // -----
@@ -573,19 +565,15 @@
 
 // CHECK-LABEL: func.func @contract_to_WMMAR3_16x16x16_mm
 //  CHECK-SAME: (%[[A:.+]]: vector<16x16xf16>, %[[B:.+]]: vector<16x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-//       CHECK:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x8x1x1x1xf32>
-//       CHECK:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x16xf16>
-//       CHECK:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x16x1xf16>
-//       CHECK:   %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<8x1x1x1xf32> from vector<1x1x8x1x1x1xf32>
-//       CHECK:   %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x16xf16> from vector<1x1x1x1x1x16xf16>
-//       CHECK:   %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x16x1xf16> from vector<1x1x1x1x16x1xf16>
-//       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x16xf16> to vector<16xf16>
-//       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x16x1xf16> to vector<16xf16>
-//       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<8x1x1x1xf32> to vector<8xf32>
+//   CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x8x1x1x1xf32>
+//   CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x16xf16>
+//   CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x16x1xf16>
+//   CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x16xf16> to vector<16xf16>
+//   CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x16x1xf16> to vector<16xf16>
+//   CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x8x1x1x1xf32> to vector<8xf32>
 //       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<8x1x1x1xf32>
-//       CHECK:   %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<8x1x1x1xf32> to vector<1x1x8x1x1x1xf32>
-//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32>
+//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1x1x1xf32>
+//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32>
 //       CHECK:   return %[[R_SIMD]]
 
 // -----
@@ -659,19 +647,15 @@
 
 // CHECK-LABEL: func.func @contract_to_WMMAR4_16x16x16_mm
 //  CHECK-SAME: (%[[A:.+]]: vector<16x16xf16>, %[[B:.+]]: vector<16x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-//       CHECK:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x8xf16>
-//       CHECK:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x8x1xf16>
-//       CHECK:   %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x8xf16> from vector<1x1x1x1x1x8xf16>
-//       CHECK:   %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x8x1xf16> from vector<1x1x1x1x8x1xf16>
-//       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x8xf16> to vector<8xf16>
-//       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x8x1xf16> to vector<8xf16>
-//       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+//   CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+//   CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x8xf16>
+//   CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x8x1xf16>
+//   CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x8xf16> to vector<8xf16>
+//   CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x8x1xf16> to vector<8xf16>
+//   CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
 //       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-//       CHECK:   %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
 //       CHECK:   return %[[R_SIMD]]
 
 // -----
@@ -744,19 +728,15 @@
 
 // CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x4_mm
 //  CHECK-SAME: (%[[A:.+]]: vector<16x4xf32>, %[[B:.+]]: vector<4x16xf32>, %[[C:.+]]: vector<16x16xf32>)
-//       CHECK:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x4xf32> -> vector<1x1x1x1x1x2xf32>
-//       CHECK:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<4x16xf32> -> vector<1x1x1x1x2x1xf32>
-//       CHECK:   %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x2xf32> from vector<1x1x1x1x1x2xf32>
-//       CHECK:   %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x2x1xf32> from vector<1x1x1x1x2x1xf32>
-//       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x2xf32> to vector<2xf32>
-//       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x2x1xf32> to vector<2xf32>
-//       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+//   CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+//   CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x4xf32> -> vector<1x1x1x1x1x2xf32>
+//   CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<4x16xf32> -> vector<1x1x1x1x2x1xf32>
+//   CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x2xf32> to vector<2xf32>
+//   CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x2x1xf32> to vector<2xf32>
+//   CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
 //       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x4 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] : vector<2xf32>, vector<2xf32>, vector<8xf32>
-//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-//       CHECK:   %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
 //       CHECK:   return %[[R_SIMD]]
 
 // -----
@@ -829,19 +809,15 @@
 
 // CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x32_mm
 //  CHECK-SAME: (%[[A:.+]]: vector<16x32xf16>, %[[B:.+]]: vector<32x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-//       CHECK:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x32xf16> -> vector<1x1x1x1x1x16xf16>
-//       CHECK:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<32x16xf16> -> vector<1x1x1x1x16x1xf16>
-//       CHECK:   %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x16xf16> from vector<1x1x1x1x1x16xf16>
-//       CHECK:   %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x16x1xf16> from vector<1x1x1x1x16x1xf16>
-//       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x16xf16> to vector<16xf16>
-//       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x16x1xf16> to vector<16xf16>
-//       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+//   CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+//   CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x32xf16> -> vector<1x1x1x1x1x16xf16>
+//   CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<32x16xf16> -> vector<1x1x1x1x16x1xf16>
+//   CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x16xf16> to vector<16xf16>
+//   CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x16x1xf16> to vector<16xf16>
+//   CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
 //       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x32 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] : vector<16xf16>, vector<16xf16>, vector<8xf32>
-//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-//       CHECK:   %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
 //       CHECK:   return %[[R_SIMD]]
 
 // -----
@@ -914,19 +890,15 @@
 
 // CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x64_mm
 //  CHECK-SAME: (%[[A:.+]]: vector<16x64xf8E4M3FN>, %[[B:.+]]: vector<64x16xf8E4M3FN>, %[[C:.+]]: vector<16x16xf32>)
-//       CHECK:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x64xf8E4M3FN> -> vector<1x1x1x1x1x32xf8E4M3FN>
-//       CHECK:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<64x16xf8E4M3FN> -> vector<1x1x1x1x32x1xf8E4M3FN>
-//       CHECK:   %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x32xf8E4M3FN> from vector<1x1x1x1x1x32xf8E4M3FN>
-//       CHECK:   %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x32x1xf8E4M3FN> from vector<1x1x1x1x32x1xf8E4M3FN>
-//       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x32xf8E4M3FN> to vector<32xf8E4M3FN>
-//       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x32x1xf8E4M3FN> to vector<32xf8E4M3FN>
-//       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+//   CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+//   CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x64xf8E4M3FN> -> vector<1x1x1x1x1x32xf8E4M3FN>
+//   CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<64x16xf8E4M3FN> -> vector<1x1x1x1x32x1xf8E4M3FN>
+//   CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x32xf8E4M3FN> to vector<32xf8E4M3FN>
+//   CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x32x1xf8E4M3FN> to vector<32xf8E4M3FN>
+//   CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
 //       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x64 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32>
-//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-//       CHECK:   %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+//       CHECK:   %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+//       CHECK:   %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
 //       CHECK:   return %[[R_SIMD]]
 
 // -----
@@ -999,15 +971,12 @@
 
 // CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x128_mm
 //  CHECK-SAME: (%[[A:.+]]: vector<16x128xf8E4M3FN>, %[[B:.+]]: vector<128x16xf8E4M3FN>, %[[C:.+]]: vector<16x16xf32>)
-//       CHECK:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x128xf8E4M3FN> -> vector<1x1x1x1x1x64xf8E4M3FN>
-//       CHECK:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<128x16xf8E4M3FN> -> vector<1x1x1x1x64x1xf8E4M3FN>
-//       CHECK:   %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-//       CHECK:   %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x64xf8E4M3FN> from vector<1x1x1x1x1x64xf8E4M3FN>
-//       CHECK:   %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x64x1xf8E4M3FN> from vector<1x1x1x1x64x1xf8E4M3FN>
-//       CHECK:   %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x64xf8E4M3FN> to vector<64xf8E4M3FN>
-//       CHECK:   %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x64x1xf8E4M3FN> to vector<64xf8E4M3FN>
-//       CHECK:   %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+//   CHECK-DAG:   %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+//   CHECK-DAG:   %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x128xf8E4M3FN> -> vector<1x1x1x1x1x64xf8E4M3FN>
+//   CHECK-DAG:   %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<128x16xf8E4M3FN> -> vector<1x1x1x1x64x1xf8E4M3FN>
+//   CHECK-DAG:   %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x64xf8E4M3FN> to vector<64xf8E4M3FN>
+//   CHECK-DAG:   %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x64x1xf8E4M3FN> to vector<64xf8E4M3FN>
+//   CHECK-DAG:   %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
 //       CHECK:   %[[WMMA:.+]] = amdgpu.wmma 16x16x128 %[[A_CAST]] * %[[B_CAST]]
 
 // -----
@@ -1085,9 +1054,9 @@
 // 3. Result of first mma becomes the second mma's accumulator.
 
 // CHECK-LABEL: func @contract_to_vmfma_32x32x16_mm
-// CHECK:       %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x8xf16> to vector<8xf16>
-// CHECK:       %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x8x1xf16> to vector<8xf16>
-// CHECK:       %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<4x1x4x1xf32> to vector<16xf32>
+// CHECK:       %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf16> to vector<8xf16>
+// CHECK:       %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x8x1xf16> to vector<8xf16>
+// CHECK:       %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1x4x1xf32> to vector<16xf32>
 // CHECK:       %[[A_SLICE_0:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK:       %[[B_SLICE_0:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK:       %[[MFMA_0:.*]] = amdgpu.mfma 32x32x8 %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]] blgp =  none
@@ -1096,9 +1065,8 @@
 // CHECK:       %[[B_SLICE_1:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK:       %[[MFMA_1:.+]] = amdgpu.mfma 32x32x8 %[[A_SLICE_1]] * %[[B_SLICE_1]] + %[[MFMA_0]] blgp =  none
 // CHECK-SAME:     : vector<4xf16>, vector<4xf16>, vector<16xf32>
-// CHECK:       %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to vector<4x1x4x1xf32>
-// CHECK:       %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
-// CHECK:       %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
+// CHECK:       %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to vector<1x1x4x1x4x1xf32>
+// CHECK:       %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
 // CHECK:       return %[[R_SIMD]]
 
 // -----
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
index 63e8382..ea58e80 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
@@ -72,14 +72,13 @@
 //      CHECK-DAG:       vector.transfer_write %[[RHS_RD]]
 //          CHECK:       gpu.barrier
 //      CHECK-DAG:       vector.transfer_read {{.*}} vector<4x1x1x2x4xf16>
-//      CHECK-DAG:       %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<4x1x1x2x4xf16> to vector<1x4x1x1x2x4xf16>
+//      CHECK-DAG:       %[[LHS_MM1:.+]] = vector.shape_cast {{.*}} vector<4x1x1x2x4xf16> to vector<1x4x1x2x1x4xf16>
 //      CHECK-DAG:       %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x4x1xf16>
-//      CHECK-DAG:       vector.transpose %[[LHS_MM1]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x1x2x4xf16> to vector<1x4x1x2x1x4xf16>
 //      CHECK-DAG:       vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x4x1xf16> to vector<2x4x1x4xf16>
 // CHECK-COUNT-32:       amdgpu.mfma 16x16x16
 //          CHECK:     %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
-//          CHECK:     %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
-//          CHECK:     vector.transfer_write %[[EXTRACT]], %[[BUF2]]
+//          CHECK:     %[[CAST:.+]] = vector.shape_cast %[[LOOP_T]] : vector<1x4x1x4x4x1xf32> to vector<4x1x4x4x1xf32>
+//          CHECK:     vector.transfer_write %[[CAST]], %[[BUF2]]
 //          CHECK:   } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
 
 // TODO(Max191): Add tests for more convolution types
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
index ae10b0a..be4f85f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
@@ -606,7 +606,7 @@
 
 // CHECK:       %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x8xf16> to vector<8xf16>
 // CHECK:       %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x8x1xf16> to vector<8xf16>
-// CHECK:       %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<4x1x4x1xf32> to vector<16xf32>
+// CHECK:       %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1x4x1xf32> to vector<16xf32>
 // CHECK:       %[[A_SLICE_0:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK:       %[[B_SLICE_0:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
 // CHECK:       %[[MFMA_0:.*]] = amdgpu.mfma 32x32x8 %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]] blgp =  none
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
index 4f55e53..15339be 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
@@ -35,8 +35,7 @@
 //  CHECK-SAME:memref<128x16x256xf32> to memref<16x8xf32, strided<[4096, 1], offset: 8964>>
 //       CHECK: vector.transfer_read %[[M]]
 //  CHECK-SAME: {in_bounds = [true, true]} : memref<16x8xf32, strided<[4096, 1], offset: 8964>>, vector<16x8xf32>
-//       CHECK: vector.broadcast %{{.*}} : vector<16x8xf32> to vector<1x16x8xf32>
-//       CHECK: vector.transpose %{{.*}} [1, 0, 2] : vector<1x16x8xf32> to vector<16x1x8xf32>
+//       CHECK: vector.shape_cast %{{.*}} : vector<16x8xf32> to vector<16x1x8xf32>
 //       CHECK: return %{{.*}} : vector<16x1x8xf32>
 
 // -----
@@ -77,8 +76,7 @@
 //  CHECK-SAME: memref<128x16x32x256xf32> to memref<16x8xf32, strided<[131072, 1], offset: 287749>>
 //       CHECK: vector.transfer_read %[[M]][%[[ID]], %[[ID]]]
 //  CHECK-SAME: {in_bounds = [true, true]} :  memref<16x8xf32, strided<[131072, 1], offset: 287749>>, vector<16x8xf32>
-//       CHECK: vector.broadcast %{{.*}} : vector<16x8xf32> to vector<1x1x16x8xf32>
-//       CHECK: vector.transpose %{{.*}} [2, 0, 1, 3] : vector<1x1x16x8xf32> to vector<16x1x1x8xf32>
+//       CHECK: vector.shape_cast %{{.*}} : vector<16x8xf32> to vector<16x1x1x8xf32>
 //       CHECK: return %{{.*}} : vector<16x1x1x8xf32>
 
 // -----
@@ -100,8 +98,7 @@
 //  CHECK-SAME: memref<128x512x32x256xf32> to memref<16x8xf32, strided<[8192, 1], offset: 8414213>>
 //       CHECK: vector.transfer_read %[[M]][%[[ID]], %[[ID]]]
 //  CHECK-SAME: {in_bounds = [true, true]} :  memref<16x8xf32, strided<[8192, 1], offset: 8414213>>, vector<16x8xf32>
-//       CHECK: vector.broadcast %{{.*}} : vector<16x8xf32> to vector<1x16x8xf32>
-//       CHECK: vector.transpose %{{.*}} [1, 0, 2] : vector<1x16x8xf32> to vector<16x1x8xf32>
+//       CHECK: vector.shape_cast %{{.*}} : vector<16x8xf32> to vector<16x1x8xf32>
 //       CHECK: return %{{.*}} : vector<16x1x8xf32>
 
 // -----
@@ -146,7 +143,7 @@
 //  CHECK-SAME: memref<128x16x32x256xf32> to memref<1x1xf32, strided<[131072, 8192], offset: 287749>>
 //       CHECK: vector.transfer_read %[[M]][%[[ID]], %[[ID]]]
 //  CHECK-SAME: {in_bounds = [true]} :  memref<1x1xf32, strided<[131072, 8192], offset: 287749>>, vector<1xf32>
-//       CHECK: vector.broadcast %{{.*}} : vector<1xf32> to vector<1x1x1x1xf32>
+//       CHECK: vector.shape_cast %{{.*}} : vector<1xf32> to vector<1x1x1x1xf32>
 //   CHECK-NOT: vector.transpose
 //       CHECK: return %{{.*}} : vector<1x1x1x1xf32>
 
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir
index 699a8ec..d419a06 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir
@@ -7,7 +7,8 @@
 // CHECK: scf.for
 // CHECK:   vector.transfer_read {{.+}} : tensor<f32>, vector<f32>
 // CHECK:   arith.mulf {{.+}} : vector<f32>
-// CHECK:   vector.broadcast {{.+}} : vector<f32> to vector<1xf32>
+// CHECK:   %[[VEC:.+]] = vector.extract {{.+}}[] : f32 from vector<f32>
+// CHECK:   vector.insert %[[VEC]], {{.+}} [0] : f32 into vector<1xf32>
 // CHECK:   vector.transfer_write {{.+}} : vector<1xf32>, tensor<32xf32>
 
 func.func @main(%0: tensor<32xf32>, %1: tensor<f32>) -> tensor<32xf32> {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
index 8b09d19..8648966 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
@@ -59,13 +59,13 @@
 //         CHECK:     %[[BROADCAST1:.+]] = vector.broadcast %[[EXTRACT1]] : f32 to vector<4xf32>
 //         CHECK:     %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[BROADCAST1]] : vector<4xf32>
 //         CHECK:     %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<4xf32>
-//         CHECK:     %[[EXTRACT2:.+]] = vector.extract %arg1[0] : vector<4xf32> from vector<1x4xf32>
-//         CHECK:     %[[ADD:.+]] = arith.addf %[[MUL1]], %[[EXTRACT2]] : vector<4xf32>
-//         CHECK:     %[[BCAST:.+]] = vector.broadcast %[[ADD]] : vector<4xf32> to vector<1x4xf32>
-//         CHECK:     scf.yield %[[BCAST]] : vector<1x4xf32>
+//         CHECK:     %[[SHAPE_CAST:.+]] = vector.shape_cast %arg1 : vector<1x4xf32> to vector<4xf32>
+//         CHECK:     %[[ADD:.+]] = arith.addf %[[MUL1]], %[[SHAPE_CAST]] : vector<4xf32>
+//         CHECK:     %[[SHAPE_CAST2:.+]] = vector.shape_cast %[[ADD]] : vector<4xf32> to vector<1x4xf32>
+//         CHECK:     scf.yield %[[SHAPE_CAST2]] : vector<1x4xf32>
 
-//         CHECK:   %[[EXTRACT3:.+]] = vector.extract %[[FOR]][0] : vector<4xf32> from vector<1x4xf32>
-//         CHECK:   %[[REDUCE:.+]] = vector.reduction <add>, %[[EXTRACT3]] : vector<4xf32> into f32
+//         CHECK:   %[[SHAPE_CAST3:.+]] = vector.shape_cast %[[FOR]] : vector<1x4xf32> to vector<4xf32>
+//         CHECK:   %[[REDUCE:.+]] = vector.reduction <add>, %[[SHAPE_CAST3]] : vector<4xf32> into f32
 //         CHECK:   gpu.subgroup_reduce add %[[REDUCE]] : (f32) -> f32
 //         CHECK:   scf.if
 //         CHECK:     vector.transfer_write
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
index d80faa9..e4bf969 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
@@ -15,7 +15,7 @@
 //  CHECK-COUNT-8:   vector.transfer_read %[[INPUT]]{{.+}} : tensor<2x4x4xf32>, vector<4xf32>
 // CHECK-COUNT-16:   vector.transfer_read %[[FILTER]]{{.+}} : tensor<4x4x1xf32>, vector<1xf32>
 //  CHECK-COUNT-8:   vector.transfer_read %[[INIT]]{{.+}} : tensor<2x4x4xf32>, vector<4xf32>
-// CHECK-COUNT-16:   vector.extract %{{.+}}[0] : f32 from vector<1xf32>
+//  CHECK-COUNT-4:   vector.extract %{{.+}}[0] : f32 from vector<1xf32>
 //      CHECK-NOT:   vector.insert
 // CHECK-COUNT-32:   vector.fma {{.+}} : vector<4xf32>
 //      CHECK-NOT:   vector.insert
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
index f2d4fe6..ad195d3 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
@@ -23,16 +23,16 @@
 //       CHECK:   %[[RHS_3_VECTOR:.+]] = vector.transfer_read %[[RHS]][%[[C3]], %[[C0]]], %[[PAD]]
 //       CHECK:   %[[INIT_VECTOR:.+]] = vector.transfer_read %[[INIT]][%[[C0]], %[[C0]]], %[[PAD]]
 //       CHECK:   %[[LHS_0_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][0]
-//       CHECK:   %[[LHS_0_VECTOR:.+]] = vector.broadcast %[[LHS_0_SCALAR]] : f32 to vector<4xf32>
+//       CHECK:   %[[LHS_0_VECTOR:.+]] = vector.from_elements %[[LHS_0_SCALAR]], %[[LHS_0_SCALAR]], %[[LHS_0_SCALAR]], %[[LHS_0_SCALAR]]
 //       CHECK:   %[[FMA_0:.+]] = vector.fma %[[LHS_0_VECTOR]], %[[RHS_0_VECTOR]], %[[INIT_VECTOR]] : vector<4xf32>
 //       CHECK:   %[[LHS_1_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][1]
-//       CHECK:   %[[LHS_1_VECTOR:.+]] = vector.broadcast %[[LHS_1_SCALAR]] : f32 to vector<4xf32>
+//       CHECK:   %[[LHS_1_VECTOR:.+]] = vector.from_elements %[[LHS_1_SCALAR]], %[[LHS_1_SCALAR]], %[[LHS_1_SCALAR]], %[[LHS_1_SCALAR]]
 //       CHECK:   %[[FMA_1:.+]] = vector.fma %[[LHS_1_VECTOR]], %[[RHS_1_VECTOR]], %[[FMA_0]] : vector<4xf32>
 //       CHECK:   %[[LHS_2_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][2]
-//       CHECK:   %[[LHS_2_VECTOR:.+]] = vector.broadcast %[[LHS_2_SCALAR]] : f32 to vector<4xf32>
+//       CHECK:   %[[LHS_2_VECTOR:.+]] = vector.from_elements %[[LHS_2_SCALAR]], %[[LHS_2_SCALAR]], %[[LHS_2_SCALAR]], %[[LHS_2_SCALAR]]
 //       CHECK:   %[[FMA_2:.+]] = vector.fma %[[LHS_2_VECTOR]], %[[RHS_2_VECTOR]], %[[FMA_1]] : vector<4xf32>
 //       CHECK:   %[[LHS_3_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][3]
-//       CHECK:   %[[LHS_3_VECTOR:.+]] = vector.broadcast %[[LHS_3_SCALAR]] : f32 to vector<4xf32>
+//       CHECK:   %[[LHS_3_VECTOR:.+]] = vector.from_elements %[[LHS_3_SCALAR]], %[[LHS_3_SCALAR]], %[[LHS_3_SCALAR]], %[[LHS_3_SCALAR]]
 //       CHECK:   %[[FMA_3:.+]] = vector.fma %[[LHS_3_VECTOR]], %[[RHS_3_VECTOR]], %[[FMA_2]] : vector<4xf32>
 //       CHECK:   vector.transfer_write %[[FMA_3]], %[[INIT]][%[[C0]], %[[C0]]]
 
@@ -239,10 +239,10 @@
 // CHECK-NEXT:    %[[RHS2E:.+]]  = arith.extsi %[[RHS2]] : vector<4xi8> to vector<4xi32>
 // CHECK-NEXT:    %[[RHS3E:.+]]  = arith.extsi %[[RHS3]] : vector<4xi8> to vector<4xi32>
 // CHECK:         %[[EXT0:.+]]   = vector.extract %[[LHS0E]][0]
-// CHECK-NEXT:    %[[BROADCAST:.+]]   = vector.broadcast %[[EXT0]] : i32 to vector<4xi32>
+// CHECK-NEXT:    %[[BROADCAST:.+]]   = vector.from_elements %[[EXT0]], %[[EXT0]], %[[EXT0]], %[[EXT0]]
 // CHECK-NEXT:    %[[MUL0:.+]]   = arith.muli %[[BROADCAST]], %[[RHS0E]]
 // CHECK:         %[[EXT1:.+]]   = vector.extract %[[LHS0E]][1]
-// CHECK-NEXT:    %[[BROADCAST:.+]]   = vector.broadcast %[[EXT1]] : i32 to vector<4xi32>
+// CHECK-NEXT:    %[[BROADCAST:.+]]   = vector.from_elements %[[EXT1]], %[[EXT1]], %[[EXT1]], %[[EXT1]]
 // CHECK-NEXT:    %[[MUL1:.+]]   = arith.muli %[[BROADCAST]], %[[RHS1E]]
 // CHECK-NEXT:    %[[ADD0:.+]]   = arith.addi %[[MUL1]], %[[MUL0]]
 //
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir
index a4332e1..6933526 100644
--- a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir
+++ b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir
@@ -302,11 +302,10 @@
 //   CHECK-DAG:   %[[CST_2:.+]] = arith.constant dense<2> : vector<4x1xindex>
 //   CHECK-DAG:   %[[FLAT_OUTPUT:.+]] = memref.collapse_shape %[[OUTPUT]] {{.*}} memref<8x16xf4E2M1FN> into memref<128xf4E2M1FN>
 //   CHECK-DAG:   %[[STEP_4:.+]] = vector.step : vector<4xindex>
-//   CHECK-DAG:   %[[BROADCAST_1x4:.+]] = vector.broadcast %[[STEP_4]] : vector<4xindex> to vector<1x4xindex>
-//   CHECK-DAG:   %[[TRANSPOSE:.+]] = vector.transpose %[[BROADCAST_1x4]], [1, 0] : vector<1x4xindex> to vector<4x1xindex>
+//   CHECK-DAG:   %[[SHAPE_CAST_0:.+]] = vector.shape_cast %[[STEP_4]] : vector<4xindex> to vector<4x1xindex>
 //   CHECK-DAG:   %[[STEP_1:.+]] = vector.step : vector<1xindex>
-//   CHECK-DAG:   %[[CMPI:.+]] = arith.cmpi ult, %[[TRANSPOSE]], %[[CST_2]] : vector<4x1xindex>
-//   CHECK-DAG:   %[[MULI:.+]] = arith.muli %[[TRANSPOSE]], %[[CST_16]] overflow<nsw> : vector<4x1xindex>
+//   CHECK-DAG:   %[[CMPI:.+]] = arith.cmpi ult, %[[SHAPE_CAST_0]], %[[CST_2]] : vector<4x1xindex>
+//   CHECK-DAG:   %[[MULI:.+]] = arith.muli %[[SHAPE_CAST_0]], %[[CST_16]] overflow<nsw> : vector<4x1xindex>
 //   CHECK-DAG:   %[[BROADCAST_4x1:.+]] = vector.broadcast %[[STEP_1]] : vector<1xindex> to vector<4x1xindex>
 //   CHECK-DAG:   %[[ADDI:.+]] = arith.addi %[[MULI]], %[[BROADCAST_4x1]] overflow<nsw> : vector<4x1xindex>
 //       CHECK:   %[[EXTRACT_COND_0:.+]] = vector.extract %[[CMPI]][0, 0] : i1 from vector<4x1xi1>
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 34bc56e..a0d75a9 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 34bc56e363acf50deba46141335abe28ced25159
+Subproject commit a0d75a96218ff238133e3a5d1e12b8d4c9509a47