Integrate LLVM@5c35af8f1e6ebc7c32 (#23252)
Reverts carried forward:
* Local revert of https://github.com/llvm/llvm-project/pull/169614 due
to https://github.com/iree-org/iree/issues/22649
Other changes:
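* Update lit test CHECK lines to account for
https://github.com/llvm/llvm-project/pull/174452: in the tests touched
here, the expected IR now mostly uses `vector.shape_cast` (or, in some
SPIR-V tests, `vector.from_elements` / `vector.extract` +
`vector.insert`) where it previously matched `vector.broadcast`,
`vector.extract`, or `vector.transpose`.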
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
index ac0aea5..2aea851 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
@@ -68,20 +68,16 @@
// CHECK-LABEL: func @contract_to_mfma_32x32x8_mm
// CHECK-SAME: (%[[A:.+]]: vector<32x8xf16>, %[[B:.+]]: vector<8x32xf16>, %[[C:.+]]: vector<32x32xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<32x32xf32> -> vector<1x1x4x1x4x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<32x8xf16> -> vector<1x1x1x1x1x4xf16>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<8x32xf16> -> vector<1x1x1x1x4x1xf16>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<4x1x4x1xf32> from vector<1x1x4x1x4x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x4xf16> from vector<1x1x1x1x1x4xf16>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x4x1xf16> from vector<1x1x1x1x4x1xf16>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<4x1x4x1xf32> to vector<16xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<32x32xf32> -> vector<1x1x4x1x4x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<32x8xf16> -> vector<1x1x1x1x1x4xf16>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<8x32xf16> -> vector<1x1x1x1x4x1xf16>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x4xf16> to vector<4xf16>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x4x1xf16> to vector<4xf16>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x4x1x4x1xf32> to vector<16xf32>
// CHECK: %[[MFMA:.+]] = amdgpu.mfma 32x32x8 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp = none
// CHECK-SAME: : vector<4xf16>, vector<4xf16>, vector<16xf32>
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<4x1x4x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<1x1x4x1x4x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
// CHECK: return %[[R_SIMD]]
// -----
@@ -144,21 +140,17 @@
// CHECK-LABEL: func @contract_to_mfma_16x16x16_mm
// CHECK-SAME: (%[[A:.+]]: vector<16x16xf16>, %[[B:.+]]: vector<16x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x4x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x4xf16>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x4x1xf16>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x4x1xf32> from vector<1x1x1x1x4x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x4xf16> from vector<1x1x1x1x1x4xf16>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x4x1xf16> from vector<1x1x1x1x4x1xf16>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x4xf16> to vector<4xf16>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x4x1xf16> to vector<4xf16>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x4x1xf32> to vector<4xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x4x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x4xf16>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x4x1xf16>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x4xf16> to vector<4xf16>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x4x1xf16> to vector<4xf16>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x4x1xf32> to vector<4xf32>
// CHECK: %[[MFMA:.+]] = amdgpu.mfma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] blgp = none
// CHECK-SAME: : vector<4xf16>, vector<4xf16>, vector<4xf32>
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<4xf32> to vector<1x1x4x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x4x1xf32> to vector<1x1x1x1x4x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x4x1xf32> -> vector<16x16xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<4xf32> to vector<1x1x1x1x4x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x4x1xf32> -> vector<16x16xf32>
// CHECK: return %[[R_SIMD]]
// -----
@@ -573,19 +565,15 @@
// CHECK-LABEL: func.func @contract_to_WMMAR3_16x16x16_mm
// CHECK-SAME: (%[[A:.+]]: vector<16x16xf16>, %[[B:.+]]: vector<16x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x8x1x1x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x16xf16>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x16x1xf16>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<8x1x1x1xf32> from vector<1x1x8x1x1x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x16xf16> from vector<1x1x1x1x1x16xf16>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x16x1xf16> from vector<1x1x1x1x16x1xf16>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x16xf16> to vector<16xf16>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x16x1xf16> to vector<16xf16>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<8x1x1x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x8x1x1x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x16xf16>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x16x1xf16>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x16xf16> to vector<16xf16>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x16x1xf16> to vector<16xf16>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x8x1x1x1xf32> to vector<8xf32>
// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<8x1x1x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<8x1x1x1xf32> to vector<1x1x8x1x1x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1x1x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32>
// CHECK: return %[[R_SIMD]]
// -----
@@ -659,19 +647,15 @@
// CHECK-LABEL: func.func @contract_to_WMMAR4_16x16x16_mm
// CHECK-SAME: (%[[A:.+]]: vector<16x16xf16>, %[[B:.+]]: vector<16x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x8xf16>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x8x1xf16>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x8xf16> from vector<1x1x1x1x1x8xf16>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x8x1xf16> from vector<1x1x1x1x8x1xf16>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x8xf16> to vector<8xf16>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x8x1xf16> to vector<8xf16>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x16xf16> -> vector<1x1x1x1x1x8xf16>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<16x16xf16> -> vector<1x1x1x1x8x1xf16>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x8xf16> to vector<8xf16>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x8x1xf16> to vector<8xf16>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x16 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]]
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
// CHECK: return %[[R_SIMD]]
// -----
@@ -744,19 +728,15 @@
// CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x4_mm
// CHECK-SAME: (%[[A:.+]]: vector<16x4xf32>, %[[B:.+]]: vector<4x16xf32>, %[[C:.+]]: vector<16x16xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x4xf32> -> vector<1x1x1x1x1x2xf32>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<4x16xf32> -> vector<1x1x1x1x2x1xf32>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x2xf32> from vector<1x1x1x1x1x2xf32>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x2x1xf32> from vector<1x1x1x1x2x1xf32>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x2xf32> to vector<2xf32>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x2x1xf32> to vector<2xf32>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x4xf32> -> vector<1x1x1x1x1x2xf32>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<4x16xf32> -> vector<1x1x1x1x2x1xf32>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x2xf32> to vector<2xf32>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x2x1xf32> to vector<2xf32>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x4 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] : vector<2xf32>, vector<2xf32>, vector<8xf32>
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
// CHECK: return %[[R_SIMD]]
// -----
@@ -829,19 +809,15 @@
// CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x32_mm
// CHECK-SAME: (%[[A:.+]]: vector<16x32xf16>, %[[B:.+]]: vector<32x16xf16>, %[[C:.+]]: vector<16x16xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x32xf16> -> vector<1x1x1x1x1x16xf16>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<32x16xf16> -> vector<1x1x1x1x16x1xf16>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x16xf16> from vector<1x1x1x1x1x16xf16>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x16x1xf16> from vector<1x1x1x1x16x1xf16>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x16xf16> to vector<16xf16>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x16x1xf16> to vector<16xf16>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x32xf16> -> vector<1x1x1x1x1x16xf16>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<32x16xf16> -> vector<1x1x1x1x16x1xf16>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x16xf16> to vector<16xf16>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x16x1xf16> to vector<16xf16>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x32 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] : vector<16xf16>, vector<16xf16>, vector<8xf32>
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
// CHECK: return %[[R_SIMD]]
// -----
@@ -914,19 +890,15 @@
// CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x64_mm
// CHECK-SAME: (%[[A:.+]]: vector<16x64xf8E4M3FN>, %[[B:.+]]: vector<64x16xf8E4M3FN>, %[[C:.+]]: vector<16x16xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x64xf8E4M3FN> -> vector<1x1x1x1x1x32xf8E4M3FN>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<64x16xf8E4M3FN> -> vector<1x1x1x1x32x1xf8E4M3FN>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x32xf8E4M3FN> from vector<1x1x1x1x1x32xf8E4M3FN>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x32x1xf8E4M3FN> from vector<1x1x1x1x32x1xf8E4M3FN>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x32xf8E4M3FN> to vector<32xf8E4M3FN>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x32x1xf8E4M3FN> to vector<32xf8E4M3FN>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x64xf8E4M3FN> -> vector<1x1x1x1x1x32xf8E4M3FN>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<64x16xf8E4M3FN> -> vector<1x1x1x1x32x1xf8E4M3FN>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x32xf8E4M3FN> to vector<32xf8E4M3FN>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x32x1xf8E4M3FN> to vector<32xf8E4M3FN>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x64 %[[A_CAST]] * %[[B_CAST]] + %[[C_CAST]] : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<8xf32>
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x8x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x8x1xf32> to vector<1x1x1x1x8x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<1x1x1x1x8x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x1x1x8x1xf32> -> vector<16x16xf32>
// CHECK: return %[[R_SIMD]]
// -----
@@ -999,15 +971,12 @@
// CHECK-LABEL: func.func @contract_to_gfx1250_WMMA_16x16x128_mm
// CHECK-SAME: (%[[A:.+]]: vector<16x128xf8E4M3FN>, %[[B:.+]]: vector<128x16xf8E4M3FN>, %[[C:.+]]: vector<16x16xf32>)
-// CHECK: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x128xf8E4M3FN> -> vector<1x1x1x1x1x64xf8E4M3FN>
-// CHECK: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<128x16xf8E4M3FN> -> vector<1x1x1x1x64x1xf8E4M3FN>
-// CHECK: %[[C_VEC:.+]] = vector.extract %[[C_SIMT]][0, 0] : vector<1x1x8x1xf32> from vector<1x1x1x1x8x1xf32>
-// CHECK: %[[A_VEC:.+]] = vector.extract %[[A_SIMT]][0, 0] : vector<1x1x1x64xf8E4M3FN> from vector<1x1x1x1x1x64xf8E4M3FN>
-// CHECK: %[[B_VEC:.+]] = vector.extract %[[B_SIMT]][0, 0] : vector<1x1x64x1xf8E4M3FN> from vector<1x1x1x1x64x1xf8E4M3FN>
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %[[A_VEC]] : vector<1x1x1x64xf8E4M3FN> to vector<64xf8E4M3FN>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %[[B_VEC]] : vector<1x1x64x1xf8E4M3FN> to vector<64xf8E4M3FN>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %[[C_VEC]] : vector<1x1x8x1xf32> to vector<8xf32>
+// CHECK-DAG: %[[C_SIMT:.+]] = iree_vector_ext.to_simt %[[C]] : vector<16x16xf32> -> vector<1x1x1x1x8x1xf32>
+// CHECK-DAG: %[[A_SIMT:.+]] = iree_vector_ext.to_simt %[[A]] : vector<16x128xf8E4M3FN> -> vector<1x1x1x1x1x64xf8E4M3FN>
+// CHECK-DAG: %[[B_SIMT:.+]] = iree_vector_ext.to_simt %[[B]] : vector<128x16xf8E4M3FN> -> vector<1x1x1x1x64x1xf8E4M3FN>
+// CHECK-DAG: %[[A_CAST:.+]] = vector.shape_cast %[[A_SIMT]] : vector<1x1x1x1x1x64xf8E4M3FN> to vector<64xf8E4M3FN>
+// CHECK-DAG: %[[B_CAST:.+]] = vector.shape_cast %[[B_SIMT]] : vector<1x1x1x1x64x1xf8E4M3FN> to vector<64xf8E4M3FN>
+// CHECK-DAG: %[[C_CAST:.+]] = vector.shape_cast %[[C_SIMT]] : vector<1x1x1x1x8x1xf32> to vector<8xf32>
// CHECK: %[[WMMA:.+]] = amdgpu.wmma 16x16x128 %[[A_CAST]] * %[[B_CAST]]
// -----
@@ -1085,9 +1054,9 @@
// 3. Result of first mma becomes the second mma's accumulator.
// CHECK-LABEL: func @contract_to_vmfma_32x32x16_mm
-// CHECK: %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x8xf16> to vector<8xf16>
-// CHECK: %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x8x1xf16> to vector<8xf16>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<4x1x4x1xf32> to vector<16xf32>
+// CHECK: %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x1x8xf16> to vector<8xf16>
+// CHECK: %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x1x8x1xf16> to vector<8xf16>
+// CHECK: %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1x4x1xf32> to vector<16xf32>
// CHECK: %[[A_SLICE_0:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
// CHECK: %[[B_SLICE_0:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
// CHECK: %[[MFMA_0:.*]] = amdgpu.mfma 32x32x8 %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]] blgp = none
@@ -1096,9 +1065,8 @@
// CHECK: %[[B_SLICE_1:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [4], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
// CHECK: %[[MFMA_1:.+]] = amdgpu.mfma 32x32x8 %[[A_SLICE_1]] * %[[B_SLICE_1]] + %[[MFMA_0]] blgp = none
// CHECK-SAME: : vector<4xf16>, vector<4xf16>, vector<16xf32>
-// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to vector<4x1x4x1xf32>
-// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
-// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
+// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to vector<1x1x4x1x4x1xf32>
+// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[R_CAST]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
// CHECK: return %[[R_SIMD]]
// -----
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
index 63e8382..ea58e80 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse.mlir
@@ -72,14 +72,13 @@
// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x1x1x2x4xf16>
-// CHECK-DAG: %[[LHS_MM1:.+]] = vector.broadcast {{.*}} vector<4x1x1x2x4xf16> to vector<1x4x1x1x2x4xf16>
+// CHECK-DAG: %[[LHS_MM1:.+]] = vector.shape_cast {{.*}} vector<4x1x1x2x4xf16> to vector<1x4x1x2x1x4xf16>
// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<2x4x4x1xf16>
-// CHECK-DAG: vector.transpose %[[LHS_MM1]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x1x2x4xf16> to vector<1x4x1x2x1x4xf16>
// CHECK-DAG: vector.transpose %[[RHS_MM]], [0, 2, 3, 1] : vector<2x4x4x1xf16> to vector<2x4x1x4xf16>
// CHECK-COUNT-32: amdgpu.mfma 16x16x16
// CHECK: %[[LOOP_T:.+]] = vector.transpose %[[LOOP]], [0, 1, 2, 4, 3, 5] : vector<1x4x1x4x4x1xf32> to vector<1x4x1x4x4x1xf32>
-// CHECK: %[[EXTRACT:.+]] = vector.extract %[[LOOP_T]][0] : vector<4x1x4x4x1xf32> from vector<1x4x1x4x4x1xf32>
-// CHECK: vector.transfer_write %[[EXTRACT]], %[[BUF2]]
+// CHECK: %[[CAST:.+]] = vector.shape_cast %[[LOOP_T]] : vector<1x4x1x4x4x1xf32> to vector<4x1x4x4x1xf32>
+// CHECK: vector.transfer_write %[[CAST]], %[[BUF2]]
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
// TODO(Max191): Add tests for more convolution types
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
index ae10b0a..be4f85f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
@@ -606,7 +606,7 @@
// CHECK: %[[A_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x1x8xf16> to vector<8xf16>
// CHECK: %[[B_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x8x1xf16> to vector<8xf16>
-// CHECK: %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<4x1x4x1xf32> to vector<16xf32>
+// CHECK: %[[C_CAST:.+]] = vector.shape_cast %{{.+}} : vector<1x1x4x1x4x1xf32> to vector<16xf32>
// CHECK: %[[A_SLICE_0:.+]] = vector.extract_strided_slice %[[A_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
// CHECK: %[[B_SLICE_0:.+]] = vector.extract_strided_slice %[[B_CAST]] {offsets = [0], sizes = [4], strides = [1]} : vector<8xf16> to vector<4xf16>
// CHECK: %[[MFMA_0:.*]] = amdgpu.mfma 32x32x8 %[[A_SLICE_0]] * %[[B_SLICE_0]] + %[[C_CAST]] blgp = none
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
index 4f55e53..15339be 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_to_gpu.mlir
@@ -35,8 +35,7 @@
// CHECK-SAME:memref<128x16x256xf32> to memref<16x8xf32, strided<[4096, 1], offset: 8964>>
// CHECK: vector.transfer_read %[[M]]
// CHECK-SAME: {in_bounds = [true, true]} : memref<16x8xf32, strided<[4096, 1], offset: 8964>>, vector<16x8xf32>
-// CHECK: vector.broadcast %{{.*}} : vector<16x8xf32> to vector<1x16x8xf32>
-// CHECK: vector.transpose %{{.*}} [1, 0, 2] : vector<1x16x8xf32> to vector<16x1x8xf32>
+// CHECK: vector.shape_cast %{{.*}} : vector<16x8xf32> to vector<16x1x8xf32>
// CHECK: return %{{.*}} : vector<16x1x8xf32>
// -----
@@ -77,8 +76,7 @@
// CHECK-SAME: memref<128x16x32x256xf32> to memref<16x8xf32, strided<[131072, 1], offset: 287749>>
// CHECK: vector.transfer_read %[[M]][%[[ID]], %[[ID]]]
// CHECK-SAME: {in_bounds = [true, true]} : memref<16x8xf32, strided<[131072, 1], offset: 287749>>, vector<16x8xf32>
-// CHECK: vector.broadcast %{{.*}} : vector<16x8xf32> to vector<1x1x16x8xf32>
-// CHECK: vector.transpose %{{.*}} [2, 0, 1, 3] : vector<1x1x16x8xf32> to vector<16x1x1x8xf32>
+// CHECK: vector.shape_cast %{{.*}} : vector<16x8xf32> to vector<16x1x1x8xf32>
// CHECK: return %{{.*}} : vector<16x1x1x8xf32>
// -----
@@ -100,8 +98,7 @@
// CHECK-SAME: memref<128x512x32x256xf32> to memref<16x8xf32, strided<[8192, 1], offset: 8414213>>
// CHECK: vector.transfer_read %[[M]][%[[ID]], %[[ID]]]
// CHECK-SAME: {in_bounds = [true, true]} : memref<16x8xf32, strided<[8192, 1], offset: 8414213>>, vector<16x8xf32>
-// CHECK: vector.broadcast %{{.*}} : vector<16x8xf32> to vector<1x16x8xf32>
-// CHECK: vector.transpose %{{.*}} [1, 0, 2] : vector<1x16x8xf32> to vector<16x1x8xf32>
+// CHECK: vector.shape_cast %{{.*}} : vector<16x8xf32> to vector<16x1x8xf32>
// CHECK: return %{{.*}} : vector<16x1x8xf32>
// -----
@@ -146,7 +143,7 @@
// CHECK-SAME: memref<128x16x32x256xf32> to memref<1x1xf32, strided<[131072, 8192], offset: 287749>>
// CHECK: vector.transfer_read %[[M]][%[[ID]], %[[ID]]]
// CHECK-SAME: {in_bounds = [true]} : memref<1x1xf32, strided<[131072, 8192], offset: 287749>>, vector<1xf32>
-// CHECK: vector.broadcast %{{.*}} : vector<1xf32> to vector<1x1x1x1xf32>
+// CHECK: vector.shape_cast %{{.*}} : vector<1xf32> to vector<1x1x1x1xf32>
// CHECK-NOT: vector.transpose
// CHECK: return %{{.*}} : vector<1x1x1x1xf32>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir
index 699a8ec..d419a06 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/initial_vector_lowering_0d.mlir
@@ -7,7 +7,8 @@
// CHECK: scf.for
// CHECK: vector.transfer_read {{.+}} : tensor<f32>, vector<f32>
// CHECK: arith.mulf {{.+}} : vector<f32>
-// CHECK: vector.broadcast {{.+}} : vector<f32> to vector<1xf32>
+// CHECK: %[[VEC:.+]] = vector.extract {{.+}}[] : f32 from vector<f32>
+// CHECK: vector.insert %[[VEC]], {{.+}} [0] : f32 into vector<1xf32>
// CHECK: vector.transfer_write {{.+}} : vector<1xf32>, tensor<32xf32>
func.func @main(%0: tensor<32xf32>, %1: tensor<f32>) -> tensor<32xf32> {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
index 8b09d19..8648966 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir
@@ -59,13 +59,13 @@
// CHECK: %[[BROADCAST1:.+]] = vector.broadcast %[[EXTRACT1]] : f32 to vector<4xf32>
// CHECK: %[[MUL0:.+]] = arith.mulf %[[SUB]], %[[BROADCAST1]] : vector<4xf32>
// CHECK: %[[MUL1:.+]] = arith.mulf %[[READ3]], %[[MUL0]] : vector<4xf32>
-// CHECK: %[[EXTRACT2:.+]] = vector.extract %arg1[0] : vector<4xf32> from vector<1x4xf32>
-// CHECK: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[EXTRACT2]] : vector<4xf32>
-// CHECK: %[[BCAST:.+]] = vector.broadcast %[[ADD]] : vector<4xf32> to vector<1x4xf32>
-// CHECK: scf.yield %[[BCAST]] : vector<1x4xf32>
+// CHECK: %[[SHAPE_CAST:.+]] = vector.shape_cast %arg1 : vector<1x4xf32> to vector<4xf32>
+// CHECK: %[[ADD:.+]] = arith.addf %[[MUL1]], %[[SHAPE_CAST]] : vector<4xf32>
+// CHECK: %[[SHAPE_CAST2:.+]] = vector.shape_cast %[[ADD]] : vector<4xf32> to vector<1x4xf32>
+// CHECK: scf.yield %[[SHAPE_CAST2]] : vector<1x4xf32>
-// CHECK: %[[EXTRACT3:.+]] = vector.extract %[[FOR]][0] : vector<4xf32> from vector<1x4xf32>
-// CHECK: %[[REDUCE:.+]] = vector.reduction <add>, %[[EXTRACT3]] : vector<4xf32> into f32
+// CHECK: %[[SHAPE_CAST3:.+]] = vector.shape_cast %[[FOR]] : vector<1x4xf32> to vector<4xf32>
+// CHECK: %[[REDUCE:.+]] = vector.reduction <add>, %[[SHAPE_CAST3]] : vector<4xf32> into f32
// CHECK: gpu.subgroup_reduce add %[[REDUCE]] : (f32) -> f32
// CHECK: scf.if
// CHECK: vector.transfer_write
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
index d80faa9..e4bf969 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
@@ -15,7 +15,7 @@
// CHECK-COUNT-8: vector.transfer_read %[[INPUT]]{{.+}} : tensor<2x4x4xf32>, vector<4xf32>
// CHECK-COUNT-16: vector.transfer_read %[[FILTER]]{{.+}} : tensor<4x4x1xf32>, vector<1xf32>
// CHECK-COUNT-8: vector.transfer_read %[[INIT]]{{.+}} : tensor<2x4x4xf32>, vector<4xf32>
-// CHECK-COUNT-16: vector.extract %{{.+}}[0] : f32 from vector<1xf32>
+// CHECK-COUNT-4: vector.extract %{{.+}}[0] : f32 from vector<1xf32>
// CHECK-NOT: vector.insert
// CHECK-COUNT-32: vector.fma {{.+}} : vector<4xf32>
// CHECK-NOT: vector.insert
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
index f2d4fe6..ad195d3 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
@@ -23,16 +23,16 @@
// CHECK: %[[RHS_3_VECTOR:.+]] = vector.transfer_read %[[RHS]][%[[C3]], %[[C0]]], %[[PAD]]
// CHECK: %[[INIT_VECTOR:.+]] = vector.transfer_read %[[INIT]][%[[C0]], %[[C0]]], %[[PAD]]
// CHECK: %[[LHS_0_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][0]
-// CHECK: %[[LHS_0_VECTOR:.+]] = vector.broadcast %[[LHS_0_SCALAR]] : f32 to vector<4xf32>
+// CHECK: %[[LHS_0_VECTOR:.+]] = vector.from_elements %[[LHS_0_SCALAR]], %[[LHS_0_SCALAR]], %[[LHS_0_SCALAR]], %[[LHS_0_SCALAR]]
// CHECK: %[[FMA_0:.+]] = vector.fma %[[LHS_0_VECTOR]], %[[RHS_0_VECTOR]], %[[INIT_VECTOR]] : vector<4xf32>
// CHECK: %[[LHS_1_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][1]
-// CHECK: %[[LHS_1_VECTOR:.+]] = vector.broadcast %[[LHS_1_SCALAR]] : f32 to vector<4xf32>
+// CHECK: %[[LHS_1_VECTOR:.+]] = vector.from_elements %[[LHS_1_SCALAR]], %[[LHS_1_SCALAR]], %[[LHS_1_SCALAR]], %[[LHS_1_SCALAR]]
// CHECK: %[[FMA_1:.+]] = vector.fma %[[LHS_1_VECTOR]], %[[RHS_1_VECTOR]], %[[FMA_0]] : vector<4xf32>
// CHECK: %[[LHS_2_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][2]
-// CHECK: %[[LHS_2_VECTOR:.+]] = vector.broadcast %[[LHS_2_SCALAR]] : f32 to vector<4xf32>
+// CHECK: %[[LHS_2_VECTOR:.+]] = vector.from_elements %[[LHS_2_SCALAR]], %[[LHS_2_SCALAR]], %[[LHS_2_SCALAR]], %[[LHS_2_SCALAR]]
// CHECK: %[[FMA_2:.+]] = vector.fma %[[LHS_2_VECTOR]], %[[RHS_2_VECTOR]], %[[FMA_1]] : vector<4xf32>
// CHECK: %[[LHS_3_SCALAR:.+]] = vector.extract %[[LHS_VECTOR]][3]
-// CHECK: %[[LHS_3_VECTOR:.+]] = vector.broadcast %[[LHS_3_SCALAR]] : f32 to vector<4xf32>
+// CHECK: %[[LHS_3_VECTOR:.+]] = vector.from_elements %[[LHS_3_SCALAR]], %[[LHS_3_SCALAR]], %[[LHS_3_SCALAR]], %[[LHS_3_SCALAR]]
// CHECK: %[[FMA_3:.+]] = vector.fma %[[LHS_3_VECTOR]], %[[RHS_3_VECTOR]], %[[FMA_2]] : vector<4xf32>
// CHECK: vector.transfer_write %[[FMA_3]], %[[INIT]][%[[C0]], %[[C0]]]
@@ -239,10 +239,10 @@
// CHECK-NEXT: %[[RHS2E:.+]] = arith.extsi %[[RHS2]] : vector<4xi8> to vector<4xi32>
// CHECK-NEXT: %[[RHS3E:.+]] = arith.extsi %[[RHS3]] : vector<4xi8> to vector<4xi32>
// CHECK: %[[EXT0:.+]] = vector.extract %[[LHS0E]][0]
-// CHECK-NEXT: %[[BROADCAST:.+]] = vector.broadcast %[[EXT0]] : i32 to vector<4xi32>
+// CHECK-NEXT: %[[BROADCAST:.+]] = vector.from_elements %[[EXT0]], %[[EXT0]], %[[EXT0]], %[[EXT0]]
// CHECK-NEXT: %[[MUL0:.+]] = arith.muli %[[BROADCAST]], %[[RHS0E]]
// CHECK: %[[EXT1:.+]] = vector.extract %[[LHS0E]][1]
-// CHECK-NEXT: %[[BROADCAST:.+]] = vector.broadcast %[[EXT1]] : i32 to vector<4xi32>
+// CHECK-NEXT: %[[BROADCAST:.+]] = vector.from_elements %[[EXT1]], %[[EXT1]], %[[EXT1]], %[[EXT1]]
// CHECK-NEXT: %[[MUL1:.+]] = arith.muli %[[BROADCAST]], %[[RHS1E]]
// CHECK-NEXT: %[[ADD0:.+]] = arith.addi %[[MUL1]], %[[MUL0]]
//
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir
index a4332e1..6933526 100644
--- a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir
+++ b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_map_scatter.mlir
@@ -302,11 +302,10 @@
// CHECK-DAG: %[[CST_2:.+]] = arith.constant dense<2> : vector<4x1xindex>
// CHECK-DAG: %[[FLAT_OUTPUT:.+]] = memref.collapse_shape %[[OUTPUT]] {{.*}} memref<8x16xf4E2M1FN> into memref<128xf4E2M1FN>
// CHECK-DAG: %[[STEP_4:.+]] = vector.step : vector<4xindex>
-// CHECK-DAG: %[[BROADCAST_1x4:.+]] = vector.broadcast %[[STEP_4]] : vector<4xindex> to vector<1x4xindex>
-// CHECK-DAG: %[[TRANSPOSE:.+]] = vector.transpose %[[BROADCAST_1x4]], [1, 0] : vector<1x4xindex> to vector<4x1xindex>
+// CHECK-DAG: %[[SHAPE_CAST_0:.+]] = vector.shape_cast %[[STEP_4]] : vector<4xindex> to vector<4x1xindex>
// CHECK-DAG: %[[STEP_1:.+]] = vector.step : vector<1xindex>
-// CHECK-DAG: %[[CMPI:.+]] = arith.cmpi ult, %[[TRANSPOSE]], %[[CST_2]] : vector<4x1xindex>
-// CHECK-DAG: %[[MULI:.+]] = arith.muli %[[TRANSPOSE]], %[[CST_16]] overflow<nsw> : vector<4x1xindex>
+// CHECK-DAG: %[[CMPI:.+]] = arith.cmpi ult, %[[SHAPE_CAST_0]], %[[CST_2]] : vector<4x1xindex>
+// CHECK-DAG: %[[MULI:.+]] = arith.muli %[[SHAPE_CAST_0]], %[[CST_16]] overflow<nsw> : vector<4x1xindex>
// CHECK-DAG: %[[BROADCAST_4x1:.+]] = vector.broadcast %[[STEP_1]] : vector<1xindex> to vector<4x1xindex>
// CHECK-DAG: %[[ADDI:.+]] = arith.addi %[[MULI]], %[[BROADCAST_4x1]] overflow<nsw> : vector<4x1xindex>
// CHECK: %[[EXTRACT_COND_0:.+]] = vector.extract %[[CMPI]][0, 0] : i1 from vector<4x1xi1>
diff --git a/third_party/llvm-project b/third_party/llvm-project
index 34bc56e..a0d75a9 160000
--- a/third_party/llvm-project
+++ b/third_party/llvm-project
@@ -1 +1 @@
-Subproject commit 34bc56e363acf50deba46141335abe28ced25159
+Subproject commit a0d75a96218ff238133e3a5d1e12b8d4c9509a47