[WebGPU] Push constants Storage i32 -> Uniform vector<4xi32>. (#11392)

We discussed this [here on
Discord](https://discord.com/channels/689900678990135345/867052513935753277/1047198677514600518).

When using `tensor<Nxi32>` we get an error like this from Tint:
```
Tint reported 1 error(s) for a SPIR-V program, see diagnostics:
error: line:36: Structure id 8 decorated as Block for variable in
  Uniform storage class must follow relaxed uniform buffer layout rules:
  member 0 contains an array with stride 4 not satisfying alignment to 16
```

Following the spec:
https://www.w3.org/TR/WGSL/#address-space-layout-constraints, we should
be aligning to 16. Switching to `tensor<Nxvector<4xi32>>` gives us that
alignment.

This change packs a conceptual array of `i32`s into an array of
`vector<4xi32>`s - possibly with unused elements at the end. Unpacking
now takes two steps instead of a single `tensor::ExtractOp`: first a
`tensor::ExtractOp` fetches one of the vec4s, then a
`vector::ExtractElementOp` pulls out the specific element within it.
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/BUILD b/compiler/src/iree/compiler/Codegen/WGSL/BUILD
index c06b6c3..9612b0c 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/BUILD
+++ b/compiler/src/iree/compiler/Codegen/WGSL/BUILD
@@ -26,5 +26,6 @@
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:VectorDialect",
     ],
 )
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt
index c55901c..3e7f3dc 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt
@@ -20,6 +20,7 @@
     MLIRIR
     MLIRPass
     MLIRTensorDialect
+    MLIRVectorDialect
     iree::compiler::Codegen::PassHeaders
     iree::compiler::Dialect::Flow::IR
     iree::compiler::Dialect::HAL::IR
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp b/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp
index 3275233..fccc9df 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp
+++ b/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp
@@ -11,6 +11,7 @@
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
@@ -24,7 +25,7 @@
 #define IREE_HAL_WEBGPU_PARAMS_BINDING_INDEX 0
 
 static Value convertOpTypeFromI32(IREE::HAL::InterfaceConstantLoadOp loadOp,
-                                  tensor::ExtractOp extractOp) {
+                                  vector::ExtractElementOp extractElementOp) {
   OpBuilder builder(loadOp);
 
   auto loc = loadOp.getLoc();
@@ -32,7 +33,7 @@
 
   // Index
   if (opType.isIndex()) {
-    return builder.create<arith::IndexCastOp>(loc, opType, extractOp);
+    return builder.create<arith::IndexCastOp>(loc, opType, extractElementOp);
   }
 
   unsigned sourceBitWidth = 32;
@@ -41,20 +42,20 @@
   // AnySignlessInteger
   if (opType.isa<IntegerType>()) {
     if (sourceBitWidth > destBitWidth) {
-      return builder.create<arith::TruncIOp>(loc, opType, extractOp);
+      return builder.create<arith::TruncIOp>(loc, opType, extractElementOp);
     } else if (sourceBitWidth < destBitWidth) {
-      return builder.create<arith::ExtUIOp>(loc, opType, extractOp);
+      return builder.create<arith::ExtUIOp>(loc, opType, extractElementOp);
     } else {
-      return extractOp.getResult();
+      return extractElementOp.getResult();
     }
   }
 
   // AnyFloat
-  Value resizedValue = extractOp.getResult();
+  Value resizedValue = extractElementOp.getResult();
   if (sourceBitWidth > destBitWidth) {
-    return builder.create<arith::TruncFOp>(loc, opType, extractOp);
+    return builder.create<arith::TruncFOp>(loc, opType, extractElementOp);
   } else if (sourceBitWidth < destBitWidth) {
-    return builder.create<arith::ExtFOp>(loc, opType, extractOp);
+    return builder.create<arith::ExtFOp>(loc, opType, extractElementOp);
   }
   return builder.create<arith::BitcastOp>(loc, opType, resizedValue);
 }
@@ -63,14 +64,22 @@
                                   IREE::HAL::InterfaceConstantLoadOp op) {
   OpBuilder builder(op);
 
-  // tensor.extract -> i32
-  auto offsetValue = builder.createOrFold<arith::ConstantIndexOp>(
-      op.getLoc(), op.getIndex().getZExtValue());
-  auto extractOp =
-      builder.create<tensor::ExtractOp>(op.getLoc(), loadOp, offsetValue);
+  // tensor.extract -> vector<4xi32>
+  uint64_t vec4Index = op.getIndex().getZExtValue() / 4;
+  auto tensorOffsetValue =
+      builder.createOrFold<arith::ConstantIndexOp>(op.getLoc(), vec4Index);
+  auto tensorExtractOp = builder.createOrFold<tensor::ExtractOp>(
+      op.getLoc(), loadOp, tensorOffsetValue);
+
+  // vector<4xi32> -> i32
+  uint64_t elementIndex = op.getIndex().getZExtValue() % 4;
+  auto vectorOffsetValue =
+      builder.createOrFold<arith::ConstantIndexOp>(op.getLoc(), elementIndex);
+  auto vectorExtractElementOp = builder.create<vector::ExtractElementOp>(
+      op.getLoc(), tensorExtractOp, vectorOffsetValue);
 
   // i32 -> original type
-  auto convertedTypeResult = convertOpTypeFromI32(op, extractOp);
+  auto convertedTypeResult = convertOpTypeFromI32(op, vectorExtractElementOp);
   op.replaceAllUsesWith(convertedTypeResult);
 
   op.erase();
@@ -80,8 +89,8 @@
     : public WGSLReplacePushConstantsBase<WGSLReplacePushConstantsPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<mlir::arith::ArithDialect, mlir::func::FuncDialect,
-                    mlir::tensor::TensorDialect, IREE::Flow::FlowDialect,
-                    IREE::HAL::HALDialect>();
+                    mlir::tensor::TensorDialect, mlir::vector::VectorDialect,
+                    IREE::Flow::FlowDialect, IREE::HAL::HALDialect>();
   }
 
   void runOnOperation() override {
@@ -124,13 +133,22 @@
       alignmentAttr = constantLoadOps[0].getAlignmentAttr();
     }
 
+    // We could store into a tensor<Nxi32>, but vec4s are better supported, so
+    // we'll use tensor<Nxvector<4xi32>> instead.
+    // Compute how many vec4s to use, i.e.
+    //   max index 0 -> 1 vec4
+    //   max index 3 -> 1 vec4
+    //   max index 4 -> 2 vec4s
+    uint64_t numberOfVec4s = maxConstantIndex / 4 + 1;
+
     // hal.interface.binding.subspan ->
-    // !flow.dispatch.tensor<readonly:tensor<Nxi32>>
-    //   * Group all push constants into a single tensor<Nxi32>
+    // !flow.dispatch.tensor<readonly:tensor<Nxvector<4xi32>>>
+    //   * Group all push constants into a single tensor<Nxvector<4xi32>>
     //   * If individual data types differ, they'll be bitcast when extracted
+    auto v4i32Type = VectorType::get({4}, builder.getI32Type());
     auto dispatchTensorType = IREE::Flow::DispatchTensorType::get(
         IREE::Flow::TensorAccess::ReadOnly,
-        {static_cast<int64_t>(maxConstantIndex + 1)}, builder.getI32Type());
+        {static_cast<int64_t>(numberOfVec4s)}, v4i32Type);
     SmallVector<Value> dynamicDims;
     // Note: we're ignoring all potential 'values' hints (if provided) on ops -
     // InterfaceBindingSubspanOp has no matching concept and we assume that any
@@ -139,12 +157,12 @@
         loc, dispatchTensorType,
         /*set=*/APInt(64, IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX),
         /*binding=*/APInt(64, IREE_HAL_WEBGPU_PARAMS_BINDING_INDEX),
-        IREE::HAL::DescriptorType::StorageBuffer, maxConstantValue, dynamicDims,
-        alignmentAttr);
+        IREE::HAL::DescriptorType::UniformBuffer,
+        /*byte_offset=*/maxConstantValue, dynamicDims, alignmentAttr);
 
-    // flow.dispatch.tensor.load -> tensor<Nxi32>
-    auto tensorType = RankedTensorType::get({(int64_t)maxConstantIndex + 1},
-                                            builder.getI32Type());
+    // flow.dispatch.tensor.load -> tensor<Nxvector<4xi32>>
+    auto tensorType =
+        RankedTensorType::get({(int64_t)numberOfVec4s}, v4i32Type);
     auto loadOp = builder.create<IREE::Flow::DispatchTensorLoadOp>(
         loc, tensorType, subspanOp, dynamicDims);
 
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir b/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
index 944f841..8f5feca 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
+++ b/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
@@ -10,10 +10,11 @@
 
 // CHECK-LABEL: @constantLoadIndex
 func.func @constantLoadIndex() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
-  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
-  // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
-  // CHECK: %[[CAST:.+]] = arith.index_cast %[[EXTRACT]] : i32 to index
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+  // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
+  // CHECK: %[[CAST:.+]] = arith.index_cast %[[VECTOR_EXTRACT]] : i32 to index
   %0 = hal.interface.constant.load[0] : index
   // CHECK: = arith.index_cast %[[CAST]] : index to i32
   %1 = arith.index_cast %0 : index to i32
@@ -24,11 +25,12 @@
 
 // CHECK-LABEL: @constantLoadI32
 func.func @constantLoadI32() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
-  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
-  // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+  // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
   %0 = hal.interface.constant.load[0] : i32
-  // CHECK: = math.absi %[[EXTRACT]] : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT]] : i32
   %1 = math.absi %0 : i32
   return
 }
@@ -37,10 +39,11 @@
 
 // CHECK-LABEL: @constantLoadI16
 func.func @constantLoadI16() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
-  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
-  // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
-  // CHECK: %[[TRUNC:.+]] = arith.trunci %[[EXTRACT]] : i32 to i16
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+  // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
+  // CHECK: %[[TRUNC:.+]] = arith.trunci %[[VECTOR_EXTRACT]] : i32 to i16
   %0 = hal.interface.constant.load[0] : i16
   // CHECK: = math.absi %[[TRUNC]] : i16
   %1 = math.absi %0 : i16
@@ -51,10 +54,11 @@
 
 // CHECK-LABEL: @constantLoadF32
 func.func @constantLoadF32() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
-  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
-  // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
-  // CHECK: %[[CAST:.+]] = arith.bitcast %[[EXTRACT]] : i32 to f32
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+  // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
+  // CHECK: %[[CAST:.+]] = arith.bitcast %[[VECTOR_EXTRACT]] : i32 to f32
   %0 = hal.interface.constant.load[0] : f32
   // CHECK: = math.absf %[[CAST]] : f32
   %1 = math.absf %0 : f32
@@ -65,10 +69,11 @@
 
 // CHECK-LABEL: @constantLoadWithIndexAndAlignment
 func.func @constantLoadWithIndexAndAlignment() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c5) alignment(16) : !flow.dispatch.tensor<readonly:tensor<6xi32>>
-  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi32>> -> tensor<6xi32>
-  // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c5_0] : tensor<6xi32>
-  // CHECK: %[[CAST:.+]] = arith.index_cast %[[EXTRACT]] : i32 to index
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c5) alignment(16) : !flow.dispatch.tensor<readonly:tensor<2xvector<4xi32>>
+  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xvector<4xi32>>> -> tensor<2xvector<4xi32>>
+  // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<2xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c1{{.*}}] : vector<4xi32>
+  // CHECK: %[[CAST:.+]] = arith.index_cast %[[VECTOR_EXTRACT]] : i32 to index
   %0 = hal.interface.constant.load[5] alignment(16) : index
   // CHECK: = arith.index_cast %[[CAST]] : index to i32
   %1 = arith.index_cast %0 : index to i32
@@ -79,21 +84,61 @@
 
 // CHECK-LABEL: @constantLoadMultiple
 func.func @constantLoadMultiple() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c2) : !flow.dispatch.tensor<readonly:tensor<3xi32>>
-  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<3xi32>> -> tensor<3xi32>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c8) : !flow.dispatch.tensor<readonly:tensor<3xvector<4xi32>>>
+  // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<3xvector<4xi32>>> -> tensor<3xvector<4xi32>>
 
-  // CHECK: %[[EXTRACT_0:.+]] = tensor.extract %[[LOAD]][%{{.*}}] : tensor<3xi32>
+  // Extracting 9 i32s from tensor<3xvector<4xi32>>:
+  //   [0 1 2 3][4 5 6 7][8 9 10 11]
+  //    ^-----------------^
+  // 0-3 use the first vec4 (tensor extract 0 then vector extract 0-3)
+  // 4-7 use the second vec4 (tensor extract 1 then vector extract 0-3)
+  // 8 uses the third vec4 (tensor extract 2 then vector extract 0)
+
+  // CHECK: %[[TENSOR_EXTRACT_0:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_0:.+]] = vector.extractelement %[[TENSOR_EXTRACT_0]][%c0{{.*}}] : vector<4xi32>
   %0 = hal.interface.constant.load[0] : i32
-  // CHECK: %[[EXTRACT_1:.+]] = tensor.extract %[[LOAD]][%{{.*}}] : tensor<3xi32>
+  // CHECK: %[[TENSOR_EXTRACT_1:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_1:.+]] = vector.extractelement %[[TENSOR_EXTRACT_1]][%c1{{.*}}] : vector<4xi32>
   %1 = hal.interface.constant.load[1] : i32
-  // CHECK: %[[EXTRACT_2:.+]] = tensor.extract %[[LOAD]][%{{.*}}] : tensor<3xi32>
+  // CHECK: %[[TENSOR_EXTRACT_2:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_2:.+]] = vector.extractelement %[[TENSOR_EXTRACT_2]][%c2{{.*}}] : vector<4xi32>
   %2 = hal.interface.constant.load[2] : i32
+  // CHECK: %[[TENSOR_EXTRACT_3:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_3:.+]] = vector.extractelement %[[TENSOR_EXTRACT_3]][%c3{{.*}}] : vector<4xi32>
+  %3 = hal.interface.constant.load[3] : i32
+  // CHECK: %[[TENSOR_EXTRACT_4:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_4:.+]] = vector.extractelement %[[TENSOR_EXTRACT_4]][%c0{{.*}}] : vector<4xi32>
+  %4 = hal.interface.constant.load[4] : i32
+  // CHECK: %[[TENSOR_EXTRACT_5:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_5:.+]] = vector.extractelement %[[TENSOR_EXTRACT_5]][%c1{{.*}}] : vector<4xi32>
+  %5 = hal.interface.constant.load[5] : i32
+  // CHECK: %[[TENSOR_EXTRACT_6:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_6:.+]] = vector.extractelement %[[TENSOR_EXTRACT_6]][%c2{{.*}}] : vector<4xi32>
+  %6 = hal.interface.constant.load[6] : i32
+  // CHECK: %[[TENSOR_EXTRACT_7:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_7:.+]] = vector.extractelement %[[TENSOR_EXTRACT_7]][%c3{{.*}}] : vector<4xi32>
+  %7 = hal.interface.constant.load[7] : i32
+  // CHECK: %[[TENSOR_EXTRACT_8:.+]] = tensor.extract %[[LOAD]][%c2{{.*}}] : tensor<3xvector<4xi32>>
+  // CHECK: %[[VECTOR_EXTRACT_8:.+]] = vector.extractelement %[[TENSOR_EXTRACT_8]][%c0{{.*}}] : vector<4xi32>
+  %8 = hal.interface.constant.load[8] : i32
 
-  // CHECK: = math.absi %[[EXTRACT_0]] : i32
-  %3 = math.absi %0 : i32
-  // CHECK: = math.absi %[[EXTRACT_1]] : i32
-  %4 = math.absi %1 : i32
-  // CHECK: = math.absi %[[EXTRACT_2]] : i32
-  %5 = math.absi %2 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_0]] : i32
+  %abs_0 = math.absi %0 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_1]] : i32
+  %abs_1 = math.absi %1 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_2]] : i32
+  %abs_2 = math.absi %2 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_3]] : i32
+  %abs_3 = math.absi %3 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_4]] : i32
+  %abs_4 = math.absi %4 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_5]] : i32
+  %abs_5 = math.absi %5 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_6]] : i32
+  %abs_6 = math.absi %6 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_7]] : i32
+  %abs_7 = math.absi %7 : i32
+  // CHECK: = math.absi %[[VECTOR_EXTRACT_8]] : i32
+  %abs_8 = math.absi %8 : i32
   return
 }