[WebGPU] Push constants Storage i32 -> Uniform vector<4xi32>. (#11392)
We discussed this [here on
Discord](https://discord.com/channels/689900678990135345/867052513935753277/1047198677514600518).
When using `tensor<Nxi32>` we get an error like this from Tint:
```
Tint reported 1 error(s) for a SPIR-V program, see diagnostics:
error: line:36: Structure id 8 decorated as Block for variable in
Uniform storage class must follow relaxed uniform buffer layout rules:
member 0 contains an array with stride 4 not satisfying alignment to 16
```
Following the spec:
https://www.w3.org/TR/WGSL/#address-space-layout-constraints, we should
be aligning to 16. Switching to `tensor<Nxvector<4xi32>>` gives us that
alignment.
This change packs a conceptual array of `i32`s into an array of
`vector<4xi32>`s - possibly with unused elements at the end. Unpacking
changes from using just `tensor::ExtractOp` to using `tensor::ExtractOp`
(to get one of the vec4s) -> `vector::ExtractElementOp` (to get the
specific element).
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/BUILD b/compiler/src/iree/compiler/Codegen/WGSL/BUILD
index c06b6c3..9612b0c 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/BUILD
+++ b/compiler/src/iree/compiler/Codegen/WGSL/BUILD
@@ -26,5 +26,6 @@
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:TensorDialect",
+ "@llvm-project//mlir:VectorDialect",
],
)
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt
index c55901c..3e7f3dc 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/WGSL/CMakeLists.txt
@@ -20,6 +20,7 @@
MLIRIR
MLIRPass
MLIRTensorDialect
+ MLIRVectorDialect
iree::compiler::Codegen::PassHeaders
iree::compiler::Dialect::Flow::IR
iree::compiler::Dialect::HAL::IR
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp b/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp
index 3275233..fccc9df 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp
+++ b/compiler/src/iree/compiler/Codegen/WGSL/WGSLReplacePushConstants.cpp
@@ -11,6 +11,7 @@
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
@@ -24,7 +25,7 @@
#define IREE_HAL_WEBGPU_PARAMS_BINDING_INDEX 0
static Value convertOpTypeFromI32(IREE::HAL::InterfaceConstantLoadOp loadOp,
- tensor::ExtractOp extractOp) {
+ vector::ExtractElementOp extractElementOp) {
OpBuilder builder(loadOp);
auto loc = loadOp.getLoc();
@@ -32,7 +33,7 @@
// Index
if (opType.isIndex()) {
- return builder.create<arith::IndexCastOp>(loc, opType, extractOp);
+ return builder.create<arith::IndexCastOp>(loc, opType, extractElementOp);
}
unsigned sourceBitWidth = 32;
@@ -41,20 +42,20 @@
// AnySignlessInteger
if (opType.isa<IntegerType>()) {
if (sourceBitWidth > destBitWidth) {
- return builder.create<arith::TruncIOp>(loc, opType, extractOp);
+ return builder.create<arith::TruncIOp>(loc, opType, extractElementOp);
} else if (sourceBitWidth < destBitWidth) {
- return builder.create<arith::ExtUIOp>(loc, opType, extractOp);
+ return builder.create<arith::ExtUIOp>(loc, opType, extractElementOp);
} else {
- return extractOp.getResult();
+ return extractElementOp.getResult();
}
}
// AnyFloat
- Value resizedValue = extractOp.getResult();
+ Value resizedValue = extractElementOp.getResult();
if (sourceBitWidth > destBitWidth) {
- return builder.create<arith::TruncFOp>(loc, opType, extractOp);
+ return builder.create<arith::TruncFOp>(loc, opType, extractElementOp);
} else if (sourceBitWidth < destBitWidth) {
- return builder.create<arith::ExtFOp>(loc, opType, extractOp);
+ return builder.create<arith::ExtFOp>(loc, opType, extractElementOp);
}
return builder.create<arith::BitcastOp>(loc, opType, resizedValue);
}
@@ -63,14 +64,22 @@
IREE::HAL::InterfaceConstantLoadOp op) {
OpBuilder builder(op);
- // tensor.extract -> i32
- auto offsetValue = builder.createOrFold<arith::ConstantIndexOp>(
- op.getLoc(), op.getIndex().getZExtValue());
- auto extractOp =
- builder.create<tensor::ExtractOp>(op.getLoc(), loadOp, offsetValue);
+ // tensor.extract -> vector<4xi32>
+ uint64_t vec4Index = op.getIndex().getZExtValue() / 4;
+ auto tensorOffsetValue =
+ builder.createOrFold<arith::ConstantIndexOp>(op.getLoc(), vec4Index);
+ auto tensorExtractOp = builder.createOrFold<tensor::ExtractOp>(
+ op.getLoc(), loadOp, tensorOffsetValue);
+
+ // vector<4xi32> -> i32
+ uint64_t elementIndex = op.getIndex().getZExtValue() % 4;
+ auto vectorOffsetValue =
+ builder.createOrFold<arith::ConstantIndexOp>(op.getLoc(), elementIndex);
+ auto vectorExtractElementOp = builder.create<vector::ExtractElementOp>(
+ op.getLoc(), tensorExtractOp, vectorOffsetValue);
// i32 -> original type
- auto convertedTypeResult = convertOpTypeFromI32(op, extractOp);
+ auto convertedTypeResult = convertOpTypeFromI32(op, vectorExtractElementOp);
op.replaceAllUsesWith(convertedTypeResult);
op.erase();
@@ -80,8 +89,8 @@
: public WGSLReplacePushConstantsBase<WGSLReplacePushConstantsPass> {
void getDependentDialects(DialectRegistry ®istry) const override {
registry.insert<mlir::arith::ArithDialect, mlir::func::FuncDialect,
- mlir::tensor::TensorDialect, IREE::Flow::FlowDialect,
- IREE::HAL::HALDialect>();
+ mlir::tensor::TensorDialect, mlir::vector::VectorDialect,
+ IREE::Flow::FlowDialect, IREE::HAL::HALDialect>();
}
void runOnOperation() override {
@@ -124,13 +133,22 @@
alignmentAttr = constantLoadOps[0].getAlignmentAttr();
}
+ // We could store into a tensor<Nxi32>, but vec4s are better supported, so
+ // we'll use tensor<Nxvector<4xi32>> instead.
+ // Compute how many vec4s to use, i.e.
+ // max index 0 -> 1 vec4
+ // max index 3 -> 1 vec4
+ // max index 4 -> 2 vec4s
+ uint64_t numberOfVec4s = maxConstantIndex / 4 + 1;
+
// hal.interface.binding.subspan ->
- // !flow.dispatch.tensor<readonly:tensor<Nxi32>>
- // * Group all push constants into a single tensor<Nxi32>
+ // !flow.dispatch.tensor<readonly:tensor<Nxvector<4xi32>>>
+ // * Group all push constants into a single tensor<Nxvector<4xi32>>
// * If individual data types differ, they'll be bitcast when extracted
+ auto v4i32Type = VectorType::get({4}, builder.getI32Type());
auto dispatchTensorType = IREE::Flow::DispatchTensorType::get(
IREE::Flow::TensorAccess::ReadOnly,
- {static_cast<int64_t>(maxConstantIndex + 1)}, builder.getI32Type());
+ {static_cast<int64_t>(numberOfVec4s)}, v4i32Type);
SmallVector<Value> dynamicDims;
// Note: we're ignoring all potential 'values' hints (if provided) on ops -
// InterfaceBindingSubspanOp has no matching concept and we assume that any
@@ -139,12 +157,12 @@
loc, dispatchTensorType,
/*set=*/APInt(64, IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX),
/*binding=*/APInt(64, IREE_HAL_WEBGPU_PARAMS_BINDING_INDEX),
- IREE::HAL::DescriptorType::StorageBuffer, maxConstantValue, dynamicDims,
- alignmentAttr);
+ IREE::HAL::DescriptorType::UniformBuffer,
+ /*byte_offset=*/maxConstantValue, dynamicDims, alignmentAttr);
- // flow.dispatch.tensor.load -> tensor<Nxi32>
- auto tensorType = RankedTensorType::get({(int64_t)maxConstantIndex + 1},
- builder.getI32Type());
+ // flow.dispatch.tensor.load -> tensor<Nxvector<4xi32>>
+ auto tensorType =
+ RankedTensorType::get({(int64_t)numberOfVec4s}, v4i32Type);
auto loadOp = builder.create<IREE::Flow::DispatchTensorLoadOp>(
loc, tensorType, subspanOp, dynamicDims);
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir b/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
index 944f841..8f5feca 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
+++ b/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
@@ -10,10 +10,11 @@
// CHECK-LABEL: @constantLoadIndex
func.func @constantLoadIndex() {
- // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
- // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
- // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
- // CHECK: %[[CAST:.+]] = arith.index_cast %[[EXTRACT]] : i32 to index
+ // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+ // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+ // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
+ // CHECK: %[[CAST:.+]] = arith.index_cast %[[VECTOR_EXTRACT]] : i32 to index
%0 = hal.interface.constant.load[0] : index
// CHECK: = arith.index_cast %[[CAST]] : index to i32
%1 = arith.index_cast %0 : index to i32
@@ -24,11 +25,12 @@
// CHECK-LABEL: @constantLoadI32
func.func @constantLoadI32() {
- // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
- // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
- // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
+ // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+ // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+ // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
%0 = hal.interface.constant.load[0] : i32
- // CHECK: = math.absi %[[EXTRACT]] : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT]] : i32
%1 = math.absi %0 : i32
return
}
@@ -37,10 +39,11 @@
// CHECK-LABEL: @constantLoadI16
func.func @constantLoadI16() {
- // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
- // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
- // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
- // CHECK: %[[TRUNC:.+]] = arith.trunci %[[EXTRACT]] : i32 to i16
+ // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+ // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+ // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
+ // CHECK: %[[TRUNC:.+]] = arith.trunci %[[VECTOR_EXTRACT]] : i32 to i16
%0 = hal.interface.constant.load[0] : i16
// CHECK: = math.absi %[[TRUNC]] : i16
%1 = math.absi %0 : i16
@@ -51,10 +54,11 @@
// CHECK-LABEL: @constantLoadF32
func.func @constantLoadF32() {
- // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xi32>>
- // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xi32>> -> tensor<1xi32>
- // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0_0] : tensor<1xi32>
- // CHECK: %[[CAST:.+]] = arith.bitcast %[[EXTRACT]] : i32 to f32
+ // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+ // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
+ // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
+ // CHECK: %[[CAST:.+]] = arith.bitcast %[[VECTOR_EXTRACT]] : i32 to f32
%0 = hal.interface.constant.load[0] : f32
// CHECK: = math.absf %[[CAST]] : f32
%1 = math.absf %0 : f32
@@ -65,10 +69,11 @@
// CHECK-LABEL: @constantLoadWithIndexAndAlignment
func.func @constantLoadWithIndexAndAlignment() {
- // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c5) alignment(16) : !flow.dispatch.tensor<readonly:tensor<6xi32>>
- // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi32>> -> tensor<6xi32>
- // CHECK: %[[EXTRACT:.+]] = tensor.extract %[[LOAD]][%c5_0] : tensor<6xi32>
- // CHECK: %[[CAST:.+]] = arith.index_cast %[[EXTRACT]] : i32 to index
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c5) alignment(16) : !flow.dispatch.tensor<readonly:tensor<2xvector<4xi32>>>
+ // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xvector<4xi32>>> -> tensor<2xvector<4xi32>>
+ // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<2xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c1{{.*}}] : vector<4xi32>
+ // CHECK: %[[CAST:.+]] = arith.index_cast %[[VECTOR_EXTRACT]] : i32 to index
%0 = hal.interface.constant.load[5] alignment(16) : index
// CHECK: = arith.index_cast %[[CAST]] : index to i32
%1 = arith.index_cast %0 : index to i32
@@ -79,21 +84,61 @@
// CHECK-LABEL: @constantLoadMultiple
func.func @constantLoadMultiple() {
- // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(storage_buffer) offset(%c2) : !flow.dispatch.tensor<readonly:tensor<3xi32>>
- // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<3xi32>> -> tensor<3xi32>
+ // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c8) : !flow.dispatch.tensor<readonly:tensor<3xvector<4xi32>>>
+ // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<3xvector<4xi32>>> -> tensor<3xvector<4xi32>>
- // CHECK: %[[EXTRACT_0:.+]] = tensor.extract %[[LOAD]][%{{.*}}] : tensor<3xi32>
+  // Extracting 9 i32s from tensor<3xvector<4xi32>>:
+ // [0 1 2 3][4 5 6 7][8 9 10 11]
+ // ^-----------------^
+ // 0-3 use the first vec4 (tensor extract 0 then vector extract 0-3)
+ // 4-7 use the second vec4 (tensor extract 1 then vector extract 0-3)
+ // 8 uses the third vec4 (tensor extract 2 then vector extract 0)
+
+ // CHECK: %[[TENSOR_EXTRACT_0:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_0:.+]] = vector.extractelement %[[TENSOR_EXTRACT_0]][%c0{{.*}}] : vector<4xi32>
%0 = hal.interface.constant.load[0] : i32
- // CHECK: %[[EXTRACT_1:.+]] = tensor.extract %[[LOAD]][%{{.*}}] : tensor<3xi32>
+ // CHECK: %[[TENSOR_EXTRACT_1:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_1:.+]] = vector.extractelement %[[TENSOR_EXTRACT_1]][%c1{{.*}}] : vector<4xi32>
%1 = hal.interface.constant.load[1] : i32
- // CHECK: %[[EXTRACT_2:.+]] = tensor.extract %[[LOAD]][%{{.*}}] : tensor<3xi32>
+ // CHECK: %[[TENSOR_EXTRACT_2:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_2:.+]] = vector.extractelement %[[TENSOR_EXTRACT_2]][%c2{{.*}}] : vector<4xi32>
%2 = hal.interface.constant.load[2] : i32
+ // CHECK: %[[TENSOR_EXTRACT_3:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_3:.+]] = vector.extractelement %[[TENSOR_EXTRACT_3]][%c3{{.*}}] : vector<4xi32>
+ %3 = hal.interface.constant.load[3] : i32
+ // CHECK: %[[TENSOR_EXTRACT_4:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_4:.+]] = vector.extractelement %[[TENSOR_EXTRACT_4]][%c0{{.*}}] : vector<4xi32>
+ %4 = hal.interface.constant.load[4] : i32
+ // CHECK: %[[TENSOR_EXTRACT_5:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_5:.+]] = vector.extractelement %[[TENSOR_EXTRACT_5]][%c1{{.*}}] : vector<4xi32>
+ %5 = hal.interface.constant.load[5] : i32
+ // CHECK: %[[TENSOR_EXTRACT_6:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_6:.+]] = vector.extractelement %[[TENSOR_EXTRACT_6]][%c2{{.*}}] : vector<4xi32>
+ %6 = hal.interface.constant.load[6] : i32
+ // CHECK: %[[TENSOR_EXTRACT_7:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_7:.+]] = vector.extractelement %[[TENSOR_EXTRACT_7]][%c3{{.*}}] : vector<4xi32>
+ %7 = hal.interface.constant.load[7] : i32
+ // CHECK: %[[TENSOR_EXTRACT_8:.+]] = tensor.extract %[[LOAD]][%c2{{.*}}] : tensor<3xvector<4xi32>>
+ // CHECK: %[[VECTOR_EXTRACT_8:.+]] = vector.extractelement %[[TENSOR_EXTRACT_8]][%c0{{.*}}] : vector<4xi32>
+ %8 = hal.interface.constant.load[8] : i32
- // CHECK: = math.absi %[[EXTRACT_0]] : i32
- %3 = math.absi %0 : i32
- // CHECK: = math.absi %[[EXTRACT_1]] : i32
- %4 = math.absi %1 : i32
- // CHECK: = math.absi %[[EXTRACT_2]] : i32
- %5 = math.absi %2 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_0]] : i32
+ %abs_0 = math.absi %0 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_1]] : i32
+ %abs_1 = math.absi %1 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_2]] : i32
+ %abs_2 = math.absi %2 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_3]] : i32
+ %abs_3 = math.absi %3 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_4]] : i32
+ %abs_4 = math.absi %4 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_5]] : i32
+ %abs_5 = math.absi %5 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_6]] : i32
+ %abs_6 = math.absi %6 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_7]] : i32
+ %abs_7 = math.absi %7 : i32
+ // CHECK: = math.absi %[[VECTOR_EXTRACT_8]] : i32
+ %abs_8 = math.absi %8 : i32
return
}