[Stream] Encode packed_storage device and host tensors (#22722)
Extends the Stream passes `EncodeDeviceTensors` and `EncodeHostTensors`
to correctly handle tensors with the `packed_storage` encoding attached.
The two passes now encode tensors with the `packed_storage` attribute
the same way that `i1` tensors are encoded when the experimental
`iree-experimental-packed-i1-storage` option is active.
---------
Signed-off-by: Lukas Sommer <lukas.sommer@amd.com>
diff --git a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp
index 32eed40..10d75a6 100644
--- a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp
+++ b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp
@@ -86,6 +86,13 @@
/*isPackedStorage=*/clEnableI1Support);
}
+Type legalizeStorageElementType(RankedTensorType shapedType) {
+ // Packed if clEnableI1Support is set or the type has a packed-storage attr.
+ const bool packed =
+ clEnableI1Support || IREE::Encoding::hasPackedStorageAttr(shapedType);
+ return legalizeStorageElementTypeImpl(shapedType.getElementType(), packed);
+}
+
Value calculateStorageElementCountInBytes(Location loc,
RankedTensorType shapedType,
ValueRange dynamicDims,
diff --git a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h
index 7e5e8a2..efabcf0 100644
--- a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h
+++ b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h
@@ -30,6 +30,15 @@
/// cases.
Type legalizeStorageElementType(Type elementType);
+/// Legalizes the underlying element type of |shapedType| for storage, taking
+/// into account the encoding attributes of |shapedType|, if present.
+///
+/// In IREE, if compiling from the same source model, we control both the
+/// runtime and kernel. For such cases, we perform tight packing for supported
+/// sub-byte elements, and expand to the next power-of-two bit width for other
+/// cases.
+Type legalizeStorageElementType(RankedTensorType shapedType);
+
/// Emits IR with the given |builder| to calculate the total number of bytes
/// required for the given |shapedType| in storage. Returns the value for the
/// final count on success; returns nullptr on failure. Dynamic dimensions in
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
index 4c31f31..6470980 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
@@ -59,7 +59,7 @@
// Aligns the element type of a tensor<> to a byte-aligned power of 2 bit width.
static RankedTensorType alignTensorType(RankedTensorType originalType) {
Type elementType = originalType.getElementType();
- Type alignedType = legalizeStorageElementType(elementType);
+ Type alignedType = legalizeStorageElementType(originalType);
if (alignedType == elementType)
return originalType;
return RankedTensorType::get(originalType.getShape(), alignedType,
@@ -133,7 +133,8 @@
//
// Returns the pattern converted to one of [i8, i16, i32, i64] (with i64 needing
// to be handled via emulation) or nullptr if the type is unsupported.
-static Value canonicalizeFillPattern(Value pattern, OpBuilder &builder) {
+static Value canonicalizeFillPattern(Value pattern, RankedTensorType resultType,
+ OpBuilder &builder) {
auto loc = pattern.getLoc();
// Decompose complex numbers into the real/imag components and pack into an
@@ -153,14 +154,7 @@
pattern);
}
- // HACK: extend i1 to i8. This is really not something we should be doing here
- // in optimized programs as this is a super shady operation.
unsigned elementBitWidth = IREE::Util::getTypeBitWidth(pattern.getType());
- if (elementBitWidth == 1) {
- return builder.createOrFold<arith::ExtUIOp>(loc, builder.getI8Type(),
- pattern);
- }
-
// For packed sub-byte patterns, duplicate the sub-byte parts into a full
// byte. We first extend the sub-byte parts into full bytes, and then keep
// shifting left and bitwise or the sub-byte parts. For example, to create an
@@ -169,7 +163,7 @@
// %i8_val = (%i8_val << 2) | %i2_val
// %i8_val = (%i8_val << 2) | %i2_val
// %i8_val = (%i8_val << 2) | %i2_val
- if (needToPackSubByteElementBitWidth(elementBitWidth)) {
+ if (needToPackSubByteElements(resultType)) {
Type i8Type = builder.getI8Type();
Value bitwidth = builder.createOrFold<arith::ConstantOp>(
loc, i8Type, builder.getIntegerAttr(i8Type, elementBitWidth));
@@ -182,6 +176,15 @@
}
return i8Val;
}
+
+ // HACK: For unpacked i1, extend i1 to i8. This is really not something we
+ // should be doing here in optimized programs as this is a super shady
+ // operation.
+ if (elementBitWidth == 1) {
+ return builder.createOrFold<arith::ExtUIOp>(loc, builder.getI8Type(),
+ pattern);
+ }
+
if ((elementBitWidth % 8) != 0) {
// We'd need some policy to determine how to handle non-byte-aligned widths.
return {};
@@ -365,7 +368,7 @@
}
// Canonicalize the fill pattern into an integer type [i8, i16, i32, i64].
- auto pattern = canonicalizeFillPattern(op.getValue(), rewriter);
+ auto pattern = canonicalizeFillPattern(op.getValue(), resultType, rewriter);
if (!pattern) {
return op.emitOpError()
<< "has unsupported pattern type " << op.getValue().getType()
@@ -460,7 +463,7 @@
}
// Canonicalize the fill pattern into an integer type [i8, i16, i32, i64].
- auto pattern = canonicalizeFillPattern(op.getValue(), rewriter);
+ auto pattern = canonicalizeFillPattern(op.getValue(), targetType, rewriter);
if (!pattern) {
return op.emitOpError()
<< "has unsupported pattern type " << op.getValue().getType()
@@ -656,7 +659,8 @@
static IREE::TensorExt::DispatchTensorType
alignDispatchTensorType(IREE::TensorExt::DispatchTensorType originalType) {
Type elementType = originalType.getBoundElementType();
- Type alignedType = legalizeStorageElementType(elementType);
+ Type alignedType =
+ legalizeStorageElementType(originalType.asRankedTensorType());
if (alignedType == elementType)
return originalType;
return IREE::TensorExt::DispatchTensorType::get(
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
index 1f48519..1a7d36d 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
@@ -37,6 +37,7 @@
"emplace_transients.mlir",
"emplace_transients_scf.mlir",
"encode_device_tensors.mlir",
+ "encode_device_tensors_encoding.mlir",
"encode_device_tensors_packing.mlir",
"encode_host_tensors.mlir",
"encode_host_tensors_encoding.mlir",
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
index 739bd8d..cf32fcb 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
@@ -35,6 +35,7 @@
"emplace_transients.mlir"
"emplace_transients_scf.mlir"
"encode_device_tensors.mlir"
+ "encode_device_tensors_encoding.mlir"
"encode_device_tensors_packing.mlir"
"encode_host_tensors.mlir"
"encode_host_tensors_encoding.mlir"
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_encoding.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_encoding.mlir
new file mode 100644
index 0000000..f31c74f
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_encoding.mlir
@@ -0,0 +1,105 @@
+// RUN: iree-opt --split-input-file --iree-stream-encode-device-tensors %s | FileCheck %s
+
+// CHECK-LABEL: @convert_load_i1
+stream.executable private @convert_load_i1 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ util.func public @dispatch(%arg0: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ // CHECK: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ // CHECK: %[[DISPATCH_0:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+ %tile = iree_tensor_ext.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+ // CHECK: util.optimization_barrier %[[DISPATCH_0]] : tensor<?xi1, #iree_encoding.packed_storage>
+ util.optimization_barrier %tile : tensor<?xi1, #iree_encoding.packed_storage>
+ util.return
+ }
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @convert_store_i1
+stream.executable private @convert_store_i1 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ util.func public @dispatch(%arg0: !stream.binding) {
+ // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant dense<[false, false, true, true]> : tensor<4xi1, #iree_encoding.packed_storage>
+ %c0 = arith.constant 0 : index
+ // CHECK-DAG: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ %cst = arith.constant dense<[false, false, true, true]> : tensor<4xi1, #iree_encoding.packed_storage>
+ // CHECK-NEXT: iree_tensor_ext.dispatch.tensor.store %[[CONSTANT_0]], %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : tensor<4xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ iree_tensor_ext.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ util.return
+ }
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @convert_multi_i1
+stream.executable private @convert_multi_i1 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ util.func public @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ %c4 = arith.constant 4 : index
+ // CHECK: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ %binding0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+ // CHECK: %[[BINDING_1:.*]] = stream.binding.subspan %arg1{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+ %binding1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+ // CHECK: %[[DISPATCH_0:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+ %tile0 = iree_tensor_ext.dispatch.tensor.load %binding0, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+ // CHECK: %[[DISPATCH_1:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_1]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+ %tile1 = iree_tensor_ext.dispatch.tensor.load %binding1, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+ // CHECK: %[[ORI_0:.*]] = arith.ori %[[DISPATCH_0]], %[[DISPATCH_1]] : tensor<?xi1, #iree_encoding.packed_storage>
+ %result = arith.ori %tile0, %tile1 : tensor<?xi1, #iree_encoding.packed_storage>
+ // CHECK-NEXT: iree_tensor_ext.dispatch.tensor.store %[[ORI_0]], %[[BINDING_1]], {{.+}} : tensor<?xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+ iree_tensor_ext.dispatch.tensor.store %result, %binding1, offsets = [0], sizes = [%c4], strides = [1] : tensor<?xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+ util.return
+ }
+ }
+}
+
+// -----
+
+// Check that i4 elements are packed and not extended to a full byte. This is also the default behavior without the 'packed_storage' encoding,
+// so this test just makes sure it still works with the encoding attached.
+
+// CHECK-LABEL: @convert_load_i4
+stream.executable private @convert_load_i4 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ util.func public @dispatch(%arg0: !stream.binding) {
+ %c0 = arith.constant 0 : index
+ // CHECK: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>>
+ %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>>
+ // CHECK: %[[DISPATCH_0:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>> -> tensor<?xi4, #iree_encoding.packed_storage>
+ %tile = iree_tensor_ext.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>> -> tensor<?xi4, #iree_encoding.packed_storage>
+ // CHECK: util.optimization_barrier %[[DISPATCH_0]] : tensor<?xi4, #iree_encoding.packed_storage>
+ util.optimization_barrier %tile : tensor<?xi4, #iree_encoding.packed_storage>
+ util.return
+ }
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @convert_store_i4
+
+stream.executable private @convert_store_i4 {
+ stream.executable.export public @dispatch
+ builtin.module {
+ util.func public @dispatch(%arg0: !stream.binding) {
+ // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi4, #iree_encoding.packed_storage>
+ %c0 = arith.constant 0 : index
+ // CHECK-DAG: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+ %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+ %cst = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi4, #iree_encoding.packed_storage>
+ // CHECK-NEXT: iree_tensor_ext.dispatch.tensor.store %[[CONSTANT_0]], %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : tensor<4xi4, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+ iree_tensor_ext.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi4, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+ util.return
+ }
+ }
+}
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
index b043696..a2c180f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
@@ -1,5 +1,19 @@
// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --verify-diagnostics %s | FileCheck %s
+// CHECK-LABEL: util.func public @denseTensorConstantI1Packed()
+util.func public @denseTensorConstantI1Packed() -> !stream.resource<constant> {
+ // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 2 : index
+ // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[STATIC_SIZE]]} =
+ // CHECK-SAME: dense<[false, true, false, true, false, true, false, false, false, true, true, true]> : tensor<12xi1, #iree_encoding.packed_storage>
+ %0 = stream.tensor.constant : tensor<12xi1, #iree_encoding.packed_storage> in !stream.resource<constant> = dense<[
+ false, true, false, true, false, true, false, false, false, true, true, true
+ ]> : tensor<12xi1, #iree_encoding.packed_storage>
+ // CHECK: util.return %[[RET]]
+ util.return %0 : !stream.resource<constant>
+}
+
+// -----
+
// CHECK-LABEL: util.func public @denseTensorConstantI2()
util.func public @denseTensorConstantI2() -> !stream.resource<constant> {
// CHECK: %[[STATIC_SIZE:.+]] = arith.constant 4 : index
@@ -117,6 +131,32 @@
// -----
+// CHECK-LABEL: @denseTensorSplatI1Packed
+util.func public @denseTensorSplatI1Packed(%arg0: i1, %arg1: index) -> !stream.resource<*> {
+ // CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i8
+ // CHECK: %[[EXTUI_0:.*]] = arith.extui %arg0 : i1 to i8
+ // CHECK: %[[SHLI_0:.*]] = arith.shli %[[EXTUI_0]], %[[CONSTANT_0]] : i8
+ // CHECK: %[[ORI_0:.*]] = arith.ori %[[SHLI_0]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_1:.*]] = arith.shli %[[ORI_0]], %[[CONSTANT_0]] : i8
+ // CHECK: %[[ORI_1:.*]] = arith.ori %[[SHLI_1]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_2:.*]] = arith.shli %[[ORI_1]], %[[CONSTANT_0]] : i8
+ // CHECK: %[[ORI_2:.*]] = arith.ori %[[SHLI_2]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_3:.*]] = arith.shli %[[ORI_2]], %[[CONSTANT_0]] : i8
+ // CHECK: %[[ORI_3:.*]] = arith.ori %[[SHLI_3]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_4:.*]] = arith.shli %[[ORI_3]], %[[CONSTANT_0]] : i8
+ // CHECK: %[[ORI_4:.*]] = arith.ori %[[SHLI_4]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_5:.*]] = arith.shli %[[ORI_4]], %[[CONSTANT_0]] : i8
+ // CHECK: %[[ORI_5:.*]] = arith.ori %[[SHLI_5]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_6:.*]] = arith.shli %[[ORI_5]], %[[CONSTANT_0]] : i8
+ // CHECK: %[[ORI_6:.*]] = arith.ori %[[SHLI_6]], %[[EXTUI_0]] : i8
+ // CHECK: %[[ASYNC_0:.*]] = stream.async.splat %[[ORI_6]] : i8 -> !stream.resource<*>{%arg1}
+ %0 = stream.tensor.splat %arg0 : i1 -> tensor<4x3xi1, #iree_encoding.packed_storage> in !stream.resource<*>{%arg1}
+ // CHECK: util.return %[[ASYNC_0]] : !stream.resource<*>
+ util.return %0 : !stream.resource<*>
+}
+
+// -----
+
// CHECK-LABEL: @denseTensorSplatI2
util.func public @denseTensorSplatI2(%arg0: i2, %arg1: index, %arg2: index) -> !stream.resource<*> {
// CHECK: %[[C2:.+]] = arith.constant 2 : i8
@@ -135,6 +175,41 @@
// -----
+// CHECK-LABEL: @denseTensorFillI1Packed
+util.func public @denseTensorFillI1Packed(%arg0: i1, %arg1: !stream.resource<*>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) -> !stream.resource<*> {
+ // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 8 : index
+ // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 16 : index
+ // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 1 : i8
+ // CHECK: %[[EXTUI_0:.*]] = arith.extui %arg0 : i1 to i8
+ // CHECK: %[[SHLI_0:.*]] = arith.shli %[[EXTUI_0]], %[[CONSTANT_2]] : i8
+ // CHECK: %[[ORI_0:.*]] = arith.ori %[[SHLI_0]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_1:.*]] = arith.shli %[[ORI_0]], %[[CONSTANT_2]] : i8
+ // CHECK: %[[ORI_1:.*]] = arith.ori %[[SHLI_1]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_2:.*]] = arith.shli %[[ORI_1]], %[[CONSTANT_2]] : i8
+ // CHECK: %[[ORI_2:.*]] = arith.ori %[[SHLI_2]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_3:.*]] = arith.shli %[[ORI_2]], %[[CONSTANT_2]] : i8
+ // CHECK: %[[ORI_3:.*]] = arith.ori %[[SHLI_3]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_4:.*]] = arith.shli %[[ORI_3]], %[[CONSTANT_2]] : i8
+ // CHECK: %[[ORI_4:.*]] = arith.ori %[[SHLI_4]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_5:.*]] = arith.shli %[[ORI_4]], %[[CONSTANT_2]] : i8
+ // CHECK: %[[ORI_5:.*]] = arith.ori %[[SHLI_5]], %[[EXTUI_0]] : i8
+ // CHECK: %[[SHLI_6:.*]] = arith.shli %[[ORI_5]], %[[CONSTANT_2]] : i8
+ // CHECK: %[[ORI_6:.*]] = arith.ori %[[SHLI_6]], %[[EXTUI_0]] : i8
+ // CHECK: %[[MULI_0:.*]] = arith.muli %arg4, %[[CONSTANT_1]] : index
+ // CHECK: %[[ADDI_0:.*]] = arith.addi %[[MULI_0]], %arg5 : index
+ // CHECK: %[[DIVUI_0:.*]] = arith.divui %[[ADDI_0]], %[[CONSTANT_0]] : index
+ // CHECK: %[[MULI_1:.*]] = arith.muli %arg6, %[[CONSTANT_1]] : index
+ // CHECK: %[[ADDI_1:.*]] = arith.addi %[[MULI_1]], %arg7 : index
+ // CHECK: %[[DIVUI_1:.*]] = arith.divui %[[ADDI_1]], %[[CONSTANT_0]] : index
+ // CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_0]], %[[DIVUI_1]] : index
+ // CHECK: %[[ASYNC_0:.*]] = stream.async.fill %[[ORI_6]], %arg1{{\[}}%[[DIVUI_0]] to %[[ADDI_2]] for %[[DIVUI_1]]] : i8 -> %arg1 as !stream.resource<*>{%arg3}
+ // CHECK: util.return %[[ASYNC_0]] : !stream.resource<*>
+ %0 = stream.tensor.fill %arg0, %arg1[%arg4, %arg5 for %arg6, %arg7] : i1 -> tensor<?x16xi1, #iree_encoding.packed_storage>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
+ util.return %0 : !stream.resource<*>
+}
+
+// -----
+
// CHECK-LABEL: @denseTensorFillI4
util.func public @denseTensorFillI4(%arg0: i4, %arg1: !stream.resource<*>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) -> !stream.resource<*> {
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
@@ -158,6 +233,22 @@
// -----
+// CHECK-LABEL: @denseTensorSliceI1Packed
+util.func public @denseTensorSliceI1Packed(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+ %c2 = arith.constant 2 : index
+ // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+ // CHECK: %[[MUL:.+]] = arith.muli %arg5, %[[C8]] : index
+ // CHECK: %[[ADD:.+]] = arith.addi %[[MUL]], %arg6 : index
+ // CHECK: %[[START:.+]] = arith.divui %[[ADD]], %[[C8]] : index
+ // CHECK: %[[END:.+]] = arith.addi %[[START]], %arg4 : index
+ // CHECK: %[[SLICE:.+]] = stream.async.slice %arg0[%[[START]] to %[[END]]] : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg4}
+ %0 = stream.tensor.slice %arg0[%arg5, %arg6 for %arg3, %c2] : tensor<?x8xi1, #iree_encoding.packed_storage>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x2xi1, #iree_encoding.packed_storage>{%arg3} in !stream.resource<*>{%arg4}
+ // CHECK: util.return %[[SLICE]] : !stream.resource<*>
+ util.return %0 : !stream.resource<*>
+}
+
+// -----
+
// CHECK-LABEL: @denseTensorSliceI2
util.func public @denseTensorSliceI2(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
%c2 = arith.constant 2 : index
@@ -193,6 +284,22 @@
// -----
+// CHECK-LABEL: @denseTensorUpdateI1Packed
+util.func public @denseTensorUpdateI1Packed(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+ // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+ // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+ // CHECK: %[[MUL:.+]] = arith.muli %arg5, %[[C4]] : index
+ // CHECK: %[[ADD:.+]] = arith.addi %[[MUL]], %arg6 : index
+ // CHECK: %[[START:.+]] = arith.divui %[[ADD]], %[[C8]] : index
+ // CHECK: %[[END:.+]] = arith.addi %[[START]], %arg1 : index
+ // CHECK: %[[UPDATE:.+]] = stream.async.update %arg0, %arg2[%[[START]] to %[[END]]] : !stream.resource<*>{%arg1} -> %arg2 as !stream.resource<*>{%arg4}
+ %0 = stream.tensor.update %arg0, %arg2[%arg5, %arg6] : tensor<8x4xi1, #iree_encoding.packed_storage> in !stream.resource<*>{%arg1} -> tensor<?x4xi1, #iree_encoding.packed_storage>{%arg3} in %arg2 as !stream.resource<*>{%arg4}
+ // CHECK: util.return %[[UPDATE]] : !stream.resource<*>
+ util.return %0 : !stream.resource<*>
+}
+
+// -----
+
// Ensures that a non-power-of-two type (i3) update is expanded to a full byte
// because we don't currently do unaligned sub-byte packing.