[Stream] Encode packed_storage device and host tensors (#22722)

Extends the Stream passes `EncodeDeviceTensors` and `EncodeHostTensors`
to correctly handle tensors with the `packed_storage` encoding attached.

The two passes now encode tensors carrying the `packed_storage` attribute
the same way that `i1` tensors are encoded when the experimental
`iree-experimental-packed-i1-storage` option is active.

---------

Signed-off-by: Lukas Sommer <lukas.sommer@amd.com>
diff --git a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp
index 32eed40..10d75a6 100644
--- a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp
+++ b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.cpp
@@ -86,6 +86,13 @@
                                         /*isPackedStorage=*/clEnableI1Support);
 }
 
+Type legalizeStorageElementType(RankedTensorType shapedType) {
+  bool isPackedStorage =  // Global option or per-tensor encoding attribute.
+      clEnableI1Support || IREE::Encoding::hasPackedStorageAttr(shapedType);
+  return legalizeStorageElementTypeImpl(shapedType.getElementType(),
+                                        isPackedStorage);
+}
+
 Value calculateStorageElementCountInBytes(Location loc,
                                           RankedTensorType shapedType,
                                           ValueRange dynamicDims,
diff --git a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h
index 7e5e8a2..efabcf0 100644
--- a/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h
+++ b/compiler/src/iree/compiler/Dialect/Encoding/Utils/ElementPackingUtils.h
@@ -30,6 +30,15 @@
 /// cases.
 Type legalizeStorageElementType(Type elementType);
 
+/// Legalizes the element type of |shapedType| for storage. Packed storage is
+/// requested if the tensor carries the packed-storage encoding attribute or
+/// the experimental packed i1 storage option is enabled.
+///
+/// In IREE, if compiling from the same source model, we control both the
+/// runtime and kernel, so supported sub-byte elements are tightly packed and
+/// other widths are expanded to the next power-of-two bit width.
+Type legalizeStorageElementType(RankedTensorType shapedType);
+
 /// Emits IR with the given |builder| to calculate the total number of bytes
 /// required for the given |shapedType| in storage. Returns the value for the
 /// final count on success; returns nullptr on failure. Dynamic dimensions in
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
index 4c31f31..6470980 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp
@@ -59,7 +59,7 @@
 // Aligns the element type of a tensor<> to a byte-aligned power of 2 bit width.
 static RankedTensorType alignTensorType(RankedTensorType originalType) {
   Type elementType = originalType.getElementType();
-  Type alignedType = legalizeStorageElementType(elementType);
+  Type alignedType = legalizeStorageElementType(originalType);
   if (alignedType == elementType)
     return originalType;
   return RankedTensorType::get(originalType.getShape(), alignedType,
@@ -133,7 +133,8 @@
 //
 // Returns the pattern converted to one of [i8, i16, i32, i64] (with i64 needing
 // to be handled via emulation) or nullptr if the type is unsupported.
-static Value canonicalizeFillPattern(Value pattern, OpBuilder &builder) {
+static Value canonicalizeFillPattern(Value pattern, RankedTensorType resultType,
+                                     OpBuilder &builder) {
   auto loc = pattern.getLoc();
 
   // Decompose complex numbers into the real/imag components and pack into an
@@ -153,14 +154,7 @@
         pattern);
   }
 
-  // HACK: extend i1 to i8. This is really not something we should be doing here
-  // in optimized programs as this is a super shady operation.
   unsigned elementBitWidth = IREE::Util::getTypeBitWidth(pattern.getType());
-  if (elementBitWidth == 1) {
-    return builder.createOrFold<arith::ExtUIOp>(loc, builder.getI8Type(),
-                                                pattern);
-  }
-
   // For packed sub-byte patterns, duplicate the sub-byte parts into a full
   // byte. We first extend the sub-byte parts into full bytes, and then keep
   // shifting left and bitwise or the sub-byte parts. For example, to create an
@@ -169,7 +163,7 @@
   //   %i8_val = (%i8_val << 2) | %i2_val
   //   %i8_val = (%i8_val << 2) | %i2_val
   //   %i8_val = (%i8_val << 2) | %i2_val
-  if (needToPackSubByteElementBitWidth(elementBitWidth)) {
+  if (needToPackSubByteElements(resultType)) {
     Type i8Type = builder.getI8Type();
     Value bitwidth = builder.createOrFold<arith::ConstantOp>(
         loc, i8Type, builder.getIntegerAttr(i8Type, elementBitWidth));
@@ -182,6 +176,15 @@
     }
     return i8Val;
   }
+
+  // HACK: For unpacked i1, extend i1 to i8. This is really not something we
+  // should be doing here in optimized programs as this is a super shady
+  // operation.
+  if (elementBitWidth == 1) {
+    return builder.createOrFold<arith::ExtUIOp>(loc, builder.getI8Type(),
+                                                pattern);
+  }
+
   if ((elementBitWidth % 8) != 0) {
     // We'd need some policy to determine how to handle non-byte-aligned widths.
     return {};
@@ -365,7 +368,7 @@
     }
 
     // Canonicalize the fill pattern into an integer type [i8, i16, i32, i64].
-    auto pattern = canonicalizeFillPattern(op.getValue(), rewriter);
+    auto pattern = canonicalizeFillPattern(op.getValue(), resultType, rewriter);
     if (!pattern) {
       return op.emitOpError()
              << "has unsupported pattern type " << op.getValue().getType()
@@ -460,7 +463,7 @@
     }
 
     // Canonicalize the fill pattern into an integer type [i8, i16, i32, i64].
-    auto pattern = canonicalizeFillPattern(op.getValue(), rewriter);
+    auto pattern = canonicalizeFillPattern(op.getValue(), targetType, rewriter);
     if (!pattern) {
       return op.emitOpError()
              << "has unsupported pattern type " << op.getValue().getType()
@@ -656,7 +659,8 @@
 static IREE::TensorExt::DispatchTensorType
 alignDispatchTensorType(IREE::TensorExt::DispatchTensorType originalType) {
   Type elementType = originalType.getBoundElementType();
-  Type alignedType = legalizeStorageElementType(elementType);
+  Type alignedType =
+      legalizeStorageElementType(originalType.asRankedTensorType());
   if (alignedType == elementType)
     return originalType;
   return IREE::TensorExt::DispatchTensorType::get(
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
index 1f48519..1a7d36d 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel
@@ -37,6 +37,7 @@
             "emplace_transients.mlir",
             "emplace_transients_scf.mlir",
             "encode_device_tensors.mlir",
+            "encode_device_tensors_encoding.mlir",
             "encode_device_tensors_packing.mlir",
             "encode_host_tensors.mlir",
             "encode_host_tensors_encoding.mlir",
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
index 739bd8d..cf32fcb 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt
@@ -35,6 +35,7 @@
     "emplace_transients.mlir"
     "emplace_transients_scf.mlir"
     "encode_device_tensors.mlir"
+    "encode_device_tensors_encoding.mlir"
     "encode_device_tensors_packing.mlir"
     "encode_host_tensors.mlir"
     "encode_host_tensors_encoding.mlir"
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_encoding.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_encoding.mlir
new file mode 100644
index 0000000..f31c74f
--- /dev/null
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_device_tensors_encoding.mlir
@@ -0,0 +1,105 @@
+// RUN: iree-opt --split-input-file --iree-stream-encode-device-tensors %s | FileCheck %s
+
+// CHECK-LABEL: @convert_load_i1
+stream.executable private @convert_load_i1 {
+  stream.executable.export public @dispatch
+  builtin.module {
+     util.func public @dispatch(%arg0: !stream.binding) {
+      %c0 = arith.constant 0 : index
+      // CHECK: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      // CHECK: %[[DISPATCH_0:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+      %tile = iree_tensor_ext.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+      // CHECK: util.optimization_barrier %[[DISPATCH_0]] : tensor<?xi1, #iree_encoding.packed_storage>
+      util.optimization_barrier %tile : tensor<?xi1, #iree_encoding.packed_storage>
+      util.return
+    }
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @convert_store_i1
+stream.executable private @convert_store_i1 {
+  stream.executable.export public @dispatch
+  builtin.module {
+     util.func public @dispatch(%arg0: !stream.binding) {
+      // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant dense<[false, false, true, true]> : tensor<4xi1, #iree_encoding.packed_storage>
+      %c0 = arith.constant 0 : index
+      // CHECK-DAG: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      %cst = arith.constant dense<[false, false, true, true]> : tensor<4xi1, #iree_encoding.packed_storage>
+      // CHECK-NEXT: iree_tensor_ext.dispatch.tensor.store %[[CONSTANT_0]], %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : tensor<4xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      iree_tensor_ext.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      util.return
+    }
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @convert_multi_i1
+stream.executable private @convert_multi_i1 {
+  stream.executable.export public @dispatch
+  builtin.module {
+     util.func public @dispatch(%arg0: !stream.binding, %arg1: !stream.binding) {
+      %c0 = arith.constant 0 : index
+      %c4 = arith.constant 4 : index
+      // CHECK: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      %binding0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>>
+      // CHECK: %[[BINDING_1:.*]] = stream.binding.subspan %arg1{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+      %binding1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+      // CHECK: %[[DISPATCH_0:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+      %tile0 = iree_tensor_ext.dispatch.tensor.load %binding0, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+      // CHECK: %[[DISPATCH_1:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_1]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+      %tile1 = iree_tensor_ext.dispatch.tensor.load %binding1, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>> -> tensor<?xi1, #iree_encoding.packed_storage>
+      // CHECK: %[[ORI_0:.*]] = arith.ori %[[DISPATCH_0]], %[[DISPATCH_1]] : tensor<?xi1, #iree_encoding.packed_storage>
+      %result = arith.ori %tile0, %tile1 : tensor<?xi1, #iree_encoding.packed_storage>
+      // CHECK-NEXT: iree_tensor_ext.dispatch.tensor.store %[[ORI_0]], %[[BINDING_1]], {{.+}} : tensor<?xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+      iree_tensor_ext.dispatch.tensor.store %result, %binding1, offsets = [0], sizes = [%c4], strides = [1] : tensor<?xi1, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<4xi1, #iree_encoding.packed_storage>>
+      util.return
+    }
+  }
+}
+
+// -----
+
+// Check that i4 elements stay packed and are not extended to a full byte. This is also the default behavior without the
+// 'packed_storage' encoding, so these tests only verify that it still holds with the encoding attached.
+
+// CHECK-LABEL: @convert_load_i4
+stream.executable private @convert_load_i4 {
+  stream.executable.export public @dispatch
+  builtin.module {
+     util.func public @dispatch(%arg0: !stream.binding) {
+      %c0 = arith.constant 0 : index
+      // CHECK: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>>
+      %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>>
+      // CHECK: %[[DISPATCH_0:.*]] = iree_tensor_ext.dispatch.tensor.load %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>> -> tensor<?xi4, #iree_encoding.packed_storage>
+      %tile = iree_tensor_ext.dispatch.tensor.load %binding, offsets = [0], sizes = [4], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4xi4, #iree_encoding.packed_storage>> -> tensor<?xi4, #iree_encoding.packed_storage>
+      // CHECK: util.optimization_barrier %[[DISPATCH_0]] : tensor<?xi4, #iree_encoding.packed_storage>
+      util.optimization_barrier %tile : tensor<?xi4, #iree_encoding.packed_storage>
+      util.return
+    }
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @convert_store_i4
+
+stream.executable private @convert_store_i4 {
+  stream.executable.export public @dispatch
+  builtin.module {
+     util.func public @dispatch(%arg0: !stream.binding) {
+      // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi4, #iree_encoding.packed_storage>
+      %c0 = arith.constant 0 : index
+      // CHECK-DAG: %[[BINDING_0:.*]] = stream.binding.subspan %arg0{{.+}} : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+      %binding = stream.binding.subspan %arg0[%c0] : !stream.binding -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+      %cst = arith.constant dense<[0, 7, 2, 5]> : tensor<4xi4, #iree_encoding.packed_storage>
+      // CHECK-NEXT: iree_tensor_ext.dispatch.tensor.store %[[CONSTANT_0]], %[[BINDING_0]], offsets = [0], sizes = [4], strides = [1] : tensor<4xi4, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+      iree_tensor_ext.dispatch.tensor.store %cst, %binding, offsets = [0], sizes = [4], strides = [1] : tensor<4xi4, #iree_encoding.packed_storage> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4xi4, #iree_encoding.packed_storage>>
+      util.return
+    }
+  }
+}
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
index b043696..a2c180f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing.mlir
@@ -1,5 +1,19 @@
 // RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --verify-diagnostics %s | FileCheck %s
 
+// CHECK-LABEL: util.func public @denseTensorConstantI1Packed()
+util.func public @denseTensorConstantI1Packed() -> !stream.resource<constant> {
+  // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 2 : index
+  // CHECK: %[[RET:.+]] = stream.async.constant : !stream.resource<constant>{%[[STATIC_SIZE]]} =
+  // CHECK-SAME: dense<[false, true, false, true, false, true, false, false, false, true, true, true]> : tensor<12xi1, #iree_encoding.packed_storage>
+  %0 = stream.tensor.constant : tensor<12xi1, #iree_encoding.packed_storage> in !stream.resource<constant> = dense<[
+    false, true, false, true, false, true, false, false, false, true, true, true
+  ]> : tensor<12xi1, #iree_encoding.packed_storage>
+  // CHECK: util.return %[[RET]]
+  util.return %0 : !stream.resource<constant>
+}
+
+// -----
+
 // CHECK-LABEL:  util.func public @denseTensorConstantI2()
 util.func public @denseTensorConstantI2() -> !stream.resource<constant> {
   // CHECK: %[[STATIC_SIZE:.+]] = arith.constant 4 : index
@@ -117,6 +131,32 @@
 
 // -----
 
+// CHECK-LABEL: @denseTensorSplatI1Packed
+util.func public @denseTensorSplatI1Packed(%arg0: i1, %arg1: index) -> !stream.resource<*> {
+  // CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : i8
+  // CHECK: %[[EXTUI_0:.*]] = arith.extui %arg0 : i1 to i8
+  // CHECK: %[[SHLI_0:.*]] = arith.shli %[[EXTUI_0]], %[[CONSTANT_0]] : i8
+  // CHECK: %[[ORI_0:.*]] = arith.ori %[[SHLI_0]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_1:.*]] = arith.shli %[[ORI_0]], %[[CONSTANT_0]] : i8
+  // CHECK: %[[ORI_1:.*]] = arith.ori %[[SHLI_1]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_2:.*]] = arith.shli %[[ORI_1]], %[[CONSTANT_0]] : i8
+  // CHECK: %[[ORI_2:.*]] = arith.ori %[[SHLI_2]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_3:.*]] = arith.shli %[[ORI_2]], %[[CONSTANT_0]] : i8
+  // CHECK: %[[ORI_3:.*]] = arith.ori %[[SHLI_3]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_4:.*]] = arith.shli %[[ORI_3]], %[[CONSTANT_0]] : i8
+  // CHECK: %[[ORI_4:.*]] = arith.ori %[[SHLI_4]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_5:.*]] = arith.shli %[[ORI_4]], %[[CONSTANT_0]] : i8
+  // CHECK: %[[ORI_5:.*]] = arith.ori %[[SHLI_5]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_6:.*]] = arith.shli %[[ORI_5]], %[[CONSTANT_0]] : i8
+  // CHECK: %[[ORI_6:.*]] = arith.ori %[[SHLI_6]], %[[EXTUI_0]] : i8
+  // CHECK: %[[ASYNC_0:.*]] = stream.async.splat %[[ORI_6]] : i8 -> !stream.resource<*>{%arg1}
+  %0 = stream.tensor.splat %arg0 : i1 -> tensor<4x3xi1, #iree_encoding.packed_storage> in !stream.resource<*>{%arg1}
+  // CHECK: util.return %[[ASYNC_0]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
+}
+
+// -----
+
 // CHECK-LABEL: @denseTensorSplatI2
 util.func public @denseTensorSplatI2(%arg0: i2, %arg1: index, %arg2: index) -> !stream.resource<*> {
   // CHECK: %[[C2:.+]] = arith.constant 2 : i8
@@ -135,6 +175,41 @@
 
 // -----
 
+// CHECK-LABEL: @denseTensorFillI1Packed
+util.func public @denseTensorFillI1Packed(%arg0: i1, %arg1: !stream.resource<*>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) -> !stream.resource<*> {
+  // CHECK-DAG: %[[CONSTANT_0:.*]] = arith.constant 8 : index
+  // CHECK-DAG: %[[CONSTANT_1:.*]] = arith.constant 16 : index
+  // CHECK-DAG: %[[CONSTANT_2:.*]] = arith.constant 1 : i8
+  // CHECK: %[[EXTUI_0:.*]] = arith.extui %arg0 : i1 to i8
+  // CHECK: %[[SHLI_0:.*]] = arith.shli %[[EXTUI_0]], %[[CONSTANT_2]] : i8
+  // CHECK: %[[ORI_0:.*]] = arith.ori %[[SHLI_0]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_1:.*]] = arith.shli %[[ORI_0]], %[[CONSTANT_2]] : i8
+  // CHECK: %[[ORI_1:.*]] = arith.ori %[[SHLI_1]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_2:.*]] = arith.shli %[[ORI_1]], %[[CONSTANT_2]] : i8
+  // CHECK: %[[ORI_2:.*]] = arith.ori %[[SHLI_2]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_3:.*]] = arith.shli %[[ORI_2]], %[[CONSTANT_2]] : i8
+  // CHECK: %[[ORI_3:.*]] = arith.ori %[[SHLI_3]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_4:.*]] = arith.shli %[[ORI_3]], %[[CONSTANT_2]] : i8
+  // CHECK: %[[ORI_4:.*]] = arith.ori %[[SHLI_4]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_5:.*]] = arith.shli %[[ORI_4]], %[[CONSTANT_2]] : i8
+  // CHECK: %[[ORI_5:.*]] = arith.ori %[[SHLI_5]], %[[EXTUI_0]] : i8
+  // CHECK: %[[SHLI_6:.*]] = arith.shli %[[ORI_5]], %[[CONSTANT_2]] : i8
+  // CHECK: %[[ORI_6:.*]] = arith.ori %[[SHLI_6]], %[[EXTUI_0]] : i8
+  // CHECK: %[[MULI_0:.*]] = arith.muli %arg4, %[[CONSTANT_1]] : index
+  // CHECK: %[[ADDI_0:.*]] = arith.addi %[[MULI_0]], %arg5 : index
+  // CHECK: %[[DIVUI_0:.*]] = arith.divui %[[ADDI_0]], %[[CONSTANT_0]] : index
+  // CHECK: %[[MULI_1:.*]] = arith.muli %arg6, %[[CONSTANT_1]] : index
+  // CHECK: %[[ADDI_1:.*]] = arith.addi %[[MULI_1]], %arg7 : index
+  // CHECK: %[[DIVUI_1:.*]] = arith.divui %[[ADDI_1]], %[[CONSTANT_0]] : index
+  // CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_0]], %[[DIVUI_1]] : index
+  // CHECK: %[[ASYNC_0:.*]] = stream.async.fill %[[ORI_6]], %arg1{{\[}}%[[DIVUI_0]] to %[[ADDI_2]] for %[[DIVUI_1]]] : i8 -> %arg1 as !stream.resource<*>{%arg3}
+  // CHECK: util.return %[[ASYNC_0]] : !stream.resource<*>
+  %0 = stream.tensor.fill %arg0, %arg1[%arg4, %arg5 for %arg6, %arg7] : i1 -> tensor<?x16xi1, #iree_encoding.packed_storage>{%arg2} in %arg1 as !stream.resource<*>{%arg3}
+  util.return %0 : !stream.resource<*>
+}
+
+// -----
+
 // CHECK-LABEL: @denseTensorFillI4
 util.func public @denseTensorFillI4(%arg0: i4, %arg1: !stream.resource<*>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) -> !stream.resource<*> {
   // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
@@ -158,6 +233,22 @@
 
 // -----
 
+// CHECK-LABEL: @denseTensorSliceI1Packed
+util.func public @denseTensorSliceI1Packed(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+  %c2 = arith.constant 2 : index
+  // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+  // CHECK: %[[MUL:.+]] = arith.muli %arg5, %[[C8]] : index
+  // CHECK: %[[ADD:.+]] = arith.addi %[[MUL]], %arg6 : index
+  // CHECK: %[[START:.+]] = arith.divui %[[ADD]], %[[C8]] : index
+  // CHECK: %[[END:.+]] = arith.addi %[[START]], %arg4 : index
+  // CHECK: %[[SLICE:.+]] = stream.async.slice %arg0[%[[START]] to %[[END]]] : !stream.resource<*>{%arg2} -> !stream.resource<*>{%arg4}
+  %0 = stream.tensor.slice %arg0[%arg5, %arg6 for %arg3, %c2] : tensor<?x8xi1, #iree_encoding.packed_storage>{%arg1} in !stream.resource<*>{%arg2} -> tensor<?x2xi1, #iree_encoding.packed_storage>{%arg3} in !stream.resource<*>{%arg4}
+  // CHECK: util.return %[[SLICE]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
+}
+
+// -----
+
 // CHECK-LABEL: @denseTensorSliceI2
 util.func public @denseTensorSliceI2(%arg0: !stream.resource<*>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
   %c2 = arith.constant 2 : index
@@ -193,6 +284,22 @@
 
 // -----
 
+// CHECK-LABEL: @denseTensorUpdateI1Packed
+util.func public @denseTensorUpdateI1Packed(%arg0: !stream.resource<*>, %arg1: index, %arg2: !stream.resource<*>, %arg3: index, %arg4: index, %arg5: index, %arg6: index) -> !stream.resource<*> {
+  // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
+  // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
+  // CHECK: %[[MUL:.+]] = arith.muli %arg5, %[[C4]] : index
+  // CHECK: %[[ADD:.+]] = arith.addi %[[MUL]], %arg6 : index
+  // CHECK: %[[START:.+]] = arith.divui %[[ADD]], %[[C8]] : index
+  // CHECK: %[[END:.+]] = arith.addi %[[START]], %arg1 : index
+  // CHECK: %[[UPDATE:.+]] = stream.async.update %arg0, %arg2[%[[START]] to %[[END]]] : !stream.resource<*>{%arg1} -> %arg2 as !stream.resource<*>{%arg4}
+  %0 = stream.tensor.update %arg0, %arg2[%arg5, %arg6] : tensor<8x4xi1, #iree_encoding.packed_storage> in !stream.resource<*>{%arg1} -> tensor<?x4xi1, #iree_encoding.packed_storage>{%arg3} in %arg2 as !stream.resource<*>{%arg4}
+  // CHECK: util.return %[[UPDATE]] : !stream.resource<*>
+  util.return %0 : !stream.resource<*>
+}
+
+// -----
+
 // Ensures that a non-power-of-two type (i3) update is expanded to a full byte
 // because we don't currently do unaligned sub-byte packing.