Enable tile+distribute+vectorize for unpack ops. (#11232)
The workgroup tile sizes are rounded down to multiples of the inner tile sizes. In this
context, we don't need stack buffer allocation for holding temporary results.
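A minimal standalone sketch of that fixup (plain C++ with no LLVM dependencies; `alignDown` mirrors `llvm::alignDown`, the sizes mirror the new lit test in the diff with `tile_sizes = [64, 64]` and `inner_tiles = [32, 16]`, and the dynamic-inner-tile check from the real `setRootConfig` is omitted for brevity):
```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors llvm::alignDown: the largest multiple of `align` that is <= `value`.
static int64_t alignDown(int64_t value, int64_t align) {
  return (value / align) * align;
}

int main() {
  // Default workgroup tile sizes before the fixup (per the lit test's
  // lowering_config of [[64, 64]]).
  std::vector<int64_t> tileSizes = {64, 64};
  // From the new lit test: inner_dims_pos = [0, 1], inner_tiles = [32, 16].
  std::vector<int64_t> dimPos = {0, 1};
  std::vector<int64_t> innerTiles = {32, 16};

  for (size_t i = 0; i < dimPos.size(); ++i) {
    int64_t pos = dimPos[i];
    int64_t size = innerTiles[i];
    if (tileSizes[pos] == 0) continue;  // Untiled dimension: leave it alone.
    // Round the tile size down so a workgroup tile never splits an inner tile,
    // which is what lets us avoid stack buffers for partial results.
    tileSizes[pos] = alignDown(tileSizes[pos], size);
  }

  // Prints "64 64": both defaults already divide evenly. With a hypothetical
  // inner tile of 48, the fixup would shrink the size to alignDown(64, 48) == 48.
  std::printf("%lld %lld\n", (long long)tileSizes[0], (long long)tileSizes[1]);
  return 0;
}
```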
Test case:
```mlir
%result = iree_linalg_ext.unpack %result_4d inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %result_init
: (tensor<32x32x8x8xi32> tensor<256x256xi32>) -> tensor<256x256xi32>
```
The commit gives a ~1.89x improvement (93.63 us -> 49.44 us).
diff --git a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp
index 49b5da3..69b27de 100644
--- a/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingInterface.cpp
@@ -379,6 +379,7 @@
return tilingResult;
}
+ SmallVector<Operation *> tiledImplementation;
{
SmallVector<OpFoldResult> offsets, sizes;
// If there is an interchange specified, permute the iteration domain and
@@ -458,13 +459,8 @@
if (!tilingResult.loops.empty())
rewriter.setInsertionPoint(
tilingResult.loops.back().getBody()->getTerminator());
- SmallVector<Operation *> tiledImplementation =
- op.getTiledImplementation(rewriter, offsets, sizes);
- if (tiledImplementation.size() != 1) {
- return rewriter.notifyMatchFailure(
- op, "expected tiled implementation to return a single op");
- }
- tilingResult.tiledOp = tiledImplementation[0];
+ tiledImplementation = op.getTiledImplementation(rewriter, offsets, sizes);
+ tilingResult.tiledOp = tiledImplementation.back();
LLVM_DEBUG({
if (!tilingResult.loops.empty()) {
@@ -478,7 +474,9 @@
}
// Update the filter.
- filter.replaceLinalgTransformationFilter(rewriter, tilingResult.tiledOp);
+ for (auto tiledOp : tiledImplementation) {
+ filter.replaceLinalgTransformationFilter(rewriter, tiledOp);
+ }
if (op->getNumResults() == 0) {
rewriter.eraseOp(op);
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index 63e238e..3740539 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -1872,3 +1872,46 @@
// CHECK: %[[SIZE_X:.+]] = affine.min #[[MAP8]](%{{.+}})[%{{.+}}]
// CHECK: flow.dispatch.tensor.store
// CHECK-SAME: sizes = [%[[SIZE_Y]], %[[SIZE_X]], 8, 4]
+
+// -----
+
+hal.executable private @dynamic_unpack {
+ hal.executable.variant public @embedded_elf_x86_64, target = <"llvm-cpu", "embedded-elf-x86_64", {}> {
+ hal.executable.export public @dynamic_unpack ordinal(0) layout(
+ #hal.pipeline.layout<push_constants = 4, sets = [
+ <0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>)
+ attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
+ %c1 = arith.constant 1 : index
+ %0 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg1]
+ %1 = affine.apply affine_map<()[s0] -> (s0 ceildiv 64)>()[%arg2]
+ hal.return %1, %0, %c1 : index, index, index
+ }
+ builtin.module {
+ func.func @dynamic_unpack() {
+ %c131072 = arith.constant 131072 : index
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_castui %0 : i32 to index
+ %5 = arith.index_castui %1 : i32 to index
+ %6 = arith.index_castui %2 : i32 to index
+ %7 = arith.index_castui %3 : i32 to index
+ %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
+ %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c131072) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+ %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5} -> tensor<?x?x32x16xi32>
+ %11 = tensor.empty(%6, %7) : tensor<?x?xi32>
+ %12 = iree_linalg_ext.unpack {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[64, 64]]>}
+ %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : (tensor<?x?x32x16xi32> tensor<?x?xi32>) -> tensor<?x?xi32>
+ flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+ return
+ }
+ }
+ }
+}
+// CHECK-LABEL: func.func @dynamic_unpack
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: iree_linalg_ext.unpack
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp
index 4844275..9766fcd 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.cpp
@@ -210,7 +210,7 @@
IREE::LinalgExt::PackOp::attachInterface<
OuterParallelAsPartitionableLoops<IREE::LinalgExt::PackOp>>(*ctx);
IREE::LinalgExt::UnPackOp::attachInterface<
- NoPartitionableLoops<IREE::LinalgExt::UnPackOp>>(*ctx);
+ OuterParallelAsPartitionableLoops<IREE::LinalgExt::UnPackOp>>(*ctx);
IREE::LinalgExt::ScanOp::attachInterface<
AllParallelAsPartitionableLoops<IREE::LinalgExt::ScanOp>>(*ctx);
IREE::LinalgExt::ScatterOp::attachInterface<
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index c4927de..7a529b2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -1047,6 +1047,7 @@
workgroupTileSizes[dim] = 0;
}
}
+
return workgroupTileSizes;
}
@@ -1057,6 +1058,27 @@
entryPointFn, op, tileSizes, DispatchLoweringPassPipeline::CPUDataTiling);
}
+static LogicalResult setRootConfig(
+ func::FuncOp entryPointFn, IREE::LinalgExt::UnPackOp op,
+ DispatchLoweringPassPipeline pipeline =
+ DispatchLoweringPassPipeline::CPUDataTiling) {
+ SmallVector<int64_t> tileSizes = getLinalgExtDefaultWorkgroupTileSizes(op);
+
+ // Fixup for making tileSizes be multiple of inner_tile_sizes.
+ SmallVector<int64_t> innerTiles = op.getStaticTiles();
+ SmallVector<int64_t> dimPos = extractFromI64ArrayAttr(op.getInnerDimsPos());
+ for (auto it : llvm::zip(dimPos, innerTiles)) {
+ int64_t pos = std::get<0>(it);
+ int64_t size = std::get<1>(it);
+ if (tileSizes[pos] == 0 || ShapedType::isDynamic(size)) continue;
+ tileSizes[pos] = llvm::alignDown(tileSizes[pos], size);
+ }
+
+ TileSizesListType tileSizesList = {tileSizes};
+ return setOpConfigAndEntryPointFnTranslation(entryPointFn, op, tileSizesList,
+ pipeline);
+}
+
/// Sets the lowering configuration for dispatch region for linalg_ext.fft
/// root op.
static LogicalResult setRootConfig(
@@ -1601,15 +1623,15 @@
return setRootConfig(entryPointFn, op, LinalgOpInfo(op),
targetMLTransInfo);
})
- .Case<IREE::LinalgExt::FftOp, linalg::Mmt4DOp, linalg::Conv2DNhwcHwcfOp,
- linalg::Conv2DNchwFchwOp, linalg::DepthwiseConv2DNhwcHwcOp>(
+ .Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::PackOp,
+ IREE::LinalgExt::UnPackOp, linalg::Mmt4DOp,
+ linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
+ linalg::DepthwiseConv2DNhwcHwcOp>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
.Case<linalg::ContractionOpInterface>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
.Case<linalg::LinalgOp>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
- .Case<IREE::LinalgExt::PackOp>(
- [&](auto op) { return setRootConfig(entryPointFn, op); })
.Case<TilingInterface>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
.Default([&](Operation *op) { return success(); });
@@ -1626,7 +1648,7 @@
// Redirect to individual operations.
auto setRootConfigFn = [&](Operation *op) -> LogicalResult {
return TypeSwitch<Operation *, LogicalResult>(op)
- .Case<IREE::LinalgExt::FftOp>([&](auto op) {
+ .Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::UnPackOp>([&](auto op) {
return setRootConfig(entryPointFn, op,
DispatchLoweringPassPipeline::VMVXDefault);
})