Add support for vectorizing normal convolution ops. (#8460)
This PR gives slight improvements on MobileNet V2 and V3; the deltas on the other models are likely noise. PoseNet regresses in the multi-threaded configuration, which is just a matter of configuration tuning, so it is acceptable for now.
Configurations: taskset 80 + dylib-sync (single-threaded, pinned to one big core) on a local Pixel 4
| Model | Before | After |
| ---------------- | ------- | ------- |
| DeepLabV3 | 705 ms | 704 ms |
| MobileBertSquad | 669 ms | 673 ms |
| MobileNetV2 | 104 ms | 103 ms |
| MobileNetV3Small | 25.4 ms | 24.7 ms |
| MobileSSD | 205 ms | 201 ms |
| PoseNet | 546 ms | 545 ms |
Configurations: taskset f0 + dylib (multi-threaded, pinned to the four big cores) on a local Pixel 4
| Model | Before | After |
| ---------------- | ------- | ------- |
| DeepLabV3 | 371 ms | 374 ms |
| MobileBertSquad | 375 ms | 374 ms |
| MobileNetV2 | 66.1 ms | 64.5 ms |
| MobileNetV3Small | 21.4 ms | 20.9 ms |
| MobileSSD | 92.7 ms | 91.2 ms |
| PoseNet | 189 ms | 202 ms |
diff --git a/iree/compiler/Codegen/Dialect/LoweringConfig.td b/iree/compiler/Codegen/Dialect/LoweringConfig.td
index 204eaee..44179a6 100644
--- a/iree/compiler/Codegen/Dialect/LoweringConfig.td
+++ b/iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -16,6 +16,8 @@
: StrEnumAttrCase<"CPUSingleTilingExpert">;
def CPU_DoubleTilingExpert
: StrEnumAttrCase<"CPUDoubleTilingExpert">;
+def CPU_ConvTileAndDecomposeExpert
+ : StrEnumAttrCase<"CPUConvTileAndDecomposeExpert">;
def CPU_TileFuseAndVectorize
: StrEnumAttrCase<"CPUTileFuseAndVectorize">;
@@ -46,9 +48,10 @@
"DispatchLoweringPassPipeline",
"identifier for pass pipeline use to lower dispatch region",
[CPU_Default, CPU_SingleTilingExpert, CPU_DoubleTilingExpert,
- CPU_TileFuseAndVectorize, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
- LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore, SPIRV_Distribute,
- SPIRV_DistributeCopy, SPIRV_Vectorize,SPIRV_VectorizeToCooperativeOps,
+ CPU_ConvTileAndDecomposeExpert, CPU_TileFuseAndVectorize,
+ LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize, LLVMGPU_MatmulSimt,
+ LLVMGPU_MatmulTensorCore, SPIRV_Distribute, SPIRV_DistributeCopy,
+ SPIRV_Vectorize, SPIRV_VectorizeToCooperativeOps,
None]> {
let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
}
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 384c29e..fbdb83e 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -198,7 +198,8 @@
/// Adjusts the workload per workgroup to be a multiple of vector size to ensure
/// that the op vectorizes.
static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
- int64_t vectorSizeVal) {
+ int64_t vectorSizeVal,
+ Optional<int64_t> fallbackSizeVal = llvm::None) {
if (ub == ShapedType::kDynamicSize || lb == ShapedType::kDynamicSize) {
return maxSize;
}
@@ -209,7 +210,7 @@
return i;
}
}
- return vectorSizeVal;
+ return fallbackSizeVal ? fallbackSizeVal.getValue() : vectorSizeVal;
}
/// Returns the tile size to use for the Flow level of an operation that
@@ -276,6 +277,22 @@
return distributedLevelTileSizes;
}
+/// Splits the given tile sizes into parallel and reduction parts: entries for
+/// reduction loops are moved into reductionSizes and zeroed in parallelSizes.
+static void splitParallelAndReductionTiles(
+    linalg::LinalgOp op, SmallVectorImpl<int64_t> &parallelSizes,
+ SmallVectorImpl<int64_t> &reductionSizes) {
+ reductionSizes.assign(parallelSizes.begin(), parallelSizes.end());
+ for (auto iteratorType : llvm::enumerate(op.iterator_types())) {
+ if (iteratorType.value().cast<StringAttr>().getValue() ==
+ getParallelIteratorTypeName()) {
+ reductionSizes[iteratorType.index()] = 0;
+ } else {
+ parallelSizes[iteratorType.index()] = 0;
+ }
+ }
+}
+
/// Sets the default configuration to use for an operation that implements the
/// `PartitionableLoopsInterface`, given the iteration domain of all the loops.
static LogicalResult setDefaultRootConfig(
@@ -580,9 +597,9 @@
// Set the flow level tiling to the default.
OpBuilder builder(genericOp.getContext());
builder.setInsertionPoint(genericOp);
+ auto linalgOp = cast<linalg::LinalgOp>(genericOp.getOperation());
SmallVector<Range> iterationDomain =
- cast<linalg::LinalgOp>(genericOp.getOperation())
- .createLoopRanges(builder, genericOp.getLoc());
+ linalgOp.createLoopRanges(builder, genericOp.getLoc());
auto partitionableLoopsInterfaceOp =
cast<IREE::Flow::PartitionableLoopsInterface>(genericOp.getOperation());
SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
@@ -592,7 +609,7 @@
// Set the Next level tile sizes.
SmallVector<int64_t> l1TileSizes(numLoops, 0);
Optional<SmallVector<int64_t, 4>> staticLoopRanges =
- cast<linalg::LinalgOp>(genericOp.getOperation()).getStaticLoopRanges();
+ linalgOp.getStaticLoopRanges();
for (auto loopNum : llvm::seq<unsigned>(0, numLoops)) {
if (flowTileSizes[loopNum]) {
l1TileSizes[loopNum] =
@@ -607,16 +624,8 @@
: minTileSizes[loopNum];
}
}
-
- SmallVector<int64_t> vectorTileSizes = l1TileSizes;
- for (auto iteratorType : llvm::enumerate(genericOp.iterator_types())) {
- if (iteratorType.value().cast<StringAttr>().getValue() ==
- getParallelIteratorTypeName()) {
- vectorTileSizes[iteratorType.index()] = 0;
- } else {
- l1TileSizes[iteratorType.index()] = 0;
- }
- }
+ SmallVector<int64_t> vectorTileSizes;
+ splitParallelAndReductionTiles(linalgOp, l1TileSizes, vectorTileSizes);
TileSizesListType tileSizes;
tileSizes.push_back(flowTileSizes);
@@ -628,6 +637,57 @@
DispatchLoweringPassPipeline::CPUDoubleTilingExpert);
}
+/// Sets the lowering configuration for linalg.conv_2d_nhwc_hwcf operations.
+static LogicalResult setRootConfig(
+ FuncOp entryPointFn, linalg::Conv2DNhwcHwcfOp convOp,
+ ArrayRef<LoopTilingAndDistributionInfo> tiledLoops) {
+ auto linalgOp = cast<linalg::LinalgOp>(convOp.getOperation());
+  // Use the default distribution for the convolution loops.
+ unsigned numLoops = linalgOp.getNumLoops();
+ int64_t vectorSize =
+ getVectorSize(entryPointFn, convOp.getResult(0).getType());
+ SmallVector<int64_t> minTileSizes(numLoops, 1);
+ SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize);
+
+ // Set the flow level tiling to the default.
+ OpBuilder builder(convOp.getContext());
+ builder.setInsertionPoint(convOp);
+ SmallVector<Range> iterationDomain =
+ linalgOp.createLoopRanges(builder, convOp.getLoc());
+ auto partitionableLoopsInterfaceOp =
+ cast<IREE::Flow::PartitionableLoopsInterface>(convOp.getOperation());
+ SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
+ iterationDomain, partitionableLoopsInterfaceOp, minTileSizes,
+ maxTileSizes);
+
+  // Static loop ranges: (N, OH, OW, OC, KH, KW, IC).
+ Optional<SmallVector<int64_t, 4>> shapes = linalgOp.getStaticLoopRanges();
+ assert(shapes.hasValue() &&
+ "something went wrong when inferring loop ranges");
+
+ SmallVector<int64_t> l1TileSizes = {1, 1, 8, vectorSize * 2, 1, 1, 8};
+ for (auto i : llvm::seq<unsigned>(0, l1TileSizes.size())) {
+ auto tileSize = flowTileSizes[i] ? flowTileSizes[i] : shapes.getValue()[i];
+    // If the tile size is intended to be 1, do not adjust it to `vectorSize`;
+    // keeping it at 1 lets the op be decomposed into lower-rank named ops.
+ if (l1TileSizes[i] != 1) {
+ l1TileSizes[i] = getMaxTileSize(0, tileSize, l1TileSizes[i], vectorSize,
+                                    /*fallbackSizeVal=*/1);
+ }
+ }
+ SmallVector<int64_t> vectorTileSizes;
+ splitParallelAndReductionTiles(linalgOp, l1TileSizes, vectorTileSizes);
+
+ TileSizesListType tileSizes;
+ tileSizes.push_back(flowTileSizes);
+ tileSizes.push_back(l1TileSizes);
+ tileSizes.push_back(vectorTileSizes);
+ return setOpConfigAndEntryPointFnTranslation(
+ entryPointFn, convOp, tileSizes,
+ /*nativeVectorSize=*/ArrayRef<int64_t>{},
+ DispatchLoweringPassPipeline::CPUConvTileAndDecomposeExpert);
+}
+
/// Set default configuration for Linalg ops.
static LogicalResult setRootConfig(
FuncOp entryPointFn, linalg::LinalgOp linalgOp,
@@ -673,10 +733,10 @@
// Redirect to individual operations.
auto setRootConfigFn = [&](Operation *op) -> LogicalResult {
return TypeSwitch<Operation *, LogicalResult>(op)
- .Case<IREE::LinalgExt::FftOp, linalg::GenericOp, linalg::Mmt4DOp>(
- [&](auto op) {
- return setRootConfig(entryPointFn, op, tiledLoops);
- })
+ .Case<IREE::LinalgExt::FftOp, linalg::GenericOp, linalg::Mmt4DOp,
+ linalg::Conv2DNhwcHwcfOp>([&](auto op) {
+ return setRootConfig(entryPointFn, op, tiledLoops);
+ })
.Case<linalg::ContractionOpInterface>([&](auto op) {
return setRootConfig(entryPointFn, op, tiledLoops);
})
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 3644e91..bd93ba6 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -182,6 +182,10 @@
addDoubleTilingExpertPassPipeline(nestedModulePM);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
+ CPUConvTileAndDecomposeExpert:
+ addConvTileAndDecomposeExpertPassPipeline(nestedModulePM);
+ break;
+ case IREE::Codegen::DispatchLoweringPassPipeline::
CPUTileFuseAndVectorize:
addTileFuseAndVectorizePassPipeline(nestedModulePM, lowerToVectors);
break;
diff --git a/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 4f0cf24..ef7755d 100644
--- a/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -259,6 +259,62 @@
}
}
+void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager) {
+ // Do first level of tiling and distribution.
+ passManager.addNestedPass<FuncOp>(createTileAndDistributeToWorkgroupsPass());
+ passManager.addPass(createCanonicalizerPass());
+ passManager.addPass(createCSEPass());
+
+ passManager.addNestedPass<FuncOp>(
+ createConvertToDestinationPassingStylePass());
+
+ passManager.addPass(createCanonicalizerPass());
+
+  // Run LinalgFusePass first in case we have fill + conv + generic ops. We do
+  // not apply vectorization at this stage. If the dispatch is conv + generic,
+  // the reduction dimensions do not get tiled here, so we have to tile along
+  // the reduction dimensions again later, which requires the ops to still be
+  // in Linalg form.
+ {
+ LinalgFusePassOptions options;
+ options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
+ passManager.addNestedPass<FuncOp>(createLinalgFusePass(options));
+ passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+ passManager.addNestedPass<FuncOp>(createCSEPass());
+ }
+
+ // Add the sandbox single tiling expert to tile and vectorize.
+ {
+ LinalgSingleTilingExpertPassOptions options;
+ options.decomposeToLowerDimOp = true;
+ options.vectorize = true;
+ options.vectorizePadding = true;
+ options.tilingLevel = static_cast<int64_t>(TilingLevel::VectorTiles);
+ passManager.addNestedPass<FuncOp>(
+ createLinalgSingleTilingExpertPass(options));
+ passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
+ passManager.addNestedPass<FuncOp>(createCSEPass());
+ }
+
+ BufferizationOptions::AllocationFn allocationFn =
+ cpuComprehensiveBufferizeAllocationFn;
+ BufferizationOptions::DeallocationFn deallocationFn =
+ cpuComprehensiveBufferizeDeallocationFn;
+ BufferizationOptions::MemCpyFn memcpyFn = cpuComprehensiveBufferizeCopyFn;
+ addIREEComprehensiveBufferizePasses(passManager, allocationFn, deallocationFn,
+ memcpyFn);
+
+ // Run IREE specific passes before vector lowering expert.
+ passManager.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
+
+ // Add the vector lowering expert.
+ {
+ OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
+ LinalgVectorLoweringPassOptions options;
+ options.splitVectorTransfersTo = "shuffle";
+ addLowerToVectorTransforms(nestedFuncPassManager, options);
+ }
+}
+
void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
bool lowerToVectors) {
// Do first level of tile and distribute to workgroups.
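For reference, a minimal sketch of invoking the new pipeline from C++. `runConvPipeline` is a hypothetical helper (not part of this change), and the `mlir::iree_compiler` namespace qualification is assumed from the declarations in iree/compiler/Codegen/Passes.h.

```cpp
#include "iree/compiler/Codegen/Passes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

// Hypothetical driver: runs the whole expert pipeline (first-level tiling and
// distribution through vector lowering) on an executable variant's inner
// module, the same path LLVMCPULowerExecutableTarget takes for the
// CPUConvTileAndDecomposeExpert case above.
static mlir::LogicalResult runConvPipeline(mlir::ModuleOp module) {
  mlir::PassManager passManager(module.getContext());
  mlir::iree_compiler::addConvTileAndDecomposeExpertPassPipeline(passManager);
  return passManager.run(module);
}
```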
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 09d6dee..3769731 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -536,12 +536,11 @@
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 64, 64, 64, 0, 0, 0]{{\]}}, native_vector_size = []>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDefault", workload_per_wg = []>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 64, 64, 64, 0, 0, 0], [1, 1, 8, 8, 0, 0, 0], [0, 0, 0, 0, 1, 1, 8]], native_vector_size = []>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUConvTileAndDecomposeExpert", workload_per_wg = []>
// CHECK: hal.executable.entry_point public @conv
// CHECK-SAME: translation.info = #[[TRANSLATION]]
// CHECK: linalg.conv_2d_nhwc_hwcf
-// CHECK: lowering.config = #[[CONFIG]]
// -----
@@ -561,6 +560,48 @@
hal.executable.entry_point public @conv_static layout(#executable_layout)
builtin.module {
func @conv_static() {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %c607520 = arith.constant 607520 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:1x225x225x3xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c607520) alignment(32) : !flow.dispatch.tensor<readonly:3x3x3x16xf32>
+ %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xf32> -> tensor<1x225x225x3xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x16xf32> -> tensor<3x3x3x16xf32>
+ %5 = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
+ %6 = linalg.fill(%cst, %5) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+ %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x16xf32>
+ return
+ }
+ }
+ }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 28, 28, 16, 0, 0, 0], [1, 1, 4, 8, 0, 0, 0], [0, 0, 0, 0, 1, 1, 3]], native_vector_size = []>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUConvTileAndDecomposeExpert", workload_per_wg = []>
+// CHECK: hal.executable.entry_point public @conv_static
+// CHECK-SAME: translation.info = #[[TRANSLATION]]
+// CHECK: linalg.conv_2d_nhwc_hwcf
+
+
+// -----
+
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+hal.executable private @depthwise_conv_static {
+ hal.executable.variant public @system_elf_x86_64, target = <"llvm", "system-elf-x86_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ native_vector_size = 16 : index,
+ target_triple = "x86_64-unknown-linux-gnu"
+ }> {
+ hal.executable.entry_point public @depthwise_conv_static layout(#executable_layout)
+ builtin.module {
+ func @depthwise_conv_static() {
%cst = arith.constant 0.0 : f32
%input_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
: !flow.dispatch.tensor<readonly:1x161x161x96xf32>
@@ -585,7 +626,7 @@
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[0, 20, 40, 48, 0, 0]{{\]}}, native_vector_size = []>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDefault", workload_per_wg = []>
-// CHECK: hal.executable.entry_point public @conv_static
+// CHECK: hal.executable.entry_point public @depthwise_conv_static
// CHECK-SAME: translation.info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering.config = #[[CONFIG]]
diff --git a/iree/compiler/Codegen/Passes.h b/iree/compiler/Codegen/Passes.h
index d4867ca..5ae0ea4 100644
--- a/iree/compiler/Codegen/Passes.h
+++ b/iree/compiler/Codegen/Passes.h
@@ -244,6 +244,10 @@
ArrayRef<int64_t> workgroupSize = {});
void addDoubleTilingExpertPassPipeline(OpPassManager &passManager);
+/// Populates the passes needed to tile, decompose, and vectorize convolution
+/// ops, using the Codegen drivers from sandbox.
+void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager);
+
/// Populates the passes needed to multi level tile, fuse and vectorize lowering
/// of linalg ops on tensors to vectors operations.
void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,