- Move linalg.mmt4d to the tensor -> vector pipeline (#7307)
- Move linalg.mmt4d tile and vectorization to tensor -> vector pipeline.
- Add vector transform patterns needed to get better performance.
Running `iree/test/microbenchmarks/linalg_mmt4d.mlir` on Pixel4 core-7
Baseline:
```
--------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------------
BM_matmul_384x384x512/process_time/real_time 6.64 ms 6.63 ms 106
BM_mmt4d_384x384x512_4x1x4/process_time/real_time 9.74 ms 9.72 ms 72
BM_mmt4d_384x384x512_8x1x8/process_time/real_time 7.10 ms 7.07 ms 100
```
This PR:
```
--------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------------
BM_matmul_384x384x512/process_time/real_time 6.24 ms 6.22 ms 112
BM_mmt4d_384x384x512_4x1x4/process_time/real_time 4.41 ms 4.40 ms 159
BM_mmt4d_384x384x512_8x1x8/process_time/real_time 3.83 ms 3.82 ms 183
```
diff --git a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
index 78e006e..1de9c92 100644
--- a/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
+++ b/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
@@ -95,6 +95,8 @@
OwningRewritePatternList patterns(&getContext());
mlir::vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
patterns.add<TransposeUnitDimToShapeCast>(&getContext());
+ mlir::vector::populateVectorTransferCollapseInnerMostContiguousDimsPatterns(
+ patterns);
(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
// Workaround, run loop invariant code motion before hoist redudant vector
// transfer to workaround a bug upstream.
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 8d675a8..6b2ad8e 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -374,7 +374,7 @@
return setOpConfigAndEntryPointFnTranslation(
entryPointFn, mmt4dOp, tileSizes, nativeVectorSize,
- IREE::Codegen::DispatchLoweringPassPipeline::CPUVectorization);
+ IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors);
}
/// Sets the lowering configuration for dispatch region for linalg_ext.fft
diff --git a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp
index 7eb2c56..cdba503 100644
--- a/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndVectorizeLinalgTensorOps.cpp
@@ -173,6 +173,16 @@
return;
}
+ // Op specific conversion.
+ {
+    RewritePatternSet vectorizeOpsPatterns(context);
+    populateLinalgToVectorVectorizeMMT4dPatterns(context, vectorizeOpsPatterns);
+    if (failed(applyPatternsAndFoldGreedily(funcOp,
+                                            std::move(vectorizeOpsPatterns)))) {
+ return signalPassFailure();
+ }
+ }
+
// Apply vectorization patterns.
{
OwningRewritePatternList vectorizationPatterns(&getContext());
diff --git a/iree/compiler/Codegen/LLVMCPU/test/BUILD b/iree/compiler/Codegen/LLVMCPU/test/BUILD
index 2a67f8e..eca3067 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/BUILD
+++ b/iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -25,6 +25,7 @@
"materialize_launch_configuration.mlir",
"matmul_vectorization.mlir",
"synchronize_symbol_visibility.mlir",
+ "test_config_mmt4d.mlir",
"tile_and_vectorize.mlir",
"unfused_fma.mlir",
"vector_contract_to_aarch64_asm.mlir",
diff --git a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index 1aa33ea..137565d 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -20,6 +20,7 @@
"materialize_launch_configuration.mlir"
"matmul_vectorization.mlir"
"synchronize_symbol_visibility.mlir"
+ "test_config_mmt4d.mlir"
"tile_and_vectorize.mlir"
"unfused_fma.mlir"
"vector_contract_to_aarch64_asm.mlir"
diff --git a/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir b/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
new file mode 100644
index 0000000..8045b29
--- /dev/null
+++ b/iree/compiler/Codegen/LLVMCPU/test/test_config_mmt4d.mlir
@@ -0,0 +1,58 @@
+// RUN: iree-opt -split-input-file -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{test-lowering-configuration=true}))' %s | IreeFileCheck %s
+
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm", "embedded-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-unknown-unknown-eabi-elf"}>
+#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
+#map3 = affine_map<(d0)[s0] -> (s0, -d0 + 96)>
+#map4 = affine_map<(d0)[s0] -> (s0, -d0 + 128)>
+hal.executable private @mmt4d_384x384x512_4x1x4_dispatch_0 {
+ hal.interface public @io {
+ hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_rw_external, set=0, binding=2, type="StorageBuffer", access="Read|Write"
+ }
+ hal.executable.variant public @embedded_elf_arm_64, target = #executable_target_embedded_elf_arm_64_ {
+ hal.executable.entry_point public @mmt4d_384x384x512_4x1x4_dispatch_0 attributes {interface = @io, ordinal = 0 : index}
+ builtin.module {
+ func @mmt4d_384x384x512_4x1x4_dispatch_0() {
+ %c0 = arith.constant 0 : index
+ %c96 = arith.constant 96 : index
+ %c128 = arith.constant 128 : index
+ %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:96x384x4x1xf32>
+ %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:128x384x4x1xf32>
+ %2 = hal.interface.binding.subspan @io::@s0b2_rw_external[%c0] : !flow.dispatch.tensor<readwrite:96x128x4x4xf32>
+ %workgroup_size_x = hal.interface.workgroup.size[0] : index
+ %workgroup_size_y = hal.interface.workgroup.size[1] : index
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %3 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
+ %4 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
+ scf.for %arg0 = %3 to %c96 step %4 {
+ %5 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
+ %6 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
+ scf.for %arg1 = %5 to %c128 step %6 {
+ %7 = affine.min #map3(%arg0)[%workgroup_size_y]
+ %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0, 0, 0], sizes = [%7, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:96x384x4x1xf32> -> tensor<?x384x4x1xf32>
+ %9 = affine.min #map4(%arg1)[%workgroup_size_x]
+ %10 = flow.dispatch.tensor.load %1, offsets = [%arg1, 0, 0, 0], sizes = [%9, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:128x384x4x1xf32> -> tensor<?x384x4x1xf32>
+ %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, 0, 0], sizes = [%7, %9, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:96x128x4x4xf32> -> tensor<?x?x4x4xf32>
+ %12 = linalg.mmt4d {__internal_linalg_transform__ = "workgroup"} ins(%8, %10 : tensor<?x384x4x1xf32>, tensor<?x384x4x1xf32>) outs(%11 : tensor<?x?x4x4xf32>) -> tensor<?x?x4x4xf32>
+ flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, 0, 0], sizes = [%7, %9, 4, 4], strides = [1, 1, 1, 1] : tensor<?x?x4x4xf32> -> !flow.dispatch.tensor<readwrite:96x128x4x4xf32>
+ }
+ }
+ return
+ }
+ hal.interface private @io {
+ hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+ hal.interface.binding public @s0b2_rw_external, set=0, binding=2, type="StorageBuffer", access="Read|Write"
+ }
+ }
+ }
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = {{\[}}[48, 32], [1, 1, 1, 4, 4, 1], [1, 1, 1, 4, 4, 1]{{\]}}, native_vector_size = [1, 1, 1, 4, 4, 1]
+// CHECK: func @mmt4d_384x384x512_4x1x4_dispatch_0()
+// CHECK: linalg.mmt4d
+// CHECK-SAME: lowering.config = #[[CONFIG]]