[LLVMCPU] Add tensor.pad canonicalization patterns to LLVMCPUTensorPad. (#13420)
They are needed because we have to fold chains of
tensor::ExtractSliceOp, tensor::PadOp pairs to a single
tensor::ExtractSliceOp, tensor::PadOp pair that pads all dimensions at
once, which simplifies vectorization and bufferization.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTensorPad.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTensorPad.cpp
index be293e1..2ffc26b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTensorPad.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTensorPad.cpp
@@ -86,6 +86,7 @@
memref::populateResolveRankedShapeTypeResultDimsPatterns(patterns);
context->getLoadedDialect<tensor::TensorDialect>()
->getCanonicalizationPatterns(patterns);
+ tensor::PadOp::getCanonicalizationPatterns(patterns, context);
if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
LLVM_DEBUG(llvm::dbgs() << "----- cleanup failed -----\n");
return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 7c74858..fe1adbf 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -78,15 +78,15 @@
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
-hal.executable private @preset_config_matmul {
+hal.executable private @preset_pad_config_matmul {
hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
- hal.executable.export @preset_config_matmul layout(#pipeline_layout) {
+ hal.executable.export @preset_pad_config_matmul layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
- func.func @preset_config_matmul() {
+ func.func @preset_pad_config_matmul() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x49xf32>>
@@ -105,8 +105,68 @@
}
}
}
-// CHECK-LABEL: func.func @preset_config_matmul
-// CHECK: vector.outerproduct
+// CHECK-LABEL: func.func @preset_pad_config_matmul
+// CHECK: vector.outerproduct
+
+// -----
+
+// Checks that the ops are padded and vectorized. The test sets tiling sizes to
+// be non-divisible by problem sizes. If padding and vectorizing are kicked in,
+// vector ops will be generated.
+#compilation = #iree_codegen.compilation_info<
+ lowering_config = <tile_sizes = [[192, 128, 0], [8, 32, 0], [0, 0, 16]]>,
+ translation_info = <CPUDoubleTilingPadExpert>,
+ workgroup_size = []>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
+hal.executable private @preset_pad_config_dynamic_matmul {
+ hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+ hal.executable.export @preset_pad_config_dynamic_matmul layout(#pipeline_layout) {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
+ %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
+ hal.return %x, %y, %z : index, index, index
+ }
+ builtin.module {
+ func.func @preset_pad_config_dynamic_matmul() {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %2 = hal.interface.constant.load[2] : i32
+ %3 = hal.interface.constant.load[3] : i32
+ %4 = arith.index_castui %0 : i32 to index
+ %5 = arith.index_castui %1 : i32 to index
+ %6 = arith.index_castui %2 : i32 to index
+ %7 = arith.index_castui %3 : i32 to index
+ %8 = flow.dispatch.workload.ordinal %4, 0 : index
+ %9 = flow.dispatch.workload.ordinal %5, 1 : index
+ %10 = flow.dispatch.workload.ordinal %6, 2 : index
+ %11 = flow.dispatch.workload.ordinal %7, 3 : index
+ %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%10, %8}
+ %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%9, %11}
+ %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%10, %11}
+ %15 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [%10, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%10, %8} -> tensor<?x?xf32>
+ %16 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [%9, %11], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%9, %11} -> tensor<?x?xf32>
+ %17 = tensor.empty(%10, %11) : tensor<?x?xf32>
+ %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ %19 = linalg.matmul {compilation_info = #compilation} ins(%15, %16 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%18 : tensor<?x?xf32>) -> tensor<?x?xf32>
+ flow.dispatch.tensor.store %19, %14, offsets = [0, 0], sizes = [%10, %11], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%10, %11}
+ return
+ }
+ }
+ }
+}
+// Checks that the bounded stack allocation are created.
+// CHECK-LABEL: func.func @preset_pad_config_dynamic_matmul
+// CHECK-DAG: memref.alloca() {{.+}} memref<8x16xf32>
+// CHECK-DAG: memref.alloca() {{.+}} memref<16x32xf32>
+// CHECK-DAG: memref.alloca() {{.+}} memref<8x32xf32>
+// CHECK: vector.outerproduct
// -----