[spirv] Do not use vectorization pipeline for pooling ops (#8994)
Pooling op vectorization is not implemented/connected. Letting
pooling ops go through the vectorization pipeline can cause
correctness issues.
Fixes https://github.com/google/iree/issues/8733
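
The fix is two-fold, as the diff below shows: `setDefaultOpConfig` in
`KernelConfig.cpp` gains an `allowVectorization` flag that short-circuits the
`vectorizable` check, and the kernel-config dispatch routes ops implementing
`linalg::ConvolutionOpInterface` (which the pooling named ops do) to that
default config with vectorization disabled, so they lower through
`SPIRVDistribute` instead. A condensed, illustrative sketch of the change is
below; the enclosing dispatch function, the other cases, and headers are
elided, and names like `rootOp` are placeholders, so this fragment mirrors the
diff rather than compiling on its own:

```cpp
// Condensed from iree/compiler/Codegen/SPIRV/KernelConfig.cpp in this change.
static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits,
                                        Operation *op,
                                        bool allowVectorization = true) {
  // ...
  // The new flag gates the existing vectorizability analysis.
  bool vectorizable =
      allowVectorization &&
      !linalgOp.hasBufferSemantics() && !linalgOp.hasIndexSemantics() /* ... */;
  // ...
}

return TypeSwitch<Operation *, LogicalResult>(rootOp)
    // ... dedicated matmul/conv configs are tried first ...
    .Case<linalg::ConvolutionOpInterface>([limits](auto op) {
      // Convolution/pooling ops without a tailored config are only tiled and
      // distributed; their vectorization is not wired up.
      return setDefaultOpConfig(limits, op, /*allowVectorization=*/false);
    })
    .Case<IREE::LinalgExt::FftOp>([limits](IREE::LinalgExt::FftOp op) {
      return setFftOpConfig(limits, op);
    })
    .Default([](Operation *) { return success(); });
```

The `SPIRVVectorize.cpp` change is independent cleanup: `populateVectorizePadPatterns`
moves out of the generic vectorization pattern set and into the stage that also
pulls in IREE's conv vectorization patterns.
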
diff --git a/integrations/tensorflow/test/iree_tfl_tests/vulkan_mobilenet_v1.run b/integrations/tensorflow/test/iree_tfl_tests/vulkan_mobilenet_v1.run
index 6efa392..75129ca 100644
--- a/integrations/tensorflow/test/iree_tfl_tests/vulkan_mobilenet_v1.run
+++ b/integrations/tensorflow/test/iree_tfl_tests/vulkan_mobilenet_v1.run
@@ -1,3 +1,2 @@
# REQUIRES: vulkan
# RUN: %PYTHON -m iree_tfl_tests.mobilenet_v1_test --target_backend=vulkan -artifacts_dir=%t
-# XFAIL: *
diff --git a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index f4c40c4..a8070dd 100644
--- a/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -334,7 +334,8 @@
//===----------------------------------------------------------------------===//
static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits,
- Operation *op) {
+ Operation *op,
+ bool allowVectorization = true) {
LLVM_DEBUG(llvm::dbgs() << "Using default config for op: " << *op << "\n");
func::FuncOp funcOp = op->getParentOfType<func::FuncOp>();
auto interfaceOp = cast<IREE::Flow::PartitionableLoopsInterface>(*op);
@@ -408,6 +409,7 @@
// Whether we can try to use the vectorization pipeline.
Optional<SmallVector<int64_t, 4>> loopBounds = linalgOp.getStaticLoopRanges();
bool vectorizable =
+ allowVectorization &&
// The vectorization pipeline assumes tensor semantics when tiling.
!linalgOp.hasBufferSemantics() && !linalgOp.hasIndexSemantics() &&
// Skip vectorization for non-minor identity inputs as it generates
@@ -578,8 +580,9 @@
// If unsuccessful, try to tile and distribute.
return setDefaultOpConfig(limits, op);
})
- .Case<IREE::LinalgExt::FftOp>([limits](IREE::LinalgExt::FftOp op) {
- return setFftOpConfig(limits, op);
+ .Case<linalg::ConvolutionOpInterface>([limits](auto op) {
+ // Other convolution/pooling op vectorization is not wired up.
+ return setDefaultOpConfig(limits, op, /*allowVectorization=*/false);
})
.Case<linalg::GenericOp>([limits](linalg::GenericOp op) {
// If a generic op has reduction iterator types, it can be treated as a
@@ -590,6 +593,9 @@
}
return success();
})
+ .Case<IREE::LinalgExt::FftOp>([limits](IREE::LinalgExt::FftOp op) {
+ return setFftOpConfig(limits, op);
+ })
.Default([](Operation *) { return success(); });
};
diff --git a/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp b/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
index fd2ae6b..3808661 100644
--- a/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
+++ b/iree/compiler/Codegen/SPIRV/SPIRVVectorize.cpp
@@ -100,7 +100,6 @@
patterns.add<linalg::LinalgVectorizationPattern>(
patterns.getContext(), f.addOpFilter<linalg::ContractionOpInterface>(),
opt);
- populateVectorizePadPatterns(patterns);
vector::populateVectorTransferPermutationMapLoweringPatterns(patterns);
vector::populateVectorReductionToContractPatterns(patterns);
}
@@ -131,7 +130,9 @@
{
RewritePatternSet patterns(context);
populateVectorizationPatterns(patterns);
+ // Pull in additional vectorization patterns in IREE.
populateLinalgToVectorVectorizeConvPatterns(context, patterns);
+ populateVectorizePadPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
return signalPassFailure();
}
diff --git a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
index 034ff39..a25cd54 100644
--- a/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
+++ b/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
@@ -118,7 +118,7 @@
: !flow.dispatch.tensor<readonly:1x24x24x8xf32> -> tensor<1x24x24x8xf32>
%20 = linalg.init_tensor [1, 2, 2, 8] : tensor<1x2x2x8xf32>
%21 = linalg.fill ins(%cst : f32) outs(%20 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
- %22 = linalg.pooling_nhwc_sum {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>}
+ %22 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>}
ins(%14, %2 : tensor<1x24x24x8xf32>, tensor<12x12xf32>)
outs(%21 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
flow.dispatch.tensor.store %22, %1, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 8], strides = [1, 1, 1, 1]
@@ -138,6 +138,63 @@
// -----
+// Pooling vectorization is not supported for now.
+
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
+hal.executable @avg_pool {
+ hal.executable.variant @vulkan_spirv_fb, target = <"vulkan-spirv", "vulkan-spirv-fb", {
+ spv.target_env = #spv.target_env<#spv.vce<v1.4, [Shader], []>, Unknown:IntegratedGPU, {
+ max_compute_shared_memory_size = 32768 : i32,
+ max_compute_workgroup_invocations = 512 : i32,
+ max_compute_workgroup_size = dense<512> : vector<3xi32>,
+ subgroup_size = 4 : i32}>
+ }> {
+ hal.executable.entry_point public @avg_pool layout(#executable_layout)
+ builtin.module {
+ func.func @avg_pool() {
+ %cst = arith.constant 0.000000e+00 : f32
+ %cst_0 = arith.constant 4.900000e+01 : f32
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:1x7x7x1280xf32>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:1x1x1x1280xf32>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 1280], strides = [1, 1, 1, 1]
+ : !flow.dispatch.tensor<readonly:1x7x7x1280xf32> -> tensor<1x7x7x1280xf32>
+ %3 = linalg.init_tensor [7, 7] : tensor<7x7xf32>
+ %4 = linalg.init_tensor [1, 1, 1, 1280] : tensor<1x1x1x1280xf32>
+ %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
+ %6 = linalg.pooling_nhwc_sum {
+ dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>
+ } ins(%2, %3 : tensor<1x7x7x1280xf32>, tensor<7x7xf32>) outs(%5 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
+ %7 = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+ iterator_types = ["parallel", "parallel", "parallel", "parallel"]
+ } ins(%6 : tensor<1x1x1x1280xf32>) outs(%4 : tensor<1x1x1x1280xf32>) {
+ ^bb0(%arg0: f32, %arg1: f32):
+ %8 = arith.divf %arg0, %cst_0 : f32
+ linalg.yield %8 : f32
+ } -> tensor<1x1x1x1280xf32>
+ flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 1280], strides = [1, 1, 1, 1]
+ : tensor<1x1x1x1280xf32> -> !flow.dispatch.tensor<writeonly:1x1x1x1280xf32>
+ return
+ }
+ }
+ }
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 4], [0, 0, 0, 1]{{\]}}>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVDistribute>
+// CHECK: hal.executable.entry_point public @avg_pool
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.pooling_nhwc_sum
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
// Max pooling op with odd size-1 dimension sizes.
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [