Move bufferization before configuration selection. (#6789)
* Move bufferization before configuration selection.
Configuration selection in the presence of operand fusion relies on
recovering the original problem size of the root op. The current approach
works reliably only on buffer ops, so move bufferization before
configuration selection.
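
For context, a minimal sketch (not the actual IREE helper) of why buffer ops
make this reliable: after bufferization the operands are memrefs, so the
static shape of the root op's operand is the original problem size even when
its producer has been fused into the same dispatch region. The helper name
and the choice of operand are assumptions for illustration only.

  #include "mlir/IR/BuiltinTypes.h"
  #include "mlir/IR/Operation.h"

  // Illustrative only: reads the original problem size off a bufferized
  // operand. Assumes operand 0 has already been converted to a statically
  // shaped memref and carries the size of interest.
  static llvm::ArrayRef<int64_t> getStaticProblemSize(mlir::Operation *rootOp) {
    auto memrefType = rootOp->getOperand(0).getType().cast<mlir::MemRefType>();
    return memrefType.getShape();
  }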
* Re-enable operand fusion for Adreno GPUs.
Setting a default configuration for the root ops enables operand fusion
on Adreno as well.
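
A hedged sketch of the fallback pattern this introduces; setMaliSpecificConfig
and setDefaultRootConfig are the names used in the diff below, while the exact
signature and surrounding guard are assumed for illustration.

  // Illustrative only. Previously the function returned success() with no
  // configuration attached when the Mali-specific path did not apply, so
  // non-Mali targets such as Adreno got no lowering config and operand
  // fusion stayed disabled for them.
  static LogicalResult setRootConfig(FuncOp entryPoint,
                                     const spirv::TargetEnv &targetEnv,
                                     linalg::LinalgOp rootOp) {
    if (succeeded(setMaliSpecificConfig(entryPoint, rootOp))) return success();
    // Fall back to a default so every target still gets a root configuration.
    return setDefaultRootConfig(entryPoint, targetEnv, rootOp);
  }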
* Address comment.
diff --git a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
index e4b2540..9ac980a 100644
--- a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
@@ -530,7 +530,7 @@
succeeded(setMaliSpecificConfig(entryPoint, op))) {
return success();
}
- return success();
+ return setDefaultRootConfig(entryPoint, targetEnv, op);
}
static LogicalResult setMaliSpecificConfig(
@@ -550,6 +550,7 @@
{{2, 2, 32}, {8, 2, 2}},
{{1, 4, 16}, {4, 4, 1}},
{{1, 1, 64}, {16, 1, 1}},
+ {{4, 4, 8}, {2, 4, 2}},
};
for (const auto &pair : tileWorkgroupSizePairs) {
@@ -617,7 +618,7 @@
succeeded(setMaliSpecificConfig(entryPoint, op))) {
return success();
}
- return success();
+ return setDefaultRootConfig(entryPoint, targetEnv, op);
}
/// Helper function to generate the number of workgroups when the
@@ -665,7 +666,9 @@
int64_t subgroupSize =
targetEnv.getResourceLimits().subgroup_size().getValue().getSExtValue();
- if (computeOps.empty()) {
+ if (computeOps.empty() || llvm::none_of(computeOps, [](Operation *op) {
+ return hasMarker(op, getWorkgroupMarker());
+ })) {
// TODO(ravishankarm): `tensor.insert_slice` is not a compute op but still
// needs to be handled in dispatch region. For now it is handled in
// ConvertToGPU pass. Eventually this will be handled as a compute
@@ -690,8 +693,7 @@
}
Operation *rootOperation = nullptr;
- for (Operation *computeOp : reverse(computeOps)) {
- if (!hasMarker(computeOp, getWorkgroupMarker())) continue;
+ for (Operation *computeOp : computeOps) {
auto setConfigFn = [&](Operation *rootOp) -> LogicalResult {
return TypeSwitch<Operation *, LogicalResult>(rootOp)
.Case<linalg::BatchMatmulOp,
@@ -715,8 +717,7 @@
// If there are still no roots, check for any generic op.
if (!rootOperation) {
- for (Operation *computeOp : reverse(computeOps)) {
- if (!hasMarker(computeOp, getWorkgroupMarker())) continue;
+ for (Operation *computeOp : computeOps) {
if (isa<linalg::FillOp, linalg::CopyOp>(computeOp)) continue;
if (failed(setDefaultRootConfig(funcOp, targetEnv, computeOp))) {
return failure();
@@ -731,31 +732,6 @@
}
}
- if (!rootOperation) {
- /// TODO(ravishankarm): This is setting the configuration for ops that are
- /// directly distributed to global invocation IDs. Remove this when
- /// SPIRVConvertToGPU is deprecated.
- for (Operation *computeOp : reverse(computeOps)) {
- if (hasMarker(computeOp, getWorkgroupMarker())) continue;
- if (isa<linalg::FillOp, linalg::CopyOp, linalg::GenericOp>(computeOp)) {
- std::array<int64_t, 3> workgroupSize = {1, 1, 1};
- auto linalgOp = cast<linalg::LinalgOp>(computeOp);
- if (getNumOuterParallelLoops(linalgOp)) {
- workgroupSize = {subgroupSize, 1, 1};
- }
- if (failed(setTranslationUsingDistributeToGlobalId(funcOp,
- workgroupSize))) {
- return computeOp->emitOpError(
- "failed to set translation info for distributing to global "
- "IDs");
- }
- rootOperation = computeOp;
- break;
- }
- }
- if (rootOperation) continue;
- }
-
// Propogate the configuration to the other ops.
// TODO(ravishankarm, antiagainst): This is a very specific use (and
// fragile). In general, this should not be needed. Things are already tiled
diff --git a/iree/compiler/Codegen/SPIRV/Passes.cpp b/iree/compiler/Codegen/SPIRV/Passes.cpp
index b552f04..4fa7ee2 100644
--- a/iree/compiler/Codegen/SPIRV/Passes.cpp
+++ b/iree/compiler/Codegen/SPIRV/Passes.cpp
@@ -58,8 +58,6 @@
}
void addSPIRVVectorizationPassPipeline(OpPassManager &pm) {
- // Convert tensor to buffers.
- addLinalgBufferizePasses(pm, gpuAllocationFunction);
//===--------------------------------------------------------------------===//
// Initial clean up.
//===--------------------------------------------------------------------===//
@@ -89,8 +87,6 @@
}
void addSPIRVDistributePassPipeline(OpPassManager &pm) {
- // Convert tensor to buffers.
- addLinalgBufferizePasses(pm, gpuAllocationFunction);
//===--------------------------------------------------------------------===//
// Initial clean up.
//===--------------------------------------------------------------------===//
@@ -118,9 +114,6 @@
}
void addSPIRVDistributeToGlobalIDPipeline(OpPassManager &pm) {
- // Convert tensor to buffers.
- addLinalgBufferizePasses(pm, gpuAllocationFunction);
-
// Handle ops that cannot go through the previous tiling, distribution, and
// vectorization flow. Only perform one level of distribution to map them to
// GPU global invocation IDs for distribution.
@@ -181,6 +174,10 @@
}
void buildSPIRVCodegenPassPipeline(OpPassManager &pm) {
+ {
+ OpPassManager &nestedModulePM = pm.nest<ModuleOp>();
+ addLinalgBufferizePasses(nestedModulePM, gpuAllocationFunction);
+ }
pm.addPass(createSPIRVLowerExecutableTargetPass());
OpPassManager &nestedModulePM = pm.nest<ModuleOp>();
addLowerToSPIRVPasses(nestedModulePM);
diff --git a/iree/test/e2e/regression/linalg_ops.mlir b/iree/test/e2e/regression/linalg_ops.mlir
index 4a0660a..8a742f9 100644
--- a/iree/test/e2e/regression/linalg_ops.mlir
+++ b/iree/test/e2e/regression/linalg_ops.mlir
@@ -31,3 +31,29 @@
[189, 220, 253, 288]]> : tensor<3x4xi32>) : tensor<3x4xi32>
return
}
+
+func @operand_fusion() {
+ %input = util.unfoldable_constant dense<1.0> : tensor<1x225x225x3xf32>
+ %filter = util.unfoldable_constant dense<1.0> : tensor<3x3x3x16xf32>
+ %bias = util.unfoldable_constant dense<1.0> : tensor<16xf32>
+ %init = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
+ %cst = constant 0.0 : f32
+ %fill = linalg.fill(%cst, %init) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+ %conv = linalg.conv_2d_input_nhwc_filter_hwcf
+ {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
+ ins(%input, %filter : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>)
+ outs(%fill : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+ %result = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+ iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+ ins(%conv, %bias : tensor<1x112x112x16xf32>, tensor<16xf32>)
+ outs(%init : tensor<1x112x112x16xf32>) {
+ ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32):
+ %0 = addf %arg0, %arg1 : f32
+ linalg.yield %0 : f32
+ } -> tensor<1x112x112x16xf32>
+ check.expect_eq_const(%result, dense<28.0> : tensor<1x112x112x16xf32>) : tensor<1x112x112x16xf32>
+ return
+}
\ No newline at end of file