Move bufferization before configuration selection. (#6789)
* Move bufferization before configuration selection.
Configuration selection in the presence of operand fusion relies on
recovering the original problem size of the root op. The current approach
works reliably only on buffer ops, so move bufferization before
configuration selection.
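
For context, a minimal sketch (not the actual IREE helper) of why buffer ops
make this reliable: after bufferization the operands are memrefs, so the
static shape of the root op's operand is the original problem size even when
its producer has been fused into the same dispatch region. The helper name
and the choice of operand are assumptions for illustration only.

  #include "mlir/IR/BuiltinTypes.h"
  #include "mlir/IR/Operation.h"

  // Illustrative only: reads the original problem size off a bufferized
  // operand. Assumes operand 0 has already been converted to a statically
  // shaped memref and carries the size of interest.
  static llvm::ArrayRef<int64_t> getStaticProblemSize(mlir::Operation *rootOp) {
    auto memrefType = rootOp->getOperand(0).getType().cast<mlir::MemRefType>();
    return memrefType.getShape();
  }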
* Re-enable operand fusion for Adreno GPUs.
Setting a default configuration for the root ops enables operand fusion
on Adreno as well.
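
A hedged sketch of the fallback pattern this introduces; setMaliSpecificConfig
and setDefaultRootConfig are the names used in the diff below, while the exact
signature and surrounding guard are assumed for illustration.

  // Illustrative only. Previously the function returned success() with no
  // configuration attached when the Mali-specific path did not apply, so
  // non-Mali targets such as Adreno got no lowering config and operand
  // fusion stayed disabled for them.
  static LogicalResult setRootConfig(FuncOp entryPoint,
                                     const spirv::TargetEnv &targetEnv,
                                     linalg::LinalgOp rootOp) {
    if (succeeded(setMaliSpecificConfig(entryPoint, rootOp))) return success();
    // Fall back to a default so every target still gets a root configuration.
    return setDefaultRootConfig(entryPoint, targetEnv, rootOp);
  }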
* Address comment.
diff --git a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
index e4b2540..9ac980a 100644
--- a/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
+++ b/iree/compiler/Codegen/SPIRV/KernelDispatchUtils.cpp
@@ -530,7 +530,7 @@
succeeded(setMaliSpecificConfig(entryPoint, op))) {
return success();
}
- return success();
+ return setDefaultRootConfig(entryPoint, targetEnv, op);
}
static LogicalResult setMaliSpecificConfig(
@@ -550,6 +550,7 @@
{{2, 2, 32}, {8, 2, 2}},
{{1, 4, 16}, {4, 4, 1}},
{{1, 1, 64}, {16, 1, 1}},
+ {{4, 4, 8}, {2, 4, 2}},
};
for (const auto &pair : tileWorkgroupSizePairs) {
@@ -617,7 +618,7 @@
succeeded(setMaliSpecificConfig(entryPoint, op))) {
return success();
}
- return success();
+ return setDefaultRootConfig(entryPoint, targetEnv, op);
}
/// Helper function to generate the number of workgroups when the
@@ -665,7 +666,9 @@
int64_t subgroupSize =
targetEnv.getResourceLimits().subgroup_size().getValue().getSExtValue();
- if (computeOps.empty()) {
+ if (computeOps.empty() || llvm::none_of(computeOps, [](Operation *op) {
+ return hasMarker(op, getWorkgroupMarker());
+ })) {
// TODO(ravishankarm): `tensor.insert_slice` is not a compute op but still
// needs to be handled in dispatch region. For now it is handled in
// ConvertToGPU pass. Eventually this will be handled as a compute
@@ -690,8 +693,7 @@
}
Operation *rootOperation = nullptr;
- for (Operation *computeOp : reverse(computeOps)) {
- if (!hasMarker(computeOp, getWorkgroupMarker())) continue;
+ for (Operation *computeOp : computeOps) {
auto setConfigFn = [&](Operation *rootOp) -> LogicalResult {
return TypeSwitch<Operation *, LogicalResult>(rootOp)
.Case<linalg::BatchMatmulOp,
@@ -715,8 +717,7 @@
// If there are still no roots, check for any generic op.
if (!rootOperation) {
- for (Operation *computeOp : reverse(computeOps)) {
- if (!hasMarker(computeOp, getWorkgroupMarker())) continue;
+ for (Operation *computeOp : computeOps) {
if (isa<linalg::FillOp, linalg::CopyOp>(computeOp)) continue;
if (failed(setDefaultRootConfig(funcOp, targetEnv, computeOp))) {
return failure();
@@ -731,31 +732,6 @@
}
}
- if (!rootOperation) {
- /// TODO(ravishankarm): This is setting the configuration for ops that are
- /// directly distributed to global invocation IDs. Remove this when
- /// SPIRVConvertToGPU is deprecated.
- for (Operation *computeOp : reverse(computeOps)) {
- if (hasMarker(computeOp, getWorkgroupMarker())) continue;
- if (isa<linalg::FillOp, linalg::CopyOp, linalg::GenericOp>(computeOp)) {
- std::array<int64_t, 3> workgroupSize = {1, 1, 1};
- auto linalgOp = cast<linalg::LinalgOp>(computeOp);
- if (getNumOuterParallelLoops(linalgOp)) {
- workgroupSize = {subgroupSize, 1, 1};
- }
- if (failed(setTranslationUsingDistributeToGlobalId(funcOp,
- workgroupSize))) {
- return computeOp->emitOpError(
- "failed to set translation info for distributing to global "
- "IDs");
- }
- rootOperation = computeOp;
- break;
- }
- }
- if (rootOperation) continue;
- }
-
// Propogate the configuration to the other ops.
// TODO(ravishankarm, antiagainst): This is a very specific use (and
// fragile). In general, this should not be needed. Things are already tiled
diff --git a/iree/compiler/Codegen/SPIRV/Passes.cpp b/iree/compiler/Codegen/SPIRV/Passes.cpp
index b552f04..4fa7ee2 100644
--- a/iree/compiler/Codegen/SPIRV/Passes.cpp
+++ b/iree/compiler/Codegen/SPIRV/Passes.cpp
@@ -58,8 +58,6 @@
}
void addSPIRVVectorizationPassPipeline(OpPassManager &pm) {
- // Convert tensor to buffers.
- addLinalgBufferizePasses(pm, gpuAllocationFunction);
//===--------------------------------------------------------------------===//
// Initial clean up.
//===--------------------------------------------------------------------===//
@@ -89,8 +87,6 @@
}
void addSPIRVDistributePassPipeline(OpPassManager &pm) {
- // Convert tensor to buffers.
- addLinalgBufferizePasses(pm, gpuAllocationFunction);
//===--------------------------------------------------------------------===//
// Initial clean up.
//===--------------------------------------------------------------------===//
@@ -118,9 +114,6 @@
}
void addSPIRVDistributeToGlobalIDPipeline(OpPassManager &pm) {
- // Convert tensor to buffers.
- addLinalgBufferizePasses(pm, gpuAllocationFunction);
-
// Handle ops that cannot go through the previous tiling, distribution, and
// vectorization flow. Only perform one level of distribution to map them to
// GPU global invocation IDs for distribution.
@@ -181,6 +174,10 @@
}
void buildSPIRVCodegenPassPipeline(OpPassManager &pm) {
+ {
+ OpPassManager &nestedModulePM = pm.nest<ModuleOp>();
+ addLinalgBufferizePasses(nestedModulePM, gpuAllocationFunction);
+ }
pm.addPass(createSPIRVLowerExecutableTargetPass());
OpPassManager &nestedModulePM = pm.nest<ModuleOp>();
addLowerToSPIRVPasses(nestedModulePM);
diff --git a/iree/test/e2e/regression/linalg_ops.mlir b/iree/test/e2e/regression/linalg_ops.mlir
index 4a0660a..8a742f9 100644
--- a/iree/test/e2e/regression/linalg_ops.mlir
+++ b/iree/test/e2e/regression/linalg_ops.mlir
@@ -31,3 +31,29 @@
[189, 220, 253, 288]]> : tensor<3x4xi32>) : tensor<3x4xi32>
return
}
+
+func @operand_fusion() {
+ %input = util.unfoldable_constant dense<1.0> : tensor<1x225x225x3xf32>
+ %filter = util.unfoldable_constant dense<1.0> : tensor<3x3x3x16xf32>
+ %bias = util.unfoldable_constant dense<1.0> : tensor<16xf32>
+ %init = linalg.init_tensor [1, 112, 112, 16] : tensor<1x112x112x16xf32>
+ %cst = constant 0.0 : f32
+ %fill = linalg.fill(%cst, %init) : f32, tensor<1x112x112x16xf32> -> tensor<1x112x112x16xf32>
+ %conv = linalg.conv_2d_input_nhwc_filter_hwcf
+ {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
+ ins(%input, %filter : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>)
+ outs(%fill : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+ %result = linalg.generic {
+ indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d3)>,
+ affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
+ iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+ ins(%conv, %bias : tensor<1x112x112x16xf32>, tensor<16xf32>)
+ outs(%init : tensor<1x112x112x16xf32>) {
+ ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32):
+ %0 = addf %arg0, %arg1 : f32
+ linalg.yield %0 : f32
+ } -> tensor<1x112x112x16xf32>
+ check.expect_eq_const(%result, dense<28.0> : tensor<1x112x112x16xf32>) : tensor<1x112x112x16xf32>
+ return
+}
\ No newline at end of file