Address regressions with default heuristics on CPU. (#7260)

The earlier heuristic split the work too aggressively, producing a large
number of tiny workgroups. Since we do not yet have a good way to model
the amount of work each dispatch contains, this change simply caps the
number of workgroups.
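
Conceptually, the cap works roughly like the standalone sketch below. This is
a simplified, hypothetical restatement of the logic in the diff (the helper
name limitNumWorkgroups is made up, and dynamic dimensions are ignored), not
the code as landed:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch: grow per-workgroup tile sizes, one dimension at a time, until
    // the total workgroup count drops to at most twice the thread count.
    static void limitNumWorkgroups(std::vector<int64_t> &workloadPerWorkgroup,
                                   const std::vector<int64_t> &workload,
                                   int64_t numThreads, int64_t maxTileSize) {
      auto ceilDiv = [](int64_t a, int64_t b) { return (a + b - 1) / b; };
      int64_t numWorkgroups = 1;
      for (size_t i = 0; i < workload.size(); ++i)
        numWorkgroups *= ceilDiv(workload[i], workloadPerWorkgroup[i]);
      const int64_t limit = 2 * numThreads;
      for (size_t dim = 0; dim < workload.size() && numWorkgroups > limit;) {
        if (workloadPerWorkgroup[dim] >= maxTileSize ||
            workloadPerWorkgroup[dim] >= workload[dim]) {
          ++dim;  // This dimension cannot absorb more work; move on.
          continue;
        }
        int64_t oldWgs = ceilDiv(workload[dim], workloadPerWorkgroup[dim]);
        workloadPerWorkgroup[dim] =
            std::min<int64_t>(workloadPerWorkgroup[dim] * 2, maxTileSize);
        int64_t newWgs = ceilDiv(workload[dim], workloadPerWorkgroup[dim]);
        if (newWgs < oldWgs) {
          numWorkgroups = numWorkgroups / oldWgs * newWgs;  // Doubling helped.
        } else {
          ++dim;  // Doubling did not reduce the count; try the next dimension.
        }
      }
    }

With the default of 8 runtime threads this caps a dispatch at roughly 16
workgroups; the new -iree-codegen-llvm-number-of-threads flag below makes the
thread count (and hence the cap) adjustable during development.
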
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 8dd235f..5b7bcc2 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -19,19 +19,28 @@
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 namespace mlir {
 namespace iree_compiler {
 
-// TODO(ravishankarm): This needs to be put in a common place for the CPU and
-// GPU backends to use.
+/// NOTE: None of these flags are supported long term in any form. These are
+/// temporary hooks added for development purposes and could be changed or
+/// removed at any time.
+/// TODO: Find a way to plumb this through so we do not rely on these flags.
+
 static llvm::cl::opt<int> clNativeVectorSizeInBytes(
     "iree-codegen-llvm-vector-size-in-bytes",
     llvm::cl::desc("native vector size to use on the hardware"),
     llvm::cl::init(16));
 
+static llvm::cl::opt<int> clNumberOfRuntimeThreads(
+    "iree-codegen-llvm-number-of-threads",
+    llvm::cl::desc("number of threads that are used at runtime"),
+    llvm::cl::init(8));
+
 static llvm::cl::opt<int> matmulWorkgroupTileSize(
     "iree-codegen-llvm-matmul-workgroup-size",
     llvm::cl::desc(
@@ -134,33 +143,67 @@
     maxDim = std::max<unsigned>(tiledLoop.distributionDim, maxDim);
   }
   SmallVector<int64_t> workloadPerWorkgroup(maxDim + 1, 1);
+  SmallVector<int64_t> numWorkgroupsPerDim(maxDim + 1, 1);
+  SmallVector<int64_t> workload(maxDim + 1, 1);
+  auto getStaticValue = [](OpFoldResult ofr) -> Optional<int64_t> {
+    return (ofr ? getConstantIntValue(ofr) : llvm::None);
+  };
+  auto ceilFn = [](int64_t a, int64_t b) { return (a + b - 1) / b; };
 
   for (auto tiledLoop : enumerate(tiledLoops)) {
-    if (!tiledLoop.value().ub || !tiledLoop.value().ub.is<Attribute>() ||
-        !tiledLoop.value().lb || !tiledLoop.value().lb.is<Attribute>()) {
-      workloadPerWorkgroup[tiledLoop.value().distributionDim] =
-          defaultWorkgroupTileSize;
+    Optional<int64_t> lb = getStaticValue(tiledLoop.value().lb);
+    Optional<int64_t> ub = getStaticValue(tiledLoop.value().ub);
+    unsigned dim = tiledLoop.value().distributionDim;
+    if (!lb || !ub) {
+      workloadPerWorkgroup[dim] = defaultWorkgroupTileSize;
+      workload[dim] = ShapedType::kDynamicSize;
       continue;
     }
-    int64_t lb =
-        tiledLoop.value().lb.get<Attribute>().cast<IntegerAttr>().getInt();
-    int64_t ub =
-        tiledLoop.value().ub.get<Attribute>().cast<IntegerAttr>().getInt();
     int64_t candidateTileSize = nativeVectorSizeInElements[tiledLoop.index()];
-    if (ub <= lb) {
+    if (*ub <= *lb) {
       // Should be avoiding tiling this loop, but use tile size of 1.
       candidateTileSize = 1;
     } else {
       // Pick a value that evenly distributes the workload.
       candidateTileSize = std::max<int64_t>(
-          llvm::PowerOf2Floor(static_cast<uint64_t>(ub - lb) / 2),
+          llvm::PowerOf2Floor(static_cast<uint64_t>(*ub - *lb) / 2),
           candidateTileSize);
     }
 
     // Limit the workload per workgroup to the default being the max to keep the
     // work per invocation reasonable.
-    workloadPerWorkgroup[tiledLoop.value().distributionDim] =
+    workloadPerWorkgroup[dim] =
         std::min<int64_t>(candidateTileSize, defaultWorkgroupTileSize);
+    workload[dim] = (*ub <= *lb ? 1 : *ub - *lb);
+    numWorkgroupsPerDim[dim] = ceilFn(workload[dim], workloadPerWorkgroup[dim]);
+  }
+
+  // Reduce the number of workgroups in cases where we are dividing the work too
+  // much. Over-provision the number of workgroups to twice the number of
+  // threads.
+  int64_t numWorkgroupsLimit = 2 * clNumberOfRuntimeThreads;
+  int64_t numWorkgroups = 1;
+  for (auto ng : numWorkgroupsPerDim) {
+    numWorkgroups *= ng;
+  }
+  unsigned currDim = 0;
+  while (numWorkgroups > numWorkgroupsLimit &&
+         currDim < numWorkgroupsPerDim.size()) {
+    if (workloadPerWorkgroup[currDim] >= defaultWorkgroupTileSize ||
+        workload[currDim] == ShapedType::kDynamicSize ||
+        workloadPerWorkgroup[currDim] >= workload[currDim]) {
+      currDim++;
+      continue;
+    }
+    workloadPerWorkgroup[currDim] = std::min<int64_t>(
+        workloadPerWorkgroup[currDim] * 2, defaultWorkgroupTileSize);
+    int64_t nwg = ceilFn(workload[currDim], workloadPerWorkgroup[currDim]);
+    if (nwg < numWorkgroupsPerDim[currDim]) {
+      numWorkgroups /= numWorkgroupsPerDim[currDim];
+      numWorkgroups *= nwg;
+    } else {
+      currDim++;
+    }
   }
   return workloadPerWorkgroup;
 }
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 2843806..0a07d23 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -842,13 +842,14 @@
     }
   }
 }
-//      CHECK: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
+//  CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
+//  CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
 //      CHECK: hal.executable.entry_point public @conv_static attributes
-// CHECK-SAME:     translation.info = {passPipeline = "CPUDefault", workloadPerWorkgroup = [32, 32, 32]}
+// CHECK-SAME:     translation.info = {passPipeline = "CPUDefault", workloadPerWorkgroup = [64, 64, 32]}
 // CHECK-NEXT:   ^bb0(%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
 //  CHECK-DAG:     %[[D0:.+]] = affine.apply #[[MAP0]]()[%[[ARG0]]
 //  CHECK-DAG:     %[[D1:.+]] = affine.apply #[[MAP0]]()[%[[ARG1]]
-//  CHECK-DAG:     %[[D2:.+]] = affine.apply #[[MAP0]]()[%[[ARG2]]
+//  CHECK-DAG:     %[[D2:.+]] = affine.apply #[[MAP1]]()[%[[ARG2]]
 //      CHECK:     hal.return %[[D0]], %[[D1]], %[[D2]]
 //      CHECK:     linalg.depthwise_conv2D_nhw
 //  CHECK-NOT:       lowering.config
@@ -974,6 +975,76 @@
 
 // -----
 
+hal.executable private @restrict_num_workgroups {
+  hal.executable.variant public @system_elf_arm_64, target = #hal.executable.target<"llvm", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> {
+    hal.executable.entry_point public @restrict_num_workgroups attributes {interface = @io, ordinal = 0 : index}
+    builtin.module  {
+      func @restrict_num_workgroups() {
+        %cst = constant 0.000000e+00 : f32
+        %c7 = constant 7 : index
+        %c576 = constant 576 : index
+        %c0 = constant 0 : index
+        %c64 = constant 64 : index
+        %c2 = constant 2 : index
+        %0 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:1x11x11x576xf32>
+        %1 = hal.interface.binding.subspan @io::@s0b0_ro_constant[%c0] : !flow.dispatch.tensor<readonly:5x5x576xf32>
+        %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:1x7x7x576xf32>
+        %workgroup_id_x = hal.interface.workgroup.id[0] : index
+        %workgroup_count_x = hal.interface.workgroup.count[0] : index
+        %workgroup_size_x = hal.interface.workgroup.size[0] : index
+        %workgroup_id_y = hal.interface.workgroup.id[1] : index
+        %workgroup_count_y = hal.interface.workgroup.count[1] : index
+        %workgroup_size_y = hal.interface.workgroup.size[1] : index
+        %workgroup_id_z = hal.interface.workgroup.id[2] : index
+        %workgroup_count_z = hal.interface.workgroup.count[2] : index
+        %workgroup_size_z = hal.interface.workgroup.size[2] : index
+        %3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_z, %workgroup_size_z]
+        %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_z, %workgroup_size_z]
+        scf.for %arg0 = %3 to %c7 step %4 {
+          %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
+          %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
+          scf.for %arg1 = %5 to %c7 step %6 {
+            %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+            %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+            scf.for %arg2 = %7 to %c576 step %8 {
+              %9 = affine.min affine_map<(d0) -> (6, -d0 + 12)>(%arg0)
+              %10 = affine.min affine_map<(d0) -> (11, -d0 + 12)>(%arg1)
+              %11 = flow.dispatch.tensor.load %0, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %9, %10, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x11x11x576xf32> -> tensor<1x?x?x64xf32>
+              %12 = tensor.cast %11 : tensor<1x?x?x64xf32> to tensor<1x?x?x?xf32>
+              %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, %arg2], sizes = [5, 5, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:5x5x576xf32> -> tensor<5x5x64xf32>
+              %14 = tensor.cast %13 : tensor<5x5x64xf32> to tensor<5x5x?xf32>
+              %15 = affine.min affine_map<(d0) -> (2, -d0 + 7)>(%arg0)
+              %16 = affine.min affine_map<(d0) -> (-d0 + 7, 2)>(%arg0)
+              %17 = linalg.init_tensor [1, %16, %c7, %c64] : tensor<1x?x?x?xf32>
+              %18 = linalg.fill(%cst, %17) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x?x?x?xf32> -> tensor<1x?x?x?xf32> 
+              %19 = linalg.depthwise_conv2D_nhw {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%12, %14 : tensor<1x?x?x?xf32>, tensor<5x5x?xf32>) outs(%18 : tensor<1x?x?x?xf32>) -> tensor<1x?x?x?xf32>
+              flow.dispatch.tensor.store %19, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %15, %c7, %c64], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x7x7x576xf32>
+            }
+          }
+        }
+        return
+      }
+      hal.interface private @io {
+        hal.interface.binding public @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
+        hal.interface.binding public @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
+      }
+    }
+  }
+}
+//   CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
+//   CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)>
+//   CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
+//       CHECK: hal.executable.entry_point public @restrict_num_workgroups attributes
+//  CHECK-SAME:     translation.info = {passPipeline = "CPUDefault", workloadPerWorkgroup = [64, 8, 4]}
+//  CHECK-NEXT:   ^bb0(%[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index, %[[ARG2:[a-zA-Z0-9]+]]: index)
+//   CHECK-DAG:     %[[D0:.+]] = affine.apply #[[MAP0]]()[%[[ARG0]]]
+//   CHECK-DAG:     %[[D1:.+]] = affine.apply #[[MAP1]]()[%[[ARG1]]]
+//   CHECK-DAG:     %[[D2:.+]] = affine.apply #[[MAP2]]()[%[[ARG2]]]
+//       CHECK:     hal.return %[[D0]], %[[D1]], %[[D2]]
+
+// -----
+
 hal.executable private @test_exp_0 {
   hal.executable.variant public @system_elf_arm_64, target = #hal.executable.target<"llvm", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> {
     hal.executable.entry_point public @test_exp_0 attributes {interface = @io, ordinal = 0 : index}