Heuristically pick tiling sizes to vectorize Linalg operations. (#8517)

Configurations: taskset 80 + dylib-sync on local Pixel 4

| Model            | Before  | After   |
| ---------------- | ------- | ------- |
| DeepLabV3        | 711 ms  | 188 ms  |
| MobileBertSquad  | 672 ms  | 673 ms  |
| MobileNetV2      | 103 ms  | 46.4 ms |
| MobileNetV3Small | 24.6 ms | 14.6 ms |
| MobileSSD        | 204 ms  | 166 ms  |
| PoseNet          | 545 ms  | 241 ms  |

Configurations: taskset f0 + dylib on local Pixel 4

| Model            | Before  | After   |
| ---------------- | ------- | ------- |
| DeepLabV3        | 358 ms  | 161 ms  |
| MobileBertSquad  | 376 ms  | 376 ms  |
| MobileNetV2      | 65.6 ms | 29.8 ms |
| MobileNetV3Small | 20.7 ms | 15.7 ms |
| MobileSSD        | 94 ms   | 79.9 ms |
| PoseNet          | 183 ms  | 121 ms  |
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 2899681..7dcf9d3 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -198,8 +198,7 @@
 /// Adjusts the workload per workgroup to be a multiple of vector size to ensure
 /// that the op vectorizes.
 static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
-                              int64_t vectorSizeVal,
-                              Optional<int64_t> fallbackSizeVal = llvm::None) {
+                              int64_t vectorSizeVal) {
   if (ub == ShapedType::kDynamicSize || lb == ShapedType::kDynamicSize) {
     return maxSize;
   }
@@ -210,7 +209,16 @@
       return i;
     }
   }
-  return fallbackSizeVal ? fallbackSizeVal.getValue() : vectorSizeVal;
+  // If the tile size can't be a multiple of vectorSizeVal, pick the largest
+  // factor of dim that is at most min(maxSize, dim, 2 * vectorSizeVal).
+  int64_t start = std::min(maxSize, dim);
+  start = std::min(start, vectorSizeVal * 2);
+  for (int64_t i = start; i > 0; --i) {
+    if (dim % i == 0) {
+      return i;
+    }
+  }
+  return 1;
 }
 
 /// Returns the tile size to use for the Flow level of an operation that
@@ -340,15 +348,17 @@
   SmallVector<int64_t> l1TileSizes;
   int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
   l1TileSizes.append(nLoops - 3, 1);
-  l1TileSizes.push_back(getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, 8));
-  l1TileSizes.push_back(getMaxTileSize(0, flowTileSizes[nLoops - 2], 32, 32));
+  l1TileSizes.push_back(
+      getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, vectorSize));
+  l1TileSizes.push_back(
+      getMaxTileSize(0, flowTileSizes[nLoops - 2], 32, vectorSize));
   l1TileSizes.push_back(0);
 
   auto lhsShapedType = op.lhs().getType().cast<ShapedType>();
   int64_t K = lhsShapedType.getShape().back();
   SmallVector<int64_t> vectorTileSizes;
   vectorTileSizes.append(nLoops - 1, 0);
-  vectorTileSizes.push_back(getMaxTileSize(0, K, 16, 16));
+  vectorTileSizes.push_back(getMaxTileSize(0, K, 16, vectorSize));
 
   TileSizesListType tileSizes;
   tileSizes.emplace_back(flowTileSizes.begin(), flowTileSizes.end());
@@ -668,8 +678,7 @@
     // If the tile size is intended to be 1, do not adjust it to `vectorSize`.
     // The ops will be decomposed to lower-rank named ops.
     if (l1TileSizes[i] != 1) {
-      l1TileSizes[i] = getMaxTileSize(0, tileSize, l1TileSizes[i], vectorSize,
-                                      /*fallbackTileSize=*/1);
+      l1TileSizes[i] = getMaxTileSize(0, tileSize, l1TileSizes[i], vectorSize);
     }
   }
   SmallVector<int64_t> vectorTileSizes;
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index 5896aaf..f2150cf 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -872,7 +872,7 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 0, 0], [4, 0, 0], [0, 1, 4]{{\]}}>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 0, 0], [1, 0, 0], [0, 1, 4]{{\]}}>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.entry_point public @predict_dispatch_86
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
@@ -1033,6 +1033,52 @@
 
 // -----
 
+#executable_layout = #hal.executable.layout<push_constants = 4, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<
+  "llvm", "embedded-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 16 : index,
+    target_triple = "x86_64-unknown-unknown-eabi-elf"
+  }
+>
+hal.executable private @matmul_odd {
+  hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
+    hal.executable.entry_point public @matmul_odd ordinal(0) layout(#executable_layout)
+    builtin.module {
+      func @matmul_odd() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:33x16xf32>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:16x49xf32>
+        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<readonly:33x49xf32>
+        %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) offset(%c0) alignment(32) : !flow.dispatch.tensor<writeonly:33x49xf32>
+        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:33x16xf32> -> tensor<33x16xf32>
+        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:16x49xf32> -> tensor<16x49xf32>
+        %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor<readonly:33x49xf32> -> tensor<33x49xf32>
+        %7 = linalg.init_tensor [33, 49] : tensor<33x49xf32>
+        %8 = linalg.fill(%cst, %7) : f32, tensor<33x49xf32> -> tensor<33x49xf32>
+        %9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32>
+        flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor<writeonly:33x49xf32>
+        return
+      }
+    }
+  }
+}
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[3, 7, 0], [3, 7, 0], [0, 0, 16]]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+//      CHECK: hal.executable.entry_point public @matmul_odd
+// CHECK-SAME:       translation_info = #[[TRANSLATION]]
+//      CHECK:   linalg.matmul
+// CHECK-SAME:       lowering_config = #[[CONFIG]]
+
+// -----
+
 #executable_layout = #hal.executable.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,