Make default distribution logic divide work evenly. (#15414)

Also fix a bug in tile size adjustment, which reduce the number of
workgroups in cases where we are dividing the work too much. The
`numWorkgroupsPerDim[index]` was not updated after adjusting it to a new
tile size.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 3c6f93e..a39d0b8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -461,11 +461,32 @@
         llvm::divideCeil(workload[index], distributedTileSizes[index]);
     if (nwg < numWorkgroupsPerDim[index]) {
       numWorkgroups /= numWorkgroupsPerDim[index];
+      numWorkgroupsPerDim[index] = nwg;
       numWorkgroups *= nwg;
     } else {
       currDim--;
     }
   }
+
+  // Final fixup for dividing workload evenly.
+  for (auto i : llvm::seq<unsigned>(0, distributedTileSizes.size())) {
+    if (distributedTileSizes[i] == 0 || ShapedType::isDynamic(workload[i]))
+      continue;
+
+    int64_t nwg = llvm::divideCeil(workload[i], distributedTileSizes[i]);
+    int64_t newSize = llvm::divideCeil(workload[i], nwg);
+
+    // Chech if it's the ideal size with vector size hint. And skip if the new
+    // size will break the ideal size.
+    int64_t vectorSize = vectorSizeHints[i];
+    if (vectorSize > 1 &&
+        (newSize % vectorSize != 0 || workload[i] % newSize != 0)) {
+      continue;
+    }
+
+    distributedTileSizes[i] = newSize;
+  }
+
   return distributedTileSizes;
 }
 
@@ -1134,9 +1155,13 @@
       maxTileSizes[0] = 192;
       maxTileSizes[1] = 128;
     }
+    SmallVector<int64_t> vectorSizeHints(numLoops, vectorSize);
+    if (isBM) {
+      vectorSizeHints[0] = 1;
+    }
     distTileSizes = getDefaultDistributedLevelTileSizes(
         linalgOp, vecTileSizes, maxTileSizes,
-        /*allowIncompleteTile=*/true);
+        /*allowIncompleteTile=*/true, vectorSizeHints);
   } else {
     distTileSizes = getDefaultDistributedLevelTileSizes(linalgOp, vecTileSizes,
                                                         maxTileSizes);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
index f86d347..020fce6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
@@ -368,7 +368,7 @@
     }
   }
 }
-//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 1, 7, 64, 0, 0], [1, 1, 1, 4, 0, 0], [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0]]>
+//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 7, 7, 64, 0, 0], [1, 1, 1, 4, 0, 0], [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0]]>
 //   CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
 //       CHECK: hal.executable.export public @restrict_num_workgroups
 //  CHECK-SAME:     translation_info = #[[TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index b2b3a4a..6c3d495 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -977,7 +977,7 @@
   }
 }
 
-//  CHECK-DAG: #[[CONFIG:.+]] =  #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 64, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
+//  CHECK-DAG: #[[CONFIG:.+]] =  #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 64, 0], [8, 32, 0], [0, 0, 16], [0, 0, 0]]>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
 //      CHECK: hal.executable.export public @matmul_static
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
@@ -1863,7 +1863,7 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[256, 72, 0], [8, 32, 0], [0, 0, 12], [0, 0, 0]]>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[192, 144, 0], [8, 32, 0], [0, 0, 12], [0, 0, 0]]>
 //  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
 //      CHECK: hal.executable.export public @quant_model
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]