[LLVMCPU] Add a flag to disable distribution. (#12942)

Co-authored-by: Diego Caballero <diegocaballero@google.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index b231ec6..f96092e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -50,9 +50,15 @@
 
 static llvm::cl::opt<int> clNumberOfRuntimeThreads(
     "iree-codegen-llvm-number-of-threads",
-    llvm::cl::desc("number of threads that are used at runtime"),
+    llvm::cl::desc("number of threads that are used at runtime if codegen "
+                   "thread distribution is enabled"),
     llvm::cl::init(8));
 
+static llvm::cl::opt<bool> clDisableDistribution(
+    "iree-codegen-llvm-disable-distribution",
+    llvm::cl::desc("disable thread distribution in codegen"),
+    llvm::cl::init(false));
+
 static llvm::cl::list<int> mmt4dWorkgroupTileSizes(
     "iree-codegen-llvm-mmt4d-workgroup-tile-sizes",
     llvm::cl::desc("linalg.mmt4d workgroup tile size"), llvm::cl::ZeroOrMore);
@@ -358,7 +364,14 @@
   assert(lbs.size() == ubs.size() && lbs.size() == minTileSizes.size() &&
          lbs.size() == maxTileSizes.size() &&
          "expected all vectors to be of equal size");
+
   size_t numDims = lbs.size();
+  // Set all the distribution tile sizes to zero if thread distribution is
+  // disabled.
+  if (clDisableDistribution) {
+    return SmallVector<int64_t>(numDims, 0);
+  }
+
   SmallVector<int64_t> distributedTileSizes(numDims, 1);
   SmallVector<int64_t> numWorkgroupsPerDim(numDims, 1);
   SmallVector<int64_t> workload(numDims, 1);
@@ -1093,6 +1106,12 @@
 static SmallVector<int64_t> getLinalgExtDefaultWorkgroupTileSizes(
     TilingInterface op) {
   unsigned numLoops = op.getLoopIteratorTypes().size();
+  // Set all the distribution tile sizes to zero if thread distribution is
+  // disabled.
+  if (clDisableDistribution) {
+    return SmallVector<int64_t>(numLoops, 0);
+  }
+
   auto partitionedLoops = cast<PartitionableLoopsInterface>(op.getOperation())
                               .getPartitionableLoops(kNumMaxParallelDims);
   SmallVector<int64_t> workgroupTileSizes(numLoops, defaultWorkgroupTileSize);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
index 7aec5f9..8621b78 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -36,6 +36,7 @@
             "illegal_configuration.mlir",
             "lower_to_ukernel_ops.mlir",
             "materialize_aarch64_launch_configuration.mlir",
+            "materialize_configuration_without_distribution.mlir",
             "materialize_encoding.mlir",
             "materialize_riscv_launch_configuration.mlir",
             "materialize_vmvx_launch_configuration.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index d002300..4c782ca 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -31,6 +31,7 @@
     "illegal_configuration.mlir"
     "lower_to_ukernel_ops.mlir"
     "materialize_aarch64_launch_configuration.mlir"
+    "materialize_configuration_without_distribution.mlir"
     "materialize_encoding.mlir"
     "materialize_riscv_launch_configuration.mlir"
     "materialize_vmvx_launch_configuration.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_configuration_without_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_configuration_without_distribution.mlir
new file mode 100644
index 0000000..887f4c1
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_configuration_without_distribution.mlir
@@ -0,0 +1,47 @@
+// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{test-lowering-configuration=true})))' --iree-codegen-llvm-disable-distribution=true --split-input-file %s | FileCheck %s
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @matmul_static  {
+  hal.executable.variant public @embedded_elf_x86_64, target = #hal.executable.target<
+    "llvm-cpu",
+    "embedded-elf-x86_64", {
+      data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+      native_vector_size = 16 : index,
+      target_triple = "x86_64-unknown-unknown-eabi-elf"
+    }> {
+    hal.executable.export public @matmul_static layout(#pipeline_layout)
+    builtin.module {
+      func.func @matmul_static() {
+        %cst = arith.constant 0.0 : f32
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+        %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+        %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [384, 512], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+        %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+        %init = tensor.empty() : tensor<384x128xf32>
+        %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<384x128xf32>) -> tensor<384x128xf32>
+        %gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
+            outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
+        flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
+            : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+        return
+      }
+    }
+  }
+}
+
+//  CHECK-DAG: #[[CONFIG:.+]] =  #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [8, 32, 0], [0, 0, 16]{{\]}}>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
+//      CHECK: hal.executable.export public @matmul_static
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK: linalg.matmul
+// CHECK-SAME:     lowering_config = #[[CONFIG]]
+