[CPU] Tile outer parallel dims with 1 before lowering to ukernels. (#17731) This revision drops support for "cache level tiling" because: 1. Nobody is actively developing the path. 2. A dummy config is set, which does nothing. 3. It causes a maintenance burden when we develop new features. The new pipeline is: 1. Distribute mmt4d ops. 2. Tile and fuse ops along parallel dims. 3. Convert the mmt4d ops to ukernel ops. 4. Tile reduction dims if there are no ukernels for the mmt4d ops. 5. The rest is still the same. Fixes https://github.com/iree-org/iree/issues/17717 --------- Signed-off-by: hanhanW <hanhan0912@gmail.com>

commit: 2401be20d63aa2abb9ebf2a5a763a8384d10c205 [log] [tgz]
author: Han-Chung Wang <hanhan0912@gmail.com> Wed Jun 26 14:21:30 2024 -0700
committer: GitHub <noreply@github.com> Wed Jun 26 14:21:30 2024 -0700
tree: 26302a7137b8ad25535d877498cf05bdf0405a55
parent: 9da0309b0491df57629a2177ab1dbec4aa73ae6e [diff]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index c485b7f..15ccb3c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp

@@ -1589,10 +1589,7 @@
   SmallVector<int64_t> reductionTileSizes;
   splitParallelAndReductionTiles(op, parallelTileSizes, reductionTileSizes);
 
-  SmallVector<int64_t> vectorInnerParallelTileSizes(numLoops, 0);
-  return {distTileSizes,           cacheParallelTileSizes,
-          cacheReductionTileSizes, parallelTileSizes,
-          reductionTileSizes,      vectorInnerParallelTileSizes};
+  return {distTileSizes, parallelTileSizes, reductionTileSizes};
 }
 
 /// Sets the lowering configuration for dispatch region for linalg.mmt4d

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 10bbc3b..be6aeb9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp

@@ -499,43 +499,15 @@
                                       LLVMCPUPipelineOptions &pipelineOpt) {
   addTileAndDistributePasses(funcPassManager);
 
+  funcPassManager.addPass(createLLVMCPUTileAndFusePass(
+      static_cast<int64_t>(tilingConfig.getVectorCommonParallelLevel())));
   if (pipelineOpt.enableUkernels) {
     funcPassManager.addPass(createCPUPrepareUkernelsPass());
     funcPassManager.addPass(
         createCPULowerToUKernelsPass(clSkipIntermediateRoundings));
   }
-
-  // We still run codegen pipeline because we want a better fallback when
-  // ukernels are not available. They are nop if the mmt4d op is convereted to
-  // ukernels. If ukernels are not implemented, the lowering config is still
-  // carried by compute ops, so we can use it as a fallback solution.
-
-  // Apply tile and fuse to all the non-distribution fusable levels. Skip
-  // distribution level as such a level has been fused already.
-  SmallVector<int64_t> allFusableLevels(tilingConfig.getFusableLevels());
-  if (allFusableLevels.size() > 1) {
-    llvm::SmallSetVector<int64_t, 4> fusableLevels(allFusableLevels.begin(),
-                                                   allFusableLevels.end());
-    for (int i = 0, end = tilingConfig.getNumTilingLevels(); i < end; ++i) {
-      if (i == tilingConfig.getDistributionLevel())
-        continue;
-      if (fusableLevels.contains(i)) {
-        funcPassManager.addPass(createLLVMCPUTileAndFusePass(i));
-        continue;
-      }
-
-      if (i == tilingConfig.getVectorReductionLevel()) {
-        // Run SplitReductionPass before the final reduction Fuse pass, because
-        // SplitReductionPass takes care of banked-tiling.
-        funcPassManager.addPass(
-            createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
-        funcPassManager.addPass(createLLVMCPUTilePass(i));
-        continue;
-      }
-
-      funcPassManager.addPass(createLLVMCPUTilePass(i));
-    }
-  }
+  funcPassManager.addPass(createLLVMCPUTilePass(
+      static_cast<int64_t>(tilingConfig.getVectorReductionLevel())));
 
   {
     GenericVectorizationPassOptions options;

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 3f3ea84..4d6027f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir

@@ -251,7 +251,11 @@
 // Checks scf.for for distribution loops.
 //       CHECK:   scf.for
 //       CHECK:     scf.for
-//   CHECK-NOT:       scf.for
+// Checks scf.for for outer and inner parallel loops.
+//       CHECK:       scf.for
+//       CHECK:         scf.for
+//       CHECK:           scf.for
+//   CHECK-NOT:             scf.for
 //       CHECK:   iree_codegen.ukernel.generic "iree_uk_mmt4d"
 
 // -----

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir
index c7697c7..841615b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir

@@ -289,7 +289,7 @@
     return
   }
 }
-//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 16, 0, 0, 0, 0], [16, 16, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 0, 4, 4, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]{{\]}}
+//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 16, 0, 0, 0, 0], [1, 1, 0, 4, 4, 0], [0, 0, 1, 0, 0, 1]]
 //       CHECK: func.func @mmt4d_384x384x512_4x1x4_dispatch_0()
 //       CHECK:   linalg.mmt4d
 //  CHECK-SAME:     lowering_config = #[[CONFIG]]

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 53e33f9..f8b8412 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir

@@ -1455,7 +1455,7 @@
   }
 }
 
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 10, 80, 0, 0, 0, 0], [1, 10, 80, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 0, 8, 4, 0], [0, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0]{{\]}}>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 10, 80, 0, 0, 0, 0], [1, 1, 1, 0, 8, 4, 0], [0, 0, 0, 1, 0, 0, 1]]
 //      CHECK: func.func @batch_mmt4d()
 //      CHECK:   linalg.batch_mmt4d
 // CHECK-SAME:     lowering_config = #[[CONFIG]]
@@ -1480,7 +1480,7 @@
   }
 }
 
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0]]>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>
 //      CHECK: func.func @mmt4d_with_large_reduction()
 //      CHECK:   linalg.mmt4d
 // CHECK-SAME:     lowering_config = #[[CONFIG]]
commit	2401be20d63aa2abb9ebf2a5a763a8384d10c205	[log] [tgz]
author	Han-Chung Wang <hanhan0912@gmail.com>	Wed Jun 26 14:21:30 2024 -0700
committer	GitHub <noreply@github.com>	Wed Jun 26 14:21:30 2024 -0700
tree	26302a7137b8ad25535d877498cf05bdf0405a55
parent	9da0309b0491df57629a2177ab1dbec4aa73ae6e [diff]