Enable reduction masking (#12003)

This PR adds the bits needed to enable the masking support built upstream for generic ops and reductions. When masking is enabled, the tile size computation for vectorization/unrolling is simplified and the default tile size is rounded up to the nearest power of two. I tried to preserve the original tile size configuration and pre-vectorization strategies for cases that are not generic ops, so peeling/padding are still used for some cases/targets. I'll follow up with more masking enablement.
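
As a rough illustration of the new tile-size behavior, here is a minimal standalone sketch of the power-of-two rounding, mirroring the `roundUpToPow2` helper added in this PR (it uses `std::bit_ceil` in place of `llvm::PowerOf2Ceil`, and the values are made up for illustration):

```cpp
// Sketch only: masked tile sizes are rounded up to the next power of two
// instead of being clamped to a multiple of the vector length, so no
// peel/epilogue loop is needed. Requires C++20 for <bit>.
#include <bit>
#include <cassert>
#include <cstdint>
#include <iostream>

static int64_t roundUpToPow2(int64_t size, bool enforcePowerOfTwo) {
  if (!enforcePowerOfTwo) return size;
  assert(size > 0 && "expected a positive size");
  return static_cast<int64_t>(std::bit_ceil(static_cast<uint64_t>(size)));
}

int main() {
  // With masking, a trip count of 6 gets a tile size of 8; the two
  // out-of-bounds lanes are handled by masks rather than a scalar epilogue.
  std::cout << roundUpToPow2(6, /*enforcePowerOfTwo=*/true) << "\n";   // 8
  std::cout << roundUpToPow2(6, /*enforcePowerOfTwo=*/false) << "\n";  // 6
}
```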

Benchmarks: x86_64

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index d2a44c8..0aa3281 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -126,6 +126,10 @@
   // Peel iterations from the vector dimensions so that they become multiple of
   // the vector length.
   Peeling,
+  // Compute vector dimensions assuming vector masking support. Vector sizes
+  // may be rounded up to the nearest power of two and out-of-bounds elements
+  // will be masked out.
+  Masking,
   // Do not apply any vectorization pre-processing transformation.
   None
 };
@@ -140,6 +144,9 @@
     case VectorPreProcStrategy::Peeling:
       os << "Peeling";
       break;
+    case VectorPreProcStrategy::Masking:
+      os << "Masking";
+      break;
     case VectorPreProcStrategy::None:
       os << "None";
       break;
@@ -193,23 +200,42 @@
     return VectorPreProcStrategy::None;
   }
 
-  if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
-    // Peeling is only enabled on fully dynamic shape ops for now.
-    return VectorPreProcStrategy::Peeling;
-  }
-
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(linalgOp);
+  bool isLinalgGeneric = isa<linalg::GenericOp>(linalgOp.getOperation());
 
   // Default X86 specific strategy.
-  if (isX86(targetAttr) && enableVectorPadding) {
-    // Padding is only enabled on x86. It leads to too much overhead on RISC-V
-    // and ARM.
-    return VectorPreProcStrategy::Padding;
+  if (isX86(targetAttr)) {
+    if (isLinalgGeneric) {
+      return VectorPreProcStrategy::Masking;
+    }
+
+    if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
+      return VectorPreProcStrategy::Peeling;
+    }
+
+    if (enableVectorPadding) {
+      // Padding is only enabled on x86. It leads to too much overhead on
+      // RISC-V and ARM.
+      return VectorPreProcStrategy::Padding;
+    }
   }
 
   // Default RISC-V specific strategies.
-  if (isRISCV(targetAttr) && enableVectorPeeling) {
-    return VectorPreProcStrategy::Peeling;
+  if (isRISCV(targetAttr)) {
+    if (isLinalgGeneric) {
+      return VectorPreProcStrategy::Masking;
+    }
+
+    if (enableVectorPeeling) {
+      return VectorPreProcStrategy::Peeling;
+    }
+  }
+
+  // Default AArch64 specific strategies.
+  if (isAArch64(targetAttr)) {
+    if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
+      return VectorPreProcStrategy::Peeling;
+    }
   }
 
   return VectorPreProcStrategy::None;
@@ -429,17 +455,33 @@
   return distributedTileSizes;
 }
 
+/// Returns `size` rounded up to the nearest power of two if `predicate` is
+/// true. Otherwise, returns `size`.
+static int64_t roundUpToPow2(int64_t size, bool predicate) {
+  if (!predicate) {
+    return size;
+  }
+  assert(size > 0 && "Negative size");
+  return llvm::PowerOf2Ceil(size);
+}
+
 /// Adjusts the workload per workgroup to be a multiple of vector size to ensure
 /// that the op vectorizes.
 static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
                               int64_t vectorSize,
-                              bool allowIncompleteTile = false) {
+                              bool allowIncompleteTile = false,
+                              bool enforcePowerOfTwo = false) {
   if (ub == ShapedType::kDynamic || lb == ShapedType::kDynamic) {
-    return maxSize;
+    return roundUpToPow2(maxSize, enforcePowerOfTwo);
   }
   int64_t numIters = ub - lb;
   if (numIters <= maxSize && numIters < vectorSize) {
-    return numIters;
+    return roundUpToPow2(numIters, enforcePowerOfTwo);
+  }
+
+  // Round the candidate tile size up to the next power of two if enforced.
+  if (enforcePowerOfTwo) {
+    return roundUpToPow2(std::min(maxSize, numIters), enforcePowerOfTwo);
   }
 
   int64_t scaledUB = std::min(maxSize, numIters) / vectorSize * vectorSize;
@@ -580,6 +622,11 @@
                                          parallelSizes.end());
   SmallVector<int64_t> origReductionSizes(reductionSizes.begin(),
                                           reductionSizes.end());
+  // Masking doesn't need any dim set to 1.
+  if (vecPreProcStrategy == VectorPreProcStrategy::Masking) {
+    return;
+  }
+
   setAlwaysVectorizeSizes(op, parallelSizes, reductionSizes);
 
   // If peeling is enabled and the 'op' is fully dynamic, we only vectorize the
@@ -763,11 +810,14 @@
   SmallVector<int64_t> parallelTileSizes;
   for (auto [index, tileSize] : llvm::enumerate(workgroupTileSizes)) {
     int64_t sz = tileSize;
+    bool allowIncompleteTile =
+        vecPreProcStrategy == VectorPreProcStrategy::Peeling ||
+        vecPreProcStrategy == VectorPreProcStrategy::Masking;
+
     if (sz != 0) {
-      sz = getMaxTileSize(/*lb=*/0, /*ub=*/shape[index],
-                          /*maxTileSize=*/sz, vectorSize,
-                          /*allowIncompleteTile=*/vecPreProcStrategy ==
-                              VectorPreProcStrategy::Peeling);
+      sz = getMaxTileSize(
+          /*lb=*/0, /*ub=*/shape[index],
+          /*maxTileSize=*/sz, vectorSize, allowIncompleteTile);
     }
     parallelTileSizes.push_back(sz);
   }
@@ -940,11 +990,12 @@
   // works for linalg.matmul cases. We can relax it once we have better
   // scheduling, e.g., transform dialect.
   SmallVector<int64_t> flowTileSizes;
-  auto preProcStrategy = getVectorPreProcStrategy(linalgOp);
-  bool usePaddingPipeline = preProcStrategy == VectorPreProcStrategy::Padding;
+  auto vecPreProcStrategy = getVectorPreProcStrategy(linalgOp);
+  bool usePaddingPipeline =
+      vecPreProcStrategy == VectorPreProcStrategy::Padding;
 
-  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: " << preProcStrategy
-                       << "\n");
+  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+                       << vecPreProcStrategy << "\n");
 
   if (usePaddingPipeline) {
     // It's inspired from Sandbox configuration. Sandbox has
@@ -989,11 +1040,11 @@
     if (isNoPadMultiTilingBeneficial(contractionOp, tripleTileSizes)) {
       return setMatmulNoPadRootConfig(entryPointFn, contractionOp,
                                       tripleTileSizes, vectorSize,
-                                      preProcStrategy);
+                                      vecPreProcStrategy);
     }  // else fall back to the default configuration.
   }
   return setMatmulNoPadRootConfig(entryPointFn, contractionOp, tileSizes,
-                                  vectorSize, preProcStrategy);
+                                  vectorSize, vecPreProcStrategy);
 }
 
 /// Sets the lowering configuration for dispatch region for linalg.mmt4d root
@@ -1145,7 +1196,7 @@
 static void setX86WorkgroupTileSizes(
     linalg::GenericOp genericOp, unsigned numLoops,
     ArrayRef<int64_t> flowTileSizes, ArrayRef<int64_t> minTileSizes,
-    ArrayRef<int64_t> maxTileSizes,
+    ArrayRef<int64_t> maxTileSizes, VectorPreProcStrategy vecPreProcStrategy,
     SmallVectorImpl<int64_t> &workgroupTileSizes) {
   workgroupTileSizes.append(numLoops, 0);
   SmallVector<int64_t, 4> staticLoopRanges = genericOp.getStaticLoopRanges();
@@ -1153,7 +1204,9 @@
     if (flowTileSizes[loopNum]) {
       workgroupTileSizes[loopNum] =
           getMaxTileSize(0, flowTileSizes[loopNum], minTileSizes[loopNum],
-                         minTileSizes[loopNum]);
+                         minTileSizes[loopNum], /*allowIncompleteTile=*/false,
+                         /*enforcePowerOfTwo=*/vecPreProcStrategy ==
+                             VectorPreProcStrategy::Masking);
     } else {
       // If the flow level tile size is zero, and static loop range is 0 as
       // well, set the tile sizes here to zero as well.
@@ -1201,6 +1254,8 @@
     return success();
   }
 
+  LLVM_DEBUG(KD_DBGS() << "Setting default generic op root configuration\n");
+
   // If there are no loops, there is nothing to do.
   unsigned numLoops = genericOp.getNumLoops();
   if (numLoops == 0) {
@@ -1215,21 +1270,34 @@
   // allocation limit See #9469 for example.
   SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize / 2);
 
+  LLVM_DEBUG(KD_DBGS() << "Min tile sizes for distribution: " << minTileSizes
+                       << "\n");
+  LLVM_DEBUG(KD_DBGS() << "Max tile sizes for distribution: " << maxTileSizes
+                       << "\n");
+
   // Set the flow level tiling to the default.
   SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
       genericOp, minTileSizes, maxTileSizes);
 
+  LLVM_DEBUG(KD_DBGS() << "Final tile sizes for distribution: " << flowTileSizes
+                       << "\n");
+
+  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+  LLVM_DEBUG(KD_DBGS() << "Vectorization pre-processing strategy "
+                       << vecPreProcStrategy << "\n");
+
   // Set the next level tile sizes.
   SmallVector<int64_t> parallelTileSizes;
   SmallVector<int64_t> reductionTileSizes;
   setX86WorkgroupTileSizes(genericOp, numLoops, flowTileSizes, minTileSizes,
-                           maxTileSizes, parallelTileSizes);
+                           maxTileSizes, vecPreProcStrategy, parallelTileSizes);
   splitParallelAndReductionTiles(genericOp, parallelTileSizes,
                                  reductionTileSizes);
 
-  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
-  setVectorSizesForDynamicShapes(genericOp, vecPreProcStrategy,
-                                 parallelTileSizes, reductionTileSizes);
+  LLVM_DEBUG(KD_DBGS() << "Vectorization/unrolling tile sizes (parallel): "
+                       << parallelTileSizes << "\n");
+  LLVM_DEBUG(KD_DBGS() << "Vectorization/unrolling tile sizes (reduction): "
+                       << reductionTileSizes << "\n");
 
   TileSizesListType tileSizes;
   tileSizes.push_back(flowTileSizes);
@@ -1319,10 +1387,14 @@
   SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
       genericOp, minTileSizes, maxTileSizes);
 
+  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+  LLVM_DEBUG(KD_DBGS() << "Vectorization pre-processing strategy "
+                       << vecPreProcStrategy << "\n");
+
   // Set the next level tile sizes.
   SmallVector<int64_t> parallelTileSizes;
   setX86WorkgroupTileSizes(genericOp, numLoops, flowTileSizes, minTileSizes,
-                           maxTileSizes, parallelTileSizes);
+                           maxTileSizes, vecPreProcStrategy, parallelTileSizes);
 
   TileSizesListType tileSizes;
   tileSizes.push_back(flowTileSizes);
@@ -1361,6 +1433,9 @@
       getDefaultDistributedLevelTileSizes(genericOp, minTileSizes, maxTileSizes,
                                           /*allowIncompleteTile=*/true);
 
+  // TODO(dcaballe): The logic below is disconnected from the main tile size
+  // selection logic in getMaxTileSize. We should either port it there or remove
+  // it.
   // Adjust the number of workload per workgroup to at least 4096. This
   // prevents the runtime overheads dominating the execution time. The number is
   // derived from experiments. We should be able to make it related to target.
@@ -1388,6 +1463,10 @@
     flowTileSizes[currDim] = newSize;
   }
 
+  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+                       << vecPreProcStrategy << "\n");
+
   // Adjust tiling sizes of vector levels to avoid large unroll factors. Most of
   // the cases are f32 and i32, so we divide it by 4.
   auto nativeVecSize = getNativeVectorSizeInBytes(entryPointFn);
@@ -1395,7 +1474,10 @@
       nativeVecSize ? nativeVecSize.value() : clNativeVectorSizeInBytes;
   vecSize /= 4;
   SmallVector<int64_t> vecTileSizes(minTileSizes.begin(), minTileSizes.end());
-  for (auto &i : vecTileSizes) i = std::min(i, vecSize);
+  for (auto &i : vecTileSizes) {
+    i = roundUpToPow2(std::min(i, vecSize),
+                      vecPreProcStrategy == VectorPreProcStrategy::Masking);
+  }
 
   // Setting reduction tile sizes is a workaround to kick in peeling transform.
   // The tiling won't happen because the sizes are zeros.
@@ -1406,10 +1488,18 @@
   tileSizes.push_back(vecTileSizes);
   tileSizes.push_back(zeros);
 
-  auto passPipeline =
-      genericOp.hasTensorSemantics()
-          ? DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert
-          : DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
+  LLVM_DEBUG(KD_DBGS() << "Final tile sizes for element-wise op: " << tileSizes
+                       << "\n");
+
+  DispatchLoweringPassPipeline passPipeline;
+  if (genericOp.hasBufferSemantics()) {
+    passPipeline = DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
+  } else if (vecPreProcStrategy == VectorPreProcStrategy::Peeling) {
+    passPipeline = DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert;
+  } else {
+    passPipeline = DispatchLoweringPassPipeline::CPUDoubleTilingExpert;
+  }
+
   return setOpConfigAndEntryPointFnTranslation(entryPointFn, genericOp,
                                                tileSizes, passPipeline);
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index 6f91452..1ab380b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -32,12 +32,13 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [32, 0], [0, 16]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
-//      CHECK: hal.executable.export public @matvec_static
-// CHECK-SAME:     translation_info = #[[TRANSLATION]]
-//      CHECK: linalg.matvec
-// CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [32, 0], [0, 16]]>
+//   CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
+//       CHECK: hal.executable.export public @matvec_static
+//  CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//       CHECK: linalg.matvec
+//  CHECK-SAME:     lowering_config = #[[CONFIG]]
 
 // -----
 
@@ -183,15 +184,15 @@
     #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
-hal.executable private @add {
+hal.executable private @dynamic_add {
   hal.executable.variant @llvm, target = <"llvm-cpu", "embedded-elf-x86_64", {
     data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
     native_vector_size = 16 : index,
     target_triple = "x86_64-unknown-linux-gnu"
   }> {
-    hal.executable.export @add layout(#pipeline_layout)
+    hal.executable.export @dynamic_add layout(#pipeline_layout)
     builtin.module {
-      func.func @add() {
+      func.func @dynamic_add() {
         %c0 = arith.constant 0 : index
         %dim0 = hal.interface.constant.load[0] : index
         %dim1 = hal.interface.constant.load[1] : index
@@ -218,8 +219,8 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-//      CHECK: hal.executable.export public @add
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+//      CHECK: hal.executable.export public @dynamic_add
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
 // CHECK-SAME:     lowering_config = #[[CONFIG]]
@@ -276,7 +277,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 64, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @add4D
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
@@ -317,7 +318,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @add_static
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
@@ -544,8 +545,8 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 1]{{\]}}>
-//      CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 4]{{\]}}>
+//      CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @outs_fusion_fn
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 //      CHECK: func.func @outs_fusion_fn()
@@ -826,7 +827,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 96], [16, 16], [0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @generic_static
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK:   linalg.generic
@@ -1176,7 +1177,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 64, 64, 0, 64], [1, 1, 1, 1, 1, 1, 1, 4], [0, 0, 0, 0, 0, 0, 0, 0]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @generic_unit_dims_dynamic
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
@@ -1264,7 +1265,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [4]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @reduce_to_scalar_dynamic
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 9824bc0..ce768c2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -56,12 +56,12 @@
     }
   }
 }
-//      CHECK: func.func @check_no_cse()
-//  CHECK-NOT:    memref.alloc
-//      CHECK:    %[[FOR:.+]] = scf.for
-//      CHECK:    %[[DIVF:.+]] = arith.divf %[[FOR]]
-//      CHECK:    %[[RES:.+]] = vector.extract %[[DIVF]]
-//      CHECK:    memref.store %[[RES]]
+// CHECK-LABEL: func.func @check_no_cse()
+//   CHECK-NOT:    memref.alloc
+//       CHECK:    %[[FOR:.+]] = scf.for
+//       CHECK:    %[[DIVF:.+]] = arith.divf %[[FOR]]
+//       CHECK:    %[[RES:.+]] = vector.extract %[[DIVF]]
+//       CHECK:    memref.store %[[RES]]
 
 // -----
 
@@ -106,9 +106,10 @@
     }
   }
 }
-// CHECK: func.func @preset_config_matmul
-// CHECK:   vector.outerproduct
-// HOIST-PAD:         func.func @preset_config_matmul
+// CHECK-LABEL: func.func @preset_config_matmul
+//       CHECK:   vector.outerproduct
+
+// HOIST-PAD-LABEL:   func.func @preset_config_matmul
 // HOIST-PAD-DAG:       %[[BUF1:.+]] = memref.alloca() {{.+}} : memref<3x4x16x32xf32>
 // HOIST-PAD-DAG:       %[[BUF2:.+]] = memref.alloca() {{.+}} : memref<4x8x16xf32>
 // HOIST-PAD-16-DAG:      vector.store {{.+}}, %[[BUF1]]
@@ -167,8 +168,8 @@
     }
   }
 }
-// CHECK: func.func @batch_matmul_dynamic
-// CHECK:   vector.outerproduct
+// CHECK-LABEL: func.func @batch_matmul_dynamic
+//       CHECK:   vector.outerproduct
 
 // -----
 
@@ -208,10 +209,10 @@
     }
   }
 }
-// CHECK:      #{{.+}} = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize
-// CHECK:      func.func @check_buffer_ops_vectorization
-// CHECK:        vector.load
-// CHECK-NEXT:   vector.store
+// CHECK-LABEL:  #{{.+}} = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize
+//       CHECK:      func.func @check_buffer_ops_vectorization
+//       CHECK:        vector.load
+//  CHECK-NEXT:        vector.store
 
 // -----
 
@@ -276,12 +277,12 @@
   }
 }
 
-// CHECK:      func.func @vectorize_fill_conv2d_generic
-// CHECK-NOT:    memref.alloca
-// CHECK-NOT:    linalg.fill
-// CHECK:        vector.outerproduct %{{.+}}, %{{.+}}, %{{.+}} {kind = #vector.kind<add>}
-// CHECK-NOT:    linalg.generic
-// CHECK:        arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x8xf32>
+// CHECK-LABEL:  func.func @vectorize_fill_conv2d_generic
+//   CHECK-NOT:    memref.alloca
+//   CHECK-NOT:    linalg.fill
+//       CHECK:    vector.outerproduct %{{.+}}, %{{.+}}, %{{.+}} {kind = #vector.kind<add>}
+//   CHECK-NOT:    linalg.generic
+//       CHECK:    arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x8xf32>
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
index 37e6ffd..37afde9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
@@ -39,18 +39,18 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported()
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       scf.for
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported()
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C64:.+]] = arith.constant 64 : index
 // CHECK:         scf.for
 // CHECK:           scf.for
-// CHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK:               %[[RES:.+]] = arith.addi
-// CHECK:               scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
-// CHECK:           arith.addi %{{.+}}, %{{.+}} : vector<1x4xi32>
+// CHECK:             scf.for
+// CHECK:               scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK:                 %[[RES:.+]] = arith.addi
+// CHECK:                 scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK:               vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK:             arith.addi %{{.+}}, %{{.+}} : vector<1x4xi32>
 
 // -----
 
@@ -92,21 +92,21 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xf32>
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
+// CHECK-NOT:     scf.yield %{{.+}} : vector<1x1x4xf32>
 
-// REORDERCHECK:     func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
-// REORDERCHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// REORDERCHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// REORDERCHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// REORDERCHECK:       scf.for
+// REORDERCHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
+// REORDERCHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// REORDERCHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// REORDERCHECK-DAG:     %[[C64:.+]] = arith.constant 64 : index
 // REORDERCHECK:         scf.for
 // REORDERCHECK:           scf.for
-// REORDERCHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// REORDERCHECK:               %[[RES:.+]] = arith.addf
-// REORDERCHECK:               scf.yield %[[RES]] : vector<1x1x4xf32>
-// REORDERCHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xf32> into f32
-// REORDERCHECK:           arith.addf %{{.+}}, %{{.+}} : vector<1x4xf32>
+// REORDERCHECK:             scf.for
+// REORDERCHECK:               scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// REORDERCHECK:                 %[[RES:.+]] = arith.addf
+// REORDERCHECK:                 scf.yield %[[RES]] : vector<1x1x4xf32>
+// REORDERCHECK:               vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xf32> into f32
+// REORDERCHECK:             arith.addf %{{.+}}, %{{.+}} : vector<1x4xf32>
 
 // -----
 
@@ -144,17 +144,17 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_next_dynamic_supported()
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       scf.for
-// CHECK:         scf.for
-// CHECK:           scf.for
-// CHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK:               %[[RES:.+]] = arith.addi
-// CHECK:               scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK-LABEL:  func.func @split_reduction_innermost_reduction_next_dynamic_supported()
+// CHECK-DAG:      %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:      %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:      %[[C64:.+]] = arith.constant 64 : index
+// CHECK:          scf.for
+// CHECK:            scf.for
+// CHECK:              scf.for
+// CHECK:                scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK:                  %[[RES:.+]] = arith.addi
+// CHECK:                  scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK:                vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
 
 // -----
 
@@ -188,17 +188,17 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported()
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       scf.for
-// CHECK:         scf.for
-// CHECK:           scf.for
-// CHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK:               %[[RES:.+]] = arith.addi
-// CHECK:               scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK-LABEL:  func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported()
+// CHECK-DAG:      %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:      %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:      %[[C64:.+]] = arith.constant 64 : index
+// CHECK:          scf.for
+// CHECK:            scf.for
+// CHECK:              scf.for
+// CHECK:                scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK:                  %[[RES:.+]] = arith.addi
+// CHECK:                  scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK:                vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
 
 // -----
 
@@ -234,9 +234,8 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_dynamic_reduction_unsupported()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT:   vector.reduction
+// CHECK-LABEL:  func.func @split_reduction_innermost_dynamic_reduction_unsupported()
+// CHECK-COUNT-4:  vector.mask %{{.*}} { vector.reduction <add>
 
 // -----
 
@@ -270,9 +269,8 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_imperfect_reduction_unsupported()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT:   vector.reduction
+// CHECK-LABEL:  func.func @split_reduction_innermost_imperfect_reduction_unsupported()
+// CHECK-COUNT-4:  vector.mask %{{.*}} { vector.reduction <add>
 
 // -----
 
@@ -306,9 +304,9 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_not_innermost_reduction_unsupported()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT:   vector.reduction
+// CHECK-LABEL:  func.func @split_reduction_not_innermost_reduction_unsupported()
+// CHECK-NOT:      scf.yield %{{.+}} : vector<1x1x4xi32>
+// CHECK-NOT:      vector.reduction
 
 // -----
 
@@ -342,6 +340,6 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_double_reduction_unsupported()
-// CHECK:       vector.insertelement %{{.+}}, %{{.+}} : vector<4xi32>
-// CHECK-NOT:   vector.insertelement %{{.+}}, %{{.+}} : vector<1xi32>
+// CHECK-LABEL:  func.func @split_reduction_double_reduction_unsupported()
+// CHECK:          vector.insertelement %{{.+}}, %{{.+}} : vector<4xi32>
+// CHECK-NOT:      vector.insertelement %{{.+}}, %{{.+}} : vector<1xi32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
index 4802836..7ae64ee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
@@ -55,17 +55,68 @@
   }
 }
 
-// Masking is not applied to the main vector loop when the peeling is used.
+// Masking is applied to the main vector loop when peeling is not used.
 
 // CHECK-LABEL: func.func @mask_dynamic_generic_add
 // Main loop
-// CHECK:       scf.for
-// CHECK:         vector.load
-// CHECK:         vector.load
-// CHECK:         vector.store
-// Peel loop
-// CHECK:       scf.for
-// CHECK:         vector.maskedload
-// CHECK:         vector.maskedload
-// CHECK:         vector.maskedstore
+//         CHECK: scf.for
+// CHECK-COUNT-2:   vector.maskedload
+//         CHECK:   vector.maskedstore
+// No epilogue
+//     CHECK-NOT: scf.for
+
+// -----
+
+#compilation = #iree_codegen.compilation_info<
+    lowering_config = <tile_sizes = [[127, 255], [8, 0], [0, 32]]>,
+    translation_info  = <CPUDoubleTilingExpert>,
+    workgroup_size = []>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @preset_config_reduction  {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+    hal.executable.export @mask_dynamic_reduction layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @mask_dynamic_reduction() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %6 = arith.index_cast %0 : i32 to index
+        %7 = arith.index_cast %1 : i32 to index
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %result_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+            : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %init = tensor.empty(%6) : tensor<?xf32>
+        %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?xf32>) -> tensor<?xf32>
+        %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0)>],
+                                   iterator_types = ["parallel", "reduction"]}
+          ins(%lhs : tensor<?x?xf32>) outs(%fill : tensor<?xf32>) {
+            ^bb0(%in0: f32, %out: f32):
+          %add = arith.addf %out, %in0 : f32
+          linalg.yield %add: f32
+        } -> tensor<?xf32>
+        flow.dispatch.tensor.store %generic, %result_binding, offsets = [0], sizes = [%6], strides = [1]
+            : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
+        return
+      }
+    }
+  }
+}
+
+//   CHECK-LABEL: func.func @mask_dynamic_reduction
+// CHECK-COUNT-5:   vector.maskedload
+// CHECK-COUNT-4:   vector.mask %{{.*}} { vector.reduction <add>
+//         CHECK:   vector.maskedstore
 
diff --git a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
index 4d021c1..1c890f0 100644
--- a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
+++ b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
@@ -43,19 +43,46 @@
 // IREE specific functions
 //===----------------------------------------------------------------------===//
 
-/// Returns the op that contains lowering config. Returns failure if there are
-/// multiple op having lowering config.
-static FailureOr<Operation *> getRootOp(func::FuncOp funcOp) {
+/// Returns the op that carries the lowering config. Checks whether the
+/// provided op has a lowering config and, if so, returns it. Otherwise,
+/// searches the enclosing function for an op with a lowering config. If
+/// multiple ops have the same lowering config, returns the first one found.
+/// Returns failure if there are multiple ops with different lowering configs.
+static FailureOr<Operation *> getRootOp(Operation *op) {
+  // Check for self first.
+  if (iree_compiler::getLoweringConfig(op)) {
+    return op;
+  }
+
+  // Get the function op.
+  auto funcOp = dyn_cast<func::FuncOp>(op);
+  if (!funcOp) {
+    funcOp = op->getParentOfType<func::FuncOp>();
+  }
+
+  assert(funcOp && "Missing funcOp");
+
   Operation *rootOp = nullptr;
+  mlir::iree_compiler::IREE::Codegen::LoweringConfigAttr rootLoweringConfig;
   auto result = funcOp.walk([&](Operation *op) -> WalkResult {
-    if (!iree_compiler::getLoweringConfig(op)) return WalkResult::advance();
-    if (rootOp) {
-      return WalkResult::interrupt();
+    auto loweringConfig = iree_compiler::getLoweringConfig(op);
+    if (!loweringConfig) {
+      return WalkResult::advance();
     }
-    rootOp = op;
+    if (rootLoweringConfig) {
+      if (rootLoweringConfig != loweringConfig) {
+        return WalkResult::interrupt();
+      }
+    } else {
+      rootOp = op;
+      rootLoweringConfig = loweringConfig;
+    }
     return WalkResult::advance();
   });
-  if (!rootOp || result.wasInterrupted()) return failure();
+
+  if (!rootOp || result.wasInterrupted()) {
+    return failure();
+  }
   return rootOp;
 }
 
@@ -141,8 +168,7 @@
 // particular linalg op within that dispatch.
 static SmallVector<int64_t> getVectorSizes(
     linalg::LinalgOp linalgOp, ArrayRef<int64_t> canonicalVectorShape) {
-  FailureOr<Operation *> rootOp =
-      getRootOp(linalgOp->getParentOfType<func::FuncOp>());
+  FailureOr<Operation *> rootOp = getRootOp(linalgOp);
   if (failed(rootOp)) {
     return {};
   }
@@ -152,21 +178,6 @@
     return {};
   }
 
-  // TODO: Support masking for static shapes.
-  if (llvm::any_of(linalgOp.getStaticLoopRanges(), [](int64_t dimSize) {
-        return !ShapedType::isDynamicShape(dimSize) && dimSize != 1;
-      })) {
-    return {};
-  }
-
-  // TODO: Support masking for reduction.
-  if (llvm::any_of(linalgOp.getIteratorTypesArray(),
-                   [](utils::IteratorType iter) {
-                     return !linalg::isParallelIterator(iter);
-                   })) {
-    return {};
-  }
-
   if (canonicalVectorShape.empty()) {
     return {};
   }
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
index a205824..e96141f 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
@@ -1,5 +1,4 @@
-# TODO(#11923): Enable after fixing.
-# XFAIL: *
+# TODO(#11923): If this test fails again, check #11923.
 
 # REQUIRES: llvmcpu
 # RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=lbeta --artifacts_dir=%t
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
index eee6dbb..0753eb1 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
@@ -1,4 +1,3 @@
 # REQUIRES: llvmcpu
-# TODO(#11196): Re-enable once the issue is resolved.
-# XFAIL: *
+# TODO(#11196): If this test fails again, check #11196.
 # RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=log_softmax --artifacts_dir=%t
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
index 0f0cde2..092b784 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
@@ -1,4 +1,3 @@
 # REQUIRES: llvmcpu
-# TODO(#11196): Re-enable once the issue is resolved.
-# XFAIL: *
+# TODO(#11196): If this test fails again, check #11196.
 # RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=softmax --artifacts_dir=%t