Re-Enable reduction masking (#12472)

https://github.com/openxla/iree/pull/12003
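
This re-enables the Masking vector pre-processing strategy: on x86 and RISC-V, linalg.generic ops now get masked vectorization instead of peeling or padding. Masked vector tile sizes are rounded up to the next power of two and the out-of-bounds lanes are masked, so the dynamic-shape tests below no longer expect an epilogue loop. The following is a minimal standalone sketch of the rounding behavior the strategy relies on; it mirrors the roundUpToPow2 helper added in KernelDispatch.cpp, with std::bit_ceil standing in for llvm::PowerOf2Ceil so it builds without LLVM headers (C++20).

    // Standalone sketch (not part of the patch): how masked tile sizes are
    // rounded. std::bit_ceil is used here in place of llvm::PowerOf2Ceil,
    // which the patch itself calls.
    #include <bit>
    #include <cassert>
    #include <cstdint>
    #include <iostream>

    static int64_t roundUpToPow2(int64_t size, bool enforcePowerOfTwo) {
      if (!enforcePowerOfTwo) return size;
      assert(size > 0 && "Expected a positive size");
      return static_cast<int64_t>(std::bit_ceil(static_cast<uint64_t>(size)));
    }

    int main() {
      // With masking, a tile of 6 elements becomes an 8-wide vector whose two
      // trailing lanes are masked out instead of being peeled or padded.
      std::cout << roundUpToPow2(6, /*enforcePowerOfTwo=*/true) << "\n";   // 8
      std::cout << roundUpToPow2(6, /*enforcePowerOfTwo=*/false) << "\n";  // 6
      return 0;
    }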
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index b67b85f..2a91ba7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -126,6 +126,10 @@
   // Peel iterations from the vector dimensions so that they become multiple of
   // the vector length.
   Peeling,
+  // Compute vector dimensions assuming vector masking support. Vector sizes
+  // may be rounded up to the next power of two, and out-of-bounds elements
+  // will be masked out.
+  Masking,
   // Do not apply any vectorization pre-processing transformation.
   None
 };
@@ -140,6 +144,9 @@
     case VectorPreProcStrategy::Peeling:
       os << "Peeling";
       break;
+    case VectorPreProcStrategy::Masking:
+      os << "Masking";
+      break;
     case VectorPreProcStrategy::None:
       os << "None";
       break;
@@ -193,23 +200,42 @@
     return VectorPreProcStrategy::None;
   }
 
-  if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
-    // Peeling is only enabled on fully dynamic shape ops for now.
-    return VectorPreProcStrategy::Peeling;
-  }
-
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(linalgOp);
+  bool isLinalgGeneric = isa<linalg::GenericOp>(linalgOp.getOperation());
 
   // Default X86 specific strategy.
-  if (isX86(targetAttr) && enableVectorPadding) {
-    // Padding is only enabled on x86. It leads to too much overhead on RISC-V
-    // and ARM.
-    return VectorPreProcStrategy::Padding;
+  if (isX86(targetAttr)) {
+    if (isLinalgGeneric) {
+      return VectorPreProcStrategy::Masking;
+    }
+
+    if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
+      return VectorPreProcStrategy::Peeling;
+    }
+
+    if (enableVectorPadding) {
+      // Padding is only enabled on x86. It leads to too much overhead on
+      // RISC-V and ARM.
+      return VectorPreProcStrategy::Padding;
+    }
   }
 
   // Default RISC-V specific strategies.
-  if (isRISCV(targetAttr) && enableVectorPeeling) {
-    return VectorPreProcStrategy::Peeling;
+  if (isRISCV(targetAttr)) {
+    if (isLinalgGeneric) {
+      return VectorPreProcStrategy::Masking;
+    }
+
+    if (enableVectorPeeling) {
+      return VectorPreProcStrategy::Peeling;
+    }
+  }
+
+  // Default AArch64 specific strategies.
+  if (isAArch64(targetAttr)) {
+    if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
+      return VectorPreProcStrategy::Peeling;
+    }
   }
 
   return VectorPreProcStrategy::None;
@@ -429,17 +455,33 @@
   return distributedTileSizes;
 }
 
+/// Returns the smallest power of two greater than or equal to `size` if
+/// `predicate` is true. Otherwise, returns `size`.
+static int64_t roundUpToPow2(int64_t size, bool predicate) {
+  if (!predicate) {
+    return size;
+  }
+  assert(size > 0 && "Expected a positive size");
+  return llvm::PowerOf2Ceil(size);
+}
+
 /// Adjusts the workload per workgroup to be a multiple of vector size to ensure
 /// that the op vectorizes.
 static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
                               int64_t vectorSize,
-                              bool allowIncompleteTile = false) {
+                              bool allowIncompleteTile = false,
+                              bool enforcePowerOfTwo = false) {
   if (ub == ShapedType::kDynamic || lb == ShapedType::kDynamic) {
-    return maxSize;
+    return roundUpToPow2(maxSize, enforcePowerOfTwo);
   }
   int64_t numIters = ub - lb;
   if (numIters <= maxSize && numIters < vectorSize) {
-    return numIters;
+    return roundUpToPow2(numIters, enforcePowerOfTwo);
+  }
+
+  // Round up to the next power of two if power-of-two sizes are enforced.
+  if (enforcePowerOfTwo) {
+    return roundUpToPow2(std::min(maxSize, numIters), enforcePowerOfTwo);
   }
 
   int64_t scaledUB = std::min(maxSize, numIters) / vectorSize * vectorSize;
@@ -580,6 +622,11 @@
                                          parallelSizes.end());
   SmallVector<int64_t> origReductionSizes(reductionSizes.begin(),
                                           reductionSizes.end());
+  // Masking doesn't need any dim set to 1.
+  if (vecPreProcStrategy == VectorPreProcStrategy::Masking) {
+    return;
+  }
+
   setAlwaysVectorizeSizes(op, parallelSizes, reductionSizes);
 
   // If peeling is enabled and the 'op' is fully dynamic, we only vectorize the
@@ -763,11 +810,14 @@
   SmallVector<int64_t> parallelTileSizes;
   for (auto [index, tileSize] : llvm::enumerate(workgroupTileSizes)) {
     int64_t sz = tileSize;
+    bool allowIncompleteTile =
+        vecPreProcStrategy == VectorPreProcStrategy::Peeling ||
+        vecPreProcStrategy == VectorPreProcStrategy::Masking;
+
     if (sz != 0) {
-      sz = getMaxTileSize(/*lb=*/0, /*ub=*/shape[index],
-                          /*maxTileSize=*/sz, vectorSize,
-                          /*allowIncompleteTile=*/vecPreProcStrategy ==
-                              VectorPreProcStrategy::Peeling);
+      sz = getMaxTileSize(
+          /*lb=*/0, /*ub=*/shape[index],
+          /*maxTileSize=*/sz, vectorSize, allowIncompleteTile);
     }
     parallelTileSizes.push_back(sz);
   }
@@ -940,11 +990,12 @@
   // works for linalg.matmul cases. We can relax it once we have better
   // scheduling, e.g., transform dialect.
   SmallVector<int64_t> flowTileSizes;
-  auto preProcStrategy = getVectorPreProcStrategy(linalgOp);
-  bool usePaddingPipeline = preProcStrategy == VectorPreProcStrategy::Padding;
+  auto vecPreProcStrategy = getVectorPreProcStrategy(linalgOp);
+  bool usePaddingPipeline =
+      vecPreProcStrategy == VectorPreProcStrategy::Padding;
 
-  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: " << preProcStrategy
-                       << "\n");
+  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+                       << vecPreProcStrategy << "\n");
 
   if (usePaddingPipeline) {
     // It's inspired from Sandbox configuration. Sandbox has
@@ -989,11 +1040,11 @@
     if (isNoPadMultiTilingBeneficial(contractionOp, tripleTileSizes)) {
       return setMatmulNoPadRootConfig(entryPointFn, contractionOp,
                                       tripleTileSizes, vectorSize,
-                                      preProcStrategy);
+                                      vecPreProcStrategy);
     }  // else fall back to the default configuration.
   }
   return setMatmulNoPadRootConfig(entryPointFn, contractionOp, tileSizes,
-                                  vectorSize, preProcStrategy);
+                                  vectorSize, vecPreProcStrategy);
 }
 
 /// Sets the lowering configuration for dispatch region for linalg.mmt4d root
@@ -1135,7 +1186,7 @@
 static void setX86WorkgroupTileSizes(
     linalg::GenericOp genericOp, unsigned numLoops,
     ArrayRef<int64_t> flowTileSizes, ArrayRef<int64_t> minTileSizes,
-    ArrayRef<int64_t> maxTileSizes,
+    ArrayRef<int64_t> maxTileSizes, VectorPreProcStrategy vecPreProcStrategy,
     SmallVectorImpl<int64_t> &workgroupTileSizes) {
   workgroupTileSizes.append(numLoops, 0);
   SmallVector<int64_t, 4> staticLoopRanges = genericOp.getStaticLoopRanges();
@@ -1143,7 +1194,9 @@
     if (flowTileSizes[loopNum]) {
       workgroupTileSizes[loopNum] =
           getMaxTileSize(0, flowTileSizes[loopNum], minTileSizes[loopNum],
-                         minTileSizes[loopNum]);
+                         minTileSizes[loopNum], /*allowIncompleteTile=*/false,
+                         /*enforcePowerOfTwo=*/vecPreProcStrategy ==
+                             VectorPreProcStrategy::Masking);
     } else {
       // If the flow level tile size is zero, and static loop range is 0 as
       // well, set the tile sizes here to zero as well.
@@ -1191,6 +1244,8 @@
     return success();
   }
 
+  LLVM_DEBUG(KD_DBGS() << "Setting default generic op root configuration\n");
+
   // If there are no loops, there is nothing to do.
   unsigned numLoops = genericOp.getNumLoops();
   if (numLoops == 0) {
@@ -1205,21 +1260,34 @@
   // allocation limit. See #9469 for example.
   SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize / 2);
 
+  LLVM_DEBUG(KD_DBGS() << "Min tile sizes for distribution: " << minTileSizes
+                       << "\n");
+  LLVM_DEBUG(KD_DBGS() << "Max tile sizes for distribution: " << maxTileSizes
+                       << "\n");
+
   // Set the flow level tiling to the default.
   SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
       genericOp, minTileSizes, maxTileSizes);
 
+  LLVM_DEBUG(KD_DBGS() << "Final tile sizes for distribution: " << flowTileSizes
+                       << "\n");
+
+  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+                       << vecPreProcStrategy << "\n");
+
   // Set the next level tile sizes.
   SmallVector<int64_t> parallelTileSizes;
   SmallVector<int64_t> reductionTileSizes;
   setX86WorkgroupTileSizes(genericOp, numLoops, flowTileSizes, minTileSizes,
-                           maxTileSizes, parallelTileSizes);
+                           maxTileSizes, vecPreProcStrategy, parallelTileSizes);
   splitParallelAndReductionTiles(genericOp, parallelTileSizes,
                                  reductionTileSizes);
 
-  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
-  setVectorSizesForDynamicShapes(genericOp, vecPreProcStrategy,
-                                 parallelTileSizes, reductionTileSizes);
+  LLVM_DEBUG(KD_DBGS() << "Vectorization/unrolling tile sizes (parallel): "
+                       << parallelTileSizes << "\n");
+  LLVM_DEBUG(KD_DBGS() << "Vectorization/unrolling tile sizes (reduction): "
+                       << reductionTileSizes << "\n");
 
   TileSizesListType tileSizes;
   tileSizes.push_back(flowTileSizes);
@@ -1309,10 +1377,14 @@
   SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
       genericOp, minTileSizes, maxTileSizes);
 
+  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+                       << vecPreProcStrategy << "\n");
+
   // Set the next level tile sizes.
   SmallVector<int64_t> parallelTileSizes;
   setX86WorkgroupTileSizes(genericOp, numLoops, flowTileSizes, minTileSizes,
-                           maxTileSizes, parallelTileSizes);
+                           maxTileSizes, vecPreProcStrategy, parallelTileSizes);
 
   TileSizesListType tileSizes;
   tileSizes.push_back(flowTileSizes);
@@ -1351,6 +1423,9 @@
       getDefaultDistributedLevelTileSizes(genericOp, minTileSizes, maxTileSizes,
                                           /*allowIncompleteTile=*/true);
 
+  // TODO(dcaballe): The logic below is disconnected from the main tile size
+  // selection logic in getMaxTileSize. We should either port it there or remove
+  // it.
   // Adjust the workload per workgroup to be at least 4096. This prevents the
   // runtime overheads from dominating the execution time. The number is
   // derived from experiments. We should be able to make it related to target.
@@ -1378,6 +1453,10 @@
     flowTileSizes[currDim] = newSize;
   }
 
+  auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+  LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+                       << vecPreProcStrategy << "\n");
+
   // Adjust tiling sizes of vector levels to avoid large unroll factors. Most of
   // the cases are f32 and i32, so we divide it by 4.
   auto nativeVecSize = getNativeVectorSizeInBytes(entryPointFn);
@@ -1385,7 +1464,10 @@
       nativeVecSize ? nativeVecSize.value() : clNativeVectorSizeInBytes;
   vecSize /= 4;
   SmallVector<int64_t> vecTileSizes(minTileSizes.begin(), minTileSizes.end());
-  for (auto &i : vecTileSizes) i = std::min(i, vecSize);
+  for (auto &i : vecTileSizes) {
+    i = roundUpToPow2(std::min(i, vecSize),
+                      vecPreProcStrategy == VectorPreProcStrategy::Masking);
+  }
 
   // Setting reduction tile sizes is a workaround to kick in peeling transform.
   // The tiling won't happen because the sizes are zeros.
@@ -1396,10 +1478,18 @@
   tileSizes.push_back(vecTileSizes);
   tileSizes.push_back(zeros);
 
-  auto passPipeline =
-      genericOp.hasTensorSemantics()
-          ? DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert
-          : DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
+  LLVM_DEBUG(KD_DBGS() << "Final tile sizes for element-wise op: " << tileSizes
+                       << "\n");
+
+  DispatchLoweringPassPipeline passPipeline;
+  if (genericOp.hasBufferSemantics()) {
+    passPipeline = DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
+  } else if (vecPreProcStrategy == VectorPreProcStrategy::Peeling) {
+    passPipeline = DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert;
+  } else {
+    passPipeline = DispatchLoweringPassPipeline::CPUDoubleTilingExpert;
+  }
+
   return setOpConfigAndEntryPointFnTranslation(entryPointFn, genericOp,
                                                tileSizes, passPipeline);
 }
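
For reference, the updated getMaxTileSize behavior with enforcePowerOfTwo can be summarized with a simplified standalone mirror. This is a sketch assuming only the paths shown in the hunk above; the remaining multiple-of-vector-size adjustments are elided, and ShapedType::kDynamic is modeled with a sentinel.

    // Simplified mirror of getMaxTileSize for illustration only; not the IREE
    // implementation.
    #include <algorithm>
    #include <bit>
    #include <cstdint>
    #include <iostream>
    #include <limits>

    constexpr int64_t kDynamic = std::numeric_limits<int64_t>::min();

    static int64_t roundUpToPow2(int64_t size, bool enforcePowerOfTwo) {
      if (!enforcePowerOfTwo) return size;
      return static_cast<int64_t>(std::bit_ceil(static_cast<uint64_t>(size)));
    }

    static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
                                  int64_t vectorSize, bool enforcePowerOfTwo) {
      if (ub == kDynamic || lb == kDynamic)
        return roundUpToPow2(maxSize, enforcePowerOfTwo);
      int64_t numIters = ub - lb;
      if (numIters <= maxSize && numIters < vectorSize)
        return roundUpToPow2(numIters, enforcePowerOfTwo);
      if (enforcePowerOfTwo)
        return roundUpToPow2(std::min(maxSize, numIters), enforcePowerOfTwo);
      // Non-masking path: snap down to a multiple of the vector size
      // (further adjustments from the real implementation elided).
      return std::min(maxSize, numIters) / vectorSize * vectorSize;
    }

    int main() {
      // Dynamic trip count: masked tile = power-of-two ceiling of maxSize.
      std::cout << getMaxTileSize(0, kDynamic, 6, 8, true) << "\n";  // 8
      // Small static trip count (3 iters): rounded up to 4 so one masked
      // vector covers it.
      std::cout << getMaxTileSize(0, 3, 8, 8, true) << "\n";         // 4
      // Without masking, the small trip count is kept as-is.
      std::cout << getMaxTileSize(0, 3, 8, 8, false) << "\n";        // 3
      return 0;
    }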
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index 07fae72..b7e4ff3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -32,12 +32,13 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [32, 0], [0, 16]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
-//      CHECK: hal.executable.export public @matvec_static
-// CHECK-SAME:     translation_info = #[[TRANSLATION]]
-//      CHECK: linalg.matvec
-// CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [32, 0], [0, 16]]>
+//   CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
+//       CHECK: hal.executable.export public @matvec_static
+//  CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//       CHECK: linalg.matvec
+//  CHECK-SAME:     lowering_config = #[[CONFIG]]
 
 // -----
 
@@ -183,15 +184,15 @@
     #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
-hal.executable private @add {
+hal.executable private @dynamic_add {
   hal.executable.variant @llvm, target = <"llvm-cpu", "embedded-elf-x86_64", {
     data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
     native_vector_size = 16 : index,
     target_triple = "x86_64-unknown-linux-gnu"
   }> {
-    hal.executable.export @add layout(#pipeline_layout)
+    hal.executable.export @dynamic_add layout(#pipeline_layout)
     builtin.module {
-      func.func @add() {
+      func.func @dynamic_add() {
         %c0 = arith.constant 0 : index
         %dim0 = hal.interface.constant.load[0] : index
         %dim1 = hal.interface.constant.load[1] : index
@@ -218,8 +219,8 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-//      CHECK: hal.executable.export public @add
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+//      CHECK: hal.executable.export public @dynamic_add
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
 // CHECK-SAME:     lowering_config = #[[CONFIG]]
@@ -276,7 +277,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 64, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @add4D
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
@@ -317,7 +318,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @add_static
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
@@ -544,8 +545,8 @@
     }
   }
 }
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 1]{{\]}}>
-//      CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 4]{{\]}}>
+//      CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @outs_fusion_fn
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 //      CHECK: func.func @outs_fusion_fn()
@@ -826,7 +827,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 96], [16, 16], [0, 0]]>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @generic_static
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK:   linalg.generic
@@ -1176,7 +1177,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 64, 64, 0, 64], [1, 1, 1, 1, 1, 1, 1, 4], [0, 0, 0, 0, 0, 0, 0, 0]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @generic_unit_dims_dynamic
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
@@ -1264,7 +1265,7 @@
   }
 }
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [4]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @reduce_to_scalar_dynamic
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK: linalg.generic
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 9824bc0..ce768c2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -56,12 +56,12 @@
     }
   }
 }
-//      CHECK: func.func @check_no_cse()
-//  CHECK-NOT:    memref.alloc
-//      CHECK:    %[[FOR:.+]] = scf.for
-//      CHECK:    %[[DIVF:.+]] = arith.divf %[[FOR]]
-//      CHECK:    %[[RES:.+]] = vector.extract %[[DIVF]]
-//      CHECK:    memref.store %[[RES]]
+// CHECK-LABEL: func.func @check_no_cse()
+//   CHECK-NOT:    memref.alloc
+//       CHECK:    %[[FOR:.+]] = scf.for
+//       CHECK:    %[[DIVF:.+]] = arith.divf %[[FOR]]
+//       CHECK:    %[[RES:.+]] = vector.extract %[[DIVF]]
+//       CHECK:    memref.store %[[RES]]
 
 // -----
 
@@ -106,9 +106,10 @@
     }
   }
 }
-// CHECK: func.func @preset_config_matmul
-// CHECK:   vector.outerproduct
-// HOIST-PAD:         func.func @preset_config_matmul
+// CHECK-LABEL: func.func @preset_config_matmul
+//       CHECK:   vector.outerproduct
+
+// HOIST-PAD-LABEL:   func.func @preset_config_matmul
 // HOIST-PAD-DAG:       %[[BUF1:.+]] = memref.alloca() {{.+}} : memref<3x4x16x32xf32>
 // HOIST-PAD-DAG:       %[[BUF2:.+]] = memref.alloca() {{.+}} : memref<4x8x16xf32>
 // HOIST-PAD-16-DAG:      vector.store {{.+}}, %[[BUF1]]
@@ -167,8 +168,8 @@
     }
   }
 }
-// CHECK: func.func @batch_matmul_dynamic
-// CHECK:   vector.outerproduct
+// CHECK-LABEL: func.func @batch_matmul_dynamic
+//       CHECK:   vector.outerproduct
 
 // -----
 
@@ -208,10 +209,10 @@
     }
   }
 }
-// CHECK:      #{{.+}} = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize
-// CHECK:      func.func @check_buffer_ops_vectorization
-// CHECK:        vector.load
-// CHECK-NEXT:   vector.store
+// CHECK-LABEL:  #{{.+}} = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize
+//       CHECK:      func.func @check_buffer_ops_vectorization
+//       CHECK:        vector.load
+//  CHECK-NEXT:        vector.store
 
 // -----
 
@@ -276,12 +277,12 @@
   }
 }
 
-// CHECK:      func.func @vectorize_fill_conv2d_generic
-// CHECK-NOT:    memref.alloca
-// CHECK-NOT:    linalg.fill
-// CHECK:        vector.outerproduct %{{.+}}, %{{.+}}, %{{.+}} {kind = #vector.kind<add>}
-// CHECK-NOT:    linalg.generic
-// CHECK:        arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x8xf32>
+// CHECK-LABEL:  func.func @vectorize_fill_conv2d_generic
+//   CHECK-NOT:    memref.alloca
+//   CHECK-NOT:    linalg.fill
+//       CHECK:    vector.outerproduct %{{.+}}, %{{.+}}, %{{.+}} {kind = #vector.kind<add>}
+//   CHECK-NOT:    linalg.generic
+//       CHECK:    arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x8xf32>
 
 // -----
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
index 37e6ffd..37afde9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
@@ -39,18 +39,18 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported()
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       scf.for
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported()
+// CHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C64:.+]] = arith.constant 64 : index
 // CHECK:         scf.for
 // CHECK:           scf.for
-// CHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK:               %[[RES:.+]] = arith.addi
-// CHECK:               scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
-// CHECK:           arith.addi %{{.+}}, %{{.+}} : vector<1x4xi32>
+// CHECK:             scf.for
+// CHECK:               scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK:                 %[[RES:.+]] = arith.addi
+// CHECK:                 scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK:               vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK:             arith.addi %{{.+}}, %{{.+}} : vector<1x4xi32>
 
 // -----
 
@@ -92,21 +92,21 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xf32>
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
+// CHECK-NOT:     scf.yield %{{.+}} : vector<1x1x4xf32>
 
-// REORDERCHECK:     func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
-// REORDERCHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// REORDERCHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// REORDERCHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// REORDERCHECK:       scf.for
+// REORDERCHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
+// REORDERCHECK-DAG:     %[[C0:.+]] = arith.constant 0 : index
+// REORDERCHECK-DAG:     %[[C1:.+]] = arith.constant 1 : index
+// REORDERCHECK-DAG:     %[[C64:.+]] = arith.constant 64 : index
 // REORDERCHECK:         scf.for
 // REORDERCHECK:           scf.for
-// REORDERCHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// REORDERCHECK:               %[[RES:.+]] = arith.addf
-// REORDERCHECK:               scf.yield %[[RES]] : vector<1x1x4xf32>
-// REORDERCHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xf32> into f32
-// REORDERCHECK:           arith.addf %{{.+}}, %{{.+}} : vector<1x4xf32>
+// REORDERCHECK:             scf.for
+// REORDERCHECK:               scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// REORDERCHECK:                 %[[RES:.+]] = arith.addf
+// REORDERCHECK:                 scf.yield %[[RES]] : vector<1x1x4xf32>
+// REORDERCHECK:               vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xf32> into f32
+// REORDERCHECK:             arith.addf %{{.+}}, %{{.+}} : vector<1x4xf32>
 
 // -----
 
@@ -144,17 +144,17 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_next_dynamic_supported()
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       scf.for
-// CHECK:         scf.for
-// CHECK:           scf.for
-// CHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK:               %[[RES:.+]] = arith.addi
-// CHECK:               scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK-LABEL:  func.func @split_reduction_innermost_reduction_next_dynamic_supported()
+// CHECK-DAG:      %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:      %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:      %[[C64:.+]] = arith.constant 64 : index
+// CHECK:          scf.for
+// CHECK:            scf.for
+// CHECK:              scf.for
+// CHECK:                scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK:                  %[[RES:.+]] = arith.addi
+// CHECK:                  scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK:                vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
 
 // -----
 
@@ -188,17 +188,17 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported()
-// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C64:.+]] = arith.constant 64 : index
-// CHECK:       scf.for
-// CHECK:         scf.for
-// CHECK:           scf.for
-// CHECK:             scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK:               %[[RES:.+]] = arith.addi
-// CHECK:               scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK:             vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK-LABEL:  func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported()
+// CHECK-DAG:      %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:      %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:      %[[C64:.+]] = arith.constant 64 : index
+// CHECK:          scf.for
+// CHECK:            scf.for
+// CHECK:              scf.for
+// CHECK:                scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK:                  %[[RES:.+]] = arith.addi
+// CHECK:                  scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK:                vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
 
 // -----
 
@@ -234,9 +234,8 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_dynamic_reduction_unsupported()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT:   vector.reduction
+// CHECK-LABEL:  func.func @split_reduction_innermost_dynamic_reduction_unsupported()
+// CHECK-COUNT-4:  vector.mask %{{.*}} { vector.reduction <add>
 
 // -----
 
@@ -270,9 +269,8 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_innermost_imperfect_reduction_unsupported()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT:   vector.reduction
+// CHECK-LABEL:  func.func @split_reduction_innermost_imperfect_reduction_unsupported()
+// CHECK-COUNT-4:  vector.mask %{{.*}} { vector.reduction <add>
 
 // -----
 
@@ -306,9 +304,9 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_not_innermost_reduction_unsupported()
-// CHECK-NOT:   scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT:   vector.reduction
+// CHECK-LABEL:  func.func @split_reduction_not_innermost_reduction_unsupported()
+// CHECK-NOT:      scf.yield %{{.+}} : vector<1x1x4xi32>
+// CHECK-NOT:      vector.reduction
 
 // -----
 
@@ -342,6 +340,6 @@
   }
 }
 
-// CHECK:     func.func @split_reduction_double_reduction_unsupported()
-// CHECK:       vector.insertelement %{{.+}}, %{{.+}} : vector<4xi32>
-// CHECK-NOT:   vector.insertelement %{{.+}}, %{{.+}} : vector<1xi32>
+// CHECK-LABEL:  func.func @split_reduction_double_reduction_unsupported()
+// CHECK:          vector.insertelement %{{.+}}, %{{.+}} : vector<4xi32>
+// CHECK-NOT:      vector.insertelement %{{.+}}, %{{.+}} : vector<1xi32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
index 4802836..7ae64ee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
@@ -55,17 +55,68 @@
   }
 }
 
-// Masking is not applied to the main vector loop when the peeling is used.
+// Masking is applied to the main vector loop when peeling is not used.
 
 // CHECK-LABEL: func.func @mask_dynamic_generic_add
 // Main loop
-// CHECK:       scf.for
-// CHECK:         vector.load
-// CHECK:         vector.load
-// CHECK:         vector.store
-// Peel loop
-// CHECK:       scf.for
-// CHECK:         vector.maskedload
-// CHECK:         vector.maskedload
-// CHECK:         vector.maskedstore
+//         CHECK: scf.for
+// CHECK-COUNT-2:   vector.maskedload
+//         CHECK:   vector.maskedstore
+// No epilogue
+//     CHECK-NOT: scf.for
+
+// -----
+
+#compilation = #iree_codegen.compilation_info<
+    lowering_config = <tile_sizes = [[127, 255], [8, 0], [0, 32]]>,
+    translation_info  = <CPUDoubleTilingExpert>,
+    workgroup_size = []>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+hal.executable private @preset_config_reduction  {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+    hal.executable.export @mask_dynamic_reduction layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @mask_dynamic_reduction() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %6 = arith.index_cast %0 : i32 to index
+        %7 = arith.index_cast %1 : i32 to index
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %result_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+            : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %init = tensor.empty(%6) : tensor<?xf32>
+        %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?xf32>) -> tensor<?xf32>
+        %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0)>],
+                                   iterator_types = ["parallel", "reduction"]}
+          ins(%lhs : tensor<?x?xf32>) outs(%fill : tensor<?xf32>) attrs = {compilation_info = #compilation} {
+            ^bb0(%in0: f32, %out: f32):
+          %add = arith.addf %out, %in0 : f32
+          linalg.yield %add: f32
+        } -> tensor<?xf32>
+        flow.dispatch.tensor.store %generic, %result_binding, offsets = [0], sizes = [%6], strides = [1]
+            : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
+        return
+      }
+    }
+  }
+}
+
+//   CHECK-LABEL: func.func @mask_dynamic_reduction
+// CHECK-COUNT-5:   vector.maskedload
+// CHECK-COUNT-4:   vector.mask %{{.*}} { vector.reduction <add>
+//         CHECK:   vector.maskedstore
 
diff --git a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
index 4d021c1..1c890f0 100644
--- a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
+++ b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
@@ -43,19 +43,46 @@
 // IREE specific functions
 //===----------------------------------------------------------------------===//
 
-/// Returns the op that contains lowering config. Returns failure if there are
-/// multiple op having lowering config.
-static FailureOr<Operation *> getRootOp(func::FuncOp funcOp) {
+/// Returns the op that carries the lowering config. Checks whether the
+/// provided op has a lowering config and, if so, returns it. Otherwise, walks
+/// the enclosing function looking for one. If there are multiple ops with the
+/// same lowering config, returns the first one found. Returns failure if
+/// there are multiple ops with different lowering configs.
+static FailureOr<Operation *> getRootOp(Operation *op) {
+  // Check for self first.
+  if (iree_compiler::getLoweringConfig(op)) {
+    return op;
+  }
+
+  // Get the function op.
+  auto funcOp = dyn_cast<func::FuncOp>(op);
+  if (!funcOp) {
+    funcOp = op->getParentOfType<func::FuncOp>();
+  }
+
+  assert(funcOp && "Missing funcOp");
+
   Operation *rootOp = nullptr;
+  mlir::iree_compiler::IREE::Codegen::LoweringConfigAttr rootLoweringConfig;
   auto result = funcOp.walk([&](Operation *op) -> WalkResult {
-    if (!iree_compiler::getLoweringConfig(op)) return WalkResult::advance();
-    if (rootOp) {
-      return WalkResult::interrupt();
+    auto loweringConfig = iree_compiler::getLoweringConfig(op);
+    if (!loweringConfig) {
+      return WalkResult::advance();
     }
-    rootOp = op;
+    if (rootLoweringConfig) {
+      if (rootLoweringConfig != loweringConfig) {
+        return WalkResult::interrupt();
+      }
+    } else {
+      rootOp = op;
+      rootLoweringConfig = loweringConfig;
+    }
     return WalkResult::advance();
   });
-  if (!rootOp || result.wasInterrupted()) return failure();
+
+  if (!rootOp || result.wasInterrupted()) {
+    return failure();
+  }
   return rootOp;
 }
 
@@ -141,8 +168,7 @@
 // particular linalg op within that dispatch.
 static SmallVector<int64_t> getVectorSizes(
     linalg::LinalgOp linalgOp, ArrayRef<int64_t> canonicalVectorShape) {
-  FailureOr<Operation *> rootOp =
-      getRootOp(linalgOp->getParentOfType<func::FuncOp>());
+  FailureOr<Operation *> rootOp = getRootOp(linalgOp);
   if (failed(rootOp)) {
     return {};
   }
@@ -152,21 +178,6 @@
     return {};
   }
 
-  // TODO: Support masking for static shapes.
-  if (llvm::any_of(linalgOp.getStaticLoopRanges(), [](int64_t dimSize) {
-        return !ShapedType::isDynamicShape(dimSize) && dimSize != 1;
-      })) {
-    return {};
-  }
-
-  // TODO: Support masking for reduction.
-  if (llvm::any_of(linalgOp.getIteratorTypesArray(),
-                   [](utils::IteratorType iter) {
-                     return !linalg::isParallelIterator(iter);
-                   })) {
-    return {};
-  }
-
   if (canonicalVectorShape.empty()) {
     return {};
   }
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
index a205824..e96141f 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
@@ -1,5 +1,4 @@
-# TODO(#11923): Enable after fixing.
-# XFAIL: *
+# TODO(#11923): If this test fails again, check #11923.
 
 # REQUIRES: llvmcpu
 # RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=lbeta --artifacts_dir=%t
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
index eee6dbb..0753eb1 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
@@ -1,4 +1,3 @@
 # REQUIRES: llvmcpu
-# TODO(#11196): Re-enable once the issue is resolved.
-# XFAIL: *
+# TODO(#11196): If this test fails again, check #11196.
 # RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=log_softmax --artifacts_dir=%t
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
index 0f0cde2..092b784 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
@@ -1,4 +1,3 @@
 # REQUIRES: llvmcpu
-# TODO(#11196): Re-enable once the issue is resolved.
-# XFAIL: *
+# TODO(#11196): If this test fails again, check #11196.
 # RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=softmax --artifacts_dir=%t