Re-Enable reduction masking (#12472)
https://github.com/openxla/iree/pull/12003
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index b67b85f..2a91ba7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -126,6 +126,10 @@
// Peel iterations from the vector dimensions so that they become multiple of
// the vector length.
Peeling,
+ // Compute vector dimensions assuming vector masking support. Vector sizes may
+ // be rounded up to the next power of two and out-of-bounds elements will
+ // be masked out.
+ Masking,
// Do not apply any vectorization pre-processing transformation.
None
};
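
  For intuition about the new strategy (an aside, not part of the patch): with masking, a dimension of, say, 6 elements is vectorized with a vector length of llvm::PowerOf2Ceil(6) == 8, and the two trailing lanes are disabled by a mask, so no peeled or padded epilogue loop is needed. A minimal sketch of that arithmetic, assuming only LLVM's MathExtras header; the struct and function names are illustrative:

    #include "llvm/Support/MathExtras.h"
    #include <cstdint>

    // Sketch: choose a power-of-two vector length for `numElements` and report
    // how many trailing lanes the mask will disable.
    struct MaskedVectorPlan {
      int64_t vectorLength;   // e.g. 8 when numElements == 6
      int64_t inactiveLanes;  // e.g. 2 lanes masked out
    };

    MaskedVectorPlan planMaskedVector(int64_t numElements) {
      auto len = static_cast<int64_t>(llvm::PowerOf2Ceil(numElements));
      return {len, len - numElements};
    }
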
@@ -140,6 +144,9 @@
case VectorPreProcStrategy::Peeling:
os << "Peeling";
break;
+ case VectorPreProcStrategy::Masking:
+ os << "Masking";
+ break;
case VectorPreProcStrategy::None:
os << "None";
break;
@@ -193,23 +200,42 @@
return VectorPreProcStrategy::None;
}
- if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
- // Peeling is only enabled on fully dynamic shape ops for now.
- return VectorPreProcStrategy::Peeling;
- }
-
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(linalgOp);
+ bool isLinalgGeneric = isa<linalg::GenericOp>(linalgOp.getOperation());
// Default X86 specific strategy.
- if (isX86(targetAttr) && enableVectorPadding) {
- // Padding is only enabled on x86. It leads to too much overhead on RISC-V
- // and ARM.
- return VectorPreProcStrategy::Padding;
+ if (isX86(targetAttr)) {
+ if (isLinalgGeneric) {
+ return VectorPreProcStrategy::Masking;
+ }
+
+ if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
+ return VectorPreProcStrategy::Peeling;
+ }
+
+ if (enableVectorPadding) {
+ // Padding is only enabled on x86. It leads to too much overhead on
+ // RISC-V and ARM.
+ return VectorPreProcStrategy::Padding;
+ }
}
// Default RISC-V specific strategies.
- if (isRISCV(targetAttr) && enableVectorPeeling) {
- return VectorPreProcStrategy::Peeling;
+ if (isRISCV(targetAttr)) {
+ if (isLinalgGeneric) {
+ return VectorPreProcStrategy::Masking;
+ }
+
+ if (enableVectorPeeling) {
+ return VectorPreProcStrategy::Peeling;
+ }
+ }
+
+ // Default AArch64 specific strategies.
+ if (isAArch64(targetAttr)) {
+ if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
+ return VectorPreProcStrategy::Peeling;
+ }
}
return VectorPreProcStrategy::None;
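
  Condensed, the new per-target selection order reads as the sketch below. This is only a restatement of the hunk above using the same predicates and flags; the helper name summarizeStrategy is illustrative:

    // Restatement only; the authoritative logic is getVectorPreProcStrategy above.
    static VectorPreProcStrategy summarizeStrategy(linalg::LinalgOp op) {
      auto target = IREE::HAL::ExecutableTargetAttr::lookup(op);
      bool isGeneric = isa<linalg::GenericOp>(op.getOperation());
      bool dynamicPeeling = isFullyDynamicOp(op) && enableVectorPeeling;
      if (isX86(target)) {
        if (isGeneric) return VectorPreProcStrategy::Masking;
        if (dynamicPeeling) return VectorPreProcStrategy::Peeling;
        if (enableVectorPadding) return VectorPreProcStrategy::Padding;
      } else if (isRISCV(target)) {
        if (isGeneric) return VectorPreProcStrategy::Masking;
        if (enableVectorPeeling) return VectorPreProcStrategy::Peeling;
      } else if (isAArch64(target)) {
        if (dynamicPeeling) return VectorPreProcStrategy::Peeling;
      }
      return VectorPreProcStrategy::None;
    }
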
@@ -429,17 +455,33 @@
return distributedTileSizes;
}
+/// Returns the next power of two greater than or equal to `size` if
+/// `predicate` is true. Otherwise, returns `size`.
+static int64_t roundUpToPow2(int64_t size, bool predicate) {
+ if (!predicate) {
+ return size;
+ }
+ assert(size > 0 && "Negative size");
+ return llvm::PowerOf2Ceil(size);
+}
+
/// Adjusts the workload per workgroup to be a multiple of vector size to ensure
/// that the op vectorizes.
static int64_t getMaxTileSize(int64_t lb, int64_t ub, int64_t maxSize,
int64_t vectorSize,
- bool allowIncompleteTile = false) {
+ bool allowIncompleteTile = false,
+ bool enforcePowerOfTwo = false) {
if (ub == ShapedType::kDynamic || lb == ShapedType::kDynamic) {
- return maxSize;
+ return roundUpToPow2(maxSize, enforcePowerOfTwo);
}
int64_t numIters = ub - lb;
if (numIters <= maxSize && numIters < vectorSize) {
- return numIters;
+ return roundUpToPow2(numIters, enforcePowerOfTwo);
+ }
+
+ // Round up to the next power of two if a power of two is enforced.
+ if (enforcePowerOfTwo) {
+ return roundUpToPow2(std::min(maxSize, numIters), enforcePowerOfTwo);
}
int64_t scaledUB = std::min(maxSize, numIters) / vectorSize * vectorSize;
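
  With enforcePowerOfTwo set (as setX86WorkgroupTileSizes now does under the Masking strategy), the static-bound cases above collapse to rounding min(maxSize, numIters) up to the next power of two. A tiny standalone trace of that behavior, not part of the patch; values are illustrative and only llvm::PowerOf2Ceil is assumed:

    #include "llvm/Support/MathExtras.h"
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      // Condensed power-of-two path for static bounds, for illustration only.
      auto pow2TileSize = [](int64_t numIters, int64_t maxSize) {
        return static_cast<int64_t>(llvm::PowerOf2Ceil(std::min(maxSize, numIters)));
      };
      assert(pow2TileSize(/*numIters=*/5, /*maxSize=*/24) == 8);    // 5 -> 8
      assert(pow2TileSize(/*numIters=*/100, /*maxSize=*/24) == 32); // capped at 24 -> 32
      // Dynamic bounds take the first branch above: roundUpToPow2(maxSize, true).
      return 0;
    }
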
@@ -580,6 +622,11 @@
parallelSizes.end());
SmallVector<int64_t> origReductionSizes(reductionSizes.begin(),
reductionSizes.end());
+ // Masking doesn't need any dim set to 1.
+ if (vecPreProcStrategy == VectorPreProcStrategy::Masking) {
+ return;
+ }
+
setAlwaysVectorizeSizes(op, parallelSizes, reductionSizes);
// If peeling is enabled and the 'op' is fully dynamic, we only vectorize the
@@ -763,11 +810,14 @@
SmallVector<int64_t> parallelTileSizes;
for (auto [index, tileSize] : llvm::enumerate(workgroupTileSizes)) {
int64_t sz = tileSize;
+ bool allowIncompleteTile =
+ vecPreProcStrategy == VectorPreProcStrategy::Peeling ||
+ vecPreProcStrategy == VectorPreProcStrategy::Masking;
+
if (sz != 0) {
- sz = getMaxTileSize(/*lb=*/0, /*ub=*/shape[index],
- /*maxTileSize=*/sz, vectorSize,
- /*allowIncompleteTile=*/vecPreProcStrategy ==
- VectorPreProcStrategy::Peeling);
+ sz = getMaxTileSize(
+ /*lb=*/0, /*ub=*/shape[index],
+ /*maxTileSize=*/sz, vectorSize, allowIncompleteTile);
}
parallelTileSizes.push_back(sz);
}
@@ -940,11 +990,12 @@
// works for linalg.matmul cases. We can relax it once we have better
// scheduling, e.g., transform dialect.
SmallVector<int64_t> flowTileSizes;
- auto preProcStrategy = getVectorPreProcStrategy(linalgOp);
- bool usePaddingPipeline = preProcStrategy == VectorPreProcStrategy::Padding;
+ auto vecPreProcStrategy = getVectorPreProcStrategy(linalgOp);
+ bool usePaddingPipeline =
+ vecPreProcStrategy == VectorPreProcStrategy::Padding;
- LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: " << preProcStrategy
- << "\n");
+ LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+ << vecPreProcStrategy << "\n");
if (usePaddingPipeline) {
// It's inspired from Sandbox configuration. Sandbox has
@@ -989,11 +1040,11 @@
if (isNoPadMultiTilingBeneficial(contractionOp, tripleTileSizes)) {
return setMatmulNoPadRootConfig(entryPointFn, contractionOp,
tripleTileSizes, vectorSize,
- preProcStrategy);
+ vecPreProcStrategy);
} // else fall back to the default configuration.
}
return setMatmulNoPadRootConfig(entryPointFn, contractionOp, tileSizes,
- vectorSize, preProcStrategy);
+ vectorSize, vecPreProcStrategy);
}
/// Sets the lowering configuration for dispatch region for linalg.mmt4d root
@@ -1135,7 +1186,7 @@
static void setX86WorkgroupTileSizes(
linalg::GenericOp genericOp, unsigned numLoops,
ArrayRef<int64_t> flowTileSizes, ArrayRef<int64_t> minTileSizes,
- ArrayRef<int64_t> maxTileSizes,
+ ArrayRef<int64_t> maxTileSizes, VectorPreProcStrategy vecPreProcStrategy,
SmallVectorImpl<int64_t> &workgroupTileSizes) {
workgroupTileSizes.append(numLoops, 0);
SmallVector<int64_t, 4> staticLoopRanges = genericOp.getStaticLoopRanges();
@@ -1143,7 +1194,9 @@
if (flowTileSizes[loopNum]) {
workgroupTileSizes[loopNum] =
getMaxTileSize(0, flowTileSizes[loopNum], minTileSizes[loopNum],
- minTileSizes[loopNum]);
+ minTileSizes[loopNum], /*allowIncompleteTile=*/false,
+ /*enforcePowerOfTwo=*/vecPreProcStrategy ==
+ VectorPreProcStrategy::Masking);
} else {
// If the flow level tile size is zero, and static loop range is 0 as
// well, set the tile sizes here to zero as well.
@@ -1191,6 +1244,8 @@
return success();
}
+ LLVM_DEBUG(KD_DBGS() << "Setting default generic op root configuration\n");
+
// If there are no loops, there is nothing to do.
unsigned numLoops = genericOp.getNumLoops();
if (numLoops == 0) {
@@ -1205,21 +1260,34 @@
// allocation limit See #9469 for example.
SmallVector<int64_t> maxTileSizes(numLoops, defaultWorkgroupTileSize / 2);
+ LLVM_DEBUG(KD_DBGS() << "Min tile sizes for distribution: " << minTileSizes
+ << "\n");
+ LLVM_DEBUG(KD_DBGS() << "Max tile sizes for distribution: " << maxTileSizes
+ << "\n");
+
// Set the flow level tiling to the default.
SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
genericOp, minTileSizes, maxTileSizes);
+ LLVM_DEBUG(KD_DBGS() << "Final tile sizes for distribution: " << flowTileSizes
+ << "\n");
+
+ auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+ LLVM_DEBUG(KD_DBGS() << "Vectorization pre-processing strategy "
+ << vecPreProcStrategy << "\n");
+
// Set the next level tile sizes.
SmallVector<int64_t> parallelTileSizes;
SmallVector<int64_t> reductionTileSizes;
setX86WorkgroupTileSizes(genericOp, numLoops, flowTileSizes, minTileSizes,
- maxTileSizes, parallelTileSizes);
+ maxTileSizes, vecPreProcStrategy, parallelTileSizes);
splitParallelAndReductionTiles(genericOp, parallelTileSizes,
reductionTileSizes);
- auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
- setVectorSizesForDynamicShapes(genericOp, vecPreProcStrategy,
- parallelTileSizes, reductionTileSizes);
+ LLVM_DEBUG(KD_DBGS() << "Vectorization/unrolling tile sizes (parallel): "
+ << parallelTileSizes << "\n");
+ LLVM_DEBUG(KD_DBGS() << "Vectorization/unrolling tile sizes (reduction): "
+ << reductionTileSizes << "\n");
TileSizesListType tileSizes;
tileSizes.push_back(flowTileSizes);
@@ -1309,10 +1377,14 @@
SmallVector<int64_t> flowTileSizes = getDefaultDistributedLevelTileSizes(
genericOp, minTileSizes, maxTileSizes);
+ auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+ LLVM_DEBUG(KD_DBGS() << "Vectorization pre-processing strategy "
+ << vecPreProcStrategy << "\n");
+
// Set the next level tile sizes.
SmallVector<int64_t> parallelTileSizes;
setX86WorkgroupTileSizes(genericOp, numLoops, flowTileSizes, minTileSizes,
- maxTileSizes, parallelTileSizes);
+ maxTileSizes, vecPreProcStrategy, parallelTileSizes);
TileSizesListType tileSizes;
tileSizes.push_back(flowTileSizes);
@@ -1351,6 +1423,9 @@
getDefaultDistributedLevelTileSizes(genericOp, minTileSizes, maxTileSizes,
/*allowIncompleteTile=*/true);
+ // TODO(dcaballe): The logic below is disconnected from the main tile size
+ // selection logic in getMaxTileSize. We should either port it there or remove
+ // it.
// Adjust the number of workload per workgroup to at least 4096. This
// prevents the runtime overheads dominating the execution time. The number is
// derived from experiments. We should be able to make it related to target.
@@ -1378,6 +1453,10 @@
flowTileSizes[currDim] = newSize;
}
+ auto vecPreProcStrategy = getVectorPreProcStrategy(genericOp);
+ LLVM_DEBUG(KD_DBGS() << "Vector pre-processing strategy: "
+ << vecPreProcStrategy << "\n");
+
// Adjust tiling sizes of vector levels to avoid large unroll factors. Most of
// the cases are f32 and i32, so we divide it by 4.
auto nativeVecSize = getNativeVectorSizeInBytes(entryPointFn);
@@ -1385,7 +1464,10 @@
nativeVecSize ? nativeVecSize.value() : clNativeVectorSizeInBytes;
vecSize /= 4;
SmallVector<int64_t> vecTileSizes(minTileSizes.begin(), minTileSizes.end());
- for (auto &i : vecTileSizes) i = std::min(i, vecSize);
+ for (auto &i : vecTileSizes) {
+ i = roundUpToPow2(std::min(i, vecSize),
+ vecPreProcStrategy == VectorPreProcStrategy::Masking);
+ }
// Setting reduction tile sizes is a workaround to kick in peeling transform.
// The tiling won't happen because the sizes are zeros.
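
  As a concrete instance of the adjustment above, here is a sketch of what the loop computes for one tile-size entry, assuming a 16-byte native vector and 4-byte f32/i32 elements; the helper name is illustrative:

    #include "llvm/Support/MathExtras.h"
    #include <algorithm>
    #include <cstdint>

    // Illustrative only: mirrors the per-entry adjustment above for a 16-byte
    // native vector, i.e. vecSize = 16 / 4 = 4 lanes of f32/i32.
    int64_t adjustVecTileSize(int64_t minTileSize, bool maskingEnabled) {
      int64_t vecSize = 16 / 4;                       // bytes -> lanes
      int64_t size = std::min(minTileSize, vecSize);  // avoid large unroll factors
      return maskingEnabled ? static_cast<int64_t>(llvm::PowerOf2Ceil(size)) : size;
    }
    // adjustVecTileSize(6, true)  == 4    adjustVecTileSize(3, true) == 4
    // adjustVecTileSize(3, false) == 3    adjustVecTileSize(4, true) == 4
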
@@ -1396,10 +1478,18 @@
tileSizes.push_back(vecTileSizes);
tileSizes.push_back(zeros);
- auto passPipeline =
- genericOp.hasTensorSemantics()
- ? DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert
- : DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
+ LLVM_DEBUG(KD_DBGS() << "Final tile sizes for element-wise op: " << tileSizes
+ << "\n");
+
+ DispatchLoweringPassPipeline passPipeline;
+ if (genericOp.hasBufferSemantics()) {
+ passPipeline = DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
+ } else if (vecPreProcStrategy == VectorPreProcStrategy::Peeling) {
+ passPipeline = DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert;
+ } else {
+ passPipeline = DispatchLoweringPassPipeline::CPUDoubleTilingExpert;
+ }
+
return setOpConfigAndEntryPointFnTranslation(entryPointFn, genericOp,
tileSizes, passPipeline);
}
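
  Since dynamic element-wise linalg.generic ops on x86 now take the Masking strategy instead of Peeling, tensor-semantics dispatches that used to pick CPUDoubleTilingPeelingExpert now fall through to CPUDoubleTilingExpert; the translation_info updates in the launch-configuration tests below follow directly from this. A compact sketch of the choice, restating the hunk above with the same enum values; the helper name is illustrative:

    // Restatement of the pipeline choice above, for reference only.
    static DispatchLoweringPassPipeline pickElementwisePipeline(
        bool hasBufferSemantics, VectorPreProcStrategy strategy) {
      if (hasBufferSemantics)
        return DispatchLoweringPassPipeline::CPUBufferOpsTileAndVectorize;
      if (strategy == VectorPreProcStrategy::Peeling)
        return DispatchLoweringPassPipeline::CPUDoubleTilingPeelingExpert;
      return DispatchLoweringPassPipeline::CPUDoubleTilingExpert;  // Masking or None
    }
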
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index 07fae72..b7e4ff3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -32,12 +32,13 @@
}
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [32, 0], [0, 16]]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
-// CHECK: hal.executable.export public @matvec_static
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.matvec
-// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 0], [32, 0], [0, 16]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPadExpert>
+// CHECK: hal.executable.export public @matvec_static
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.matvec
+// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
@@ -183,15 +184,15 @@
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
-hal.executable private @add {
+hal.executable private @dynamic_add {
hal.executable.variant @llvm, target = <"llvm-cpu", "embedded-elf-x86_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 16 : index,
target_triple = "x86_64-unknown-linux-gnu"
}> {
- hal.executable.export @add layout(#pipeline_layout)
+ hal.executable.export @dynamic_add layout(#pipeline_layout)
builtin.module {
- func.func @add() {
+ func.func @dynamic_add() {
%c0 = arith.constant 0 : index
%dim0 = hal.interface.constant.load[0] : index
%dim1 = hal.interface.constant.load[1] : index
@@ -218,8 +219,8 @@
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 4], [0, 0]]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
-// CHECK: hal.executable.export public @add
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+// CHECK: hal.executable.export public @dynamic_add
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: lowering_config = #[[CONFIG]]
@@ -276,7 +277,7 @@
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 64, 64, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @add4D
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
@@ -317,7 +318,7 @@
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 8, 16, 64], [1, 1, 1, 4], [0, 0, 0, 0]]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @add_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
@@ -544,8 +545,8 @@
}
}
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 1]{{\]}}>
-// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 4]{{\]}}>
+// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @outs_fusion_fn
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: func.func @outs_fusion_fn()
@@ -826,7 +827,7 @@
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[16, 96], [16, 16], [0, 0]]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @generic_static
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
@@ -1176,7 +1177,7 @@
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0, 0, 64, 64, 0, 64], [1, 1, 1, 1, 1, 1, 1, 4], [0, 0, 0, 0, 0, 0, 0, 0]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @generic_unit_dims_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
@@ -1264,7 +1265,7 @@
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0], [0], [4]{{\]}}>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingPeelingExpert>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: hal.executable.export public @reduce_to_scalar_dynamic
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
index 9824bc0..ce768c2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir
@@ -56,12 +56,12 @@
}
}
}
-// CHECK: func.func @check_no_cse()
-// CHECK-NOT: memref.alloc
-// CHECK: %[[FOR:.+]] = scf.for
-// CHECK: %[[DIVF:.+]] = arith.divf %[[FOR]]
-// CHECK: %[[RES:.+]] = vector.extract %[[DIVF]]
-// CHECK: memref.store %[[RES]]
+// CHECK-LABEL: func.func @check_no_cse()
+// CHECK-NOT: memref.alloc
+// CHECK: %[[FOR:.+]] = scf.for
+// CHECK: %[[DIVF:.+]] = arith.divf %[[FOR]]
+// CHECK: %[[RES:.+]] = vector.extract %[[DIVF]]
+// CHECK: memref.store %[[RES]]
// -----
@@ -106,9 +106,10 @@
}
}
}
-// CHECK: func.func @preset_config_matmul
-// CHECK: vector.outerproduct
-// HOIST-PAD: func.func @preset_config_matmul
+// CHECK-LABEL: func.func @preset_config_matmul
+// CHECK: vector.outerproduct
+
+// HOIST-PAD-LABEL: func.func @preset_config_matmul
// HOIST-PAD-DAG: %[[BUF1:.+]] = memref.alloca() {{.+}} : memref<3x4x16x32xf32>
// HOIST-PAD-DAG: %[[BUF2:.+]] = memref.alloca() {{.+}} : memref<4x8x16xf32>
// HOIST-PAD-16-DAG: vector.store {{.+}}, %[[BUF1]]
@@ -167,8 +168,8 @@
}
}
}
-// CHECK: func.func @batch_matmul_dynamic
-// CHECK: vector.outerproduct
+// CHECK-LABEL: func.func @batch_matmul_dynamic
+// CHECK: vector.outerproduct
// -----
@@ -208,10 +209,10 @@
}
}
}
-// CHECK: #{{.+}} = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize
-// CHECK: func.func @check_buffer_ops_vectorization
-// CHECK: vector.load
-// CHECK-NEXT: vector.store
+// CHECK-LABEL: #{{.+}} = #iree_codegen.translation_info<CPUBufferOpsTileAndVectorize
+// CHECK: func.func @check_buffer_ops_vectorization
+// CHECK: vector.load
+// CHECK-NEXT: vector.store
// -----
@@ -276,12 +277,12 @@
}
}
-// CHECK: func.func @vectorize_fill_conv2d_generic
-// CHECK-NOT: memref.alloca
-// CHECK-NOT: linalg.fill
-// CHECK: vector.outerproduct %{{.+}}, %{{.+}}, %{{.+}} {kind = #vector.kind<add>}
-// CHECK-NOT: linalg.generic
-// CHECK: arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x8xf32>
+// CHECK-LABEL: func.func @vectorize_fill_conv2d_generic
+// CHECK-NOT: memref.alloca
+// CHECK-NOT: linalg.fill
+// CHECK: vector.outerproduct %{{.+}}, %{{.+}}, %{{.+}} {kind = #vector.kind<add>}
+// CHECK-NOT: linalg.generic
+// CHECK: arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x8xf32>
// -----
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
index 37e6ffd..37afde9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/split_reduction_pipeline_tests.mlir
@@ -39,18 +39,18 @@
}
}
-// CHECK: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported()
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
-// CHECK: scf.for
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported()
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
// CHECK: scf.for
// CHECK: scf.for
-// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK: %[[RES:.+]] = arith.addi
-// CHECK: scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
-// CHECK: arith.addi %{{.+}}, %{{.+}} : vector<1x4xi32>
+// CHECK: scf.for
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK: %[[RES:.+]] = arith.addi
+// CHECK: scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK: arith.addi %{{.+}}, %{{.+}} : vector<1x4xi32>
// -----
@@ -92,21 +92,21 @@
}
}
-// CHECK: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
-// CHECK-NOT: scf.yield %{{.+}} : vector<1x1x4xf32>
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
+// CHECK-NOT: scf.yield %{{.+}} : vector<1x1x4xf32>
-// REORDERCHECK: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
-// REORDERCHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// REORDERCHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// REORDERCHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
-// REORDERCHECK: scf.for
+// REORDERCHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag()
+// REORDERCHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// REORDERCHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// REORDERCHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
// REORDERCHECK: scf.for
// REORDERCHECK: scf.for
-// REORDERCHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// REORDERCHECK: %[[RES:.+]] = arith.addf
-// REORDERCHECK: scf.yield %[[RES]] : vector<1x1x4xf32>
-// REORDERCHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xf32> into f32
-// REORDERCHECK: arith.addf %{{.+}}, %{{.+}} : vector<1x4xf32>
+// REORDERCHECK: scf.for
+// REORDERCHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// REORDERCHECK: %[[RES:.+]] = arith.addf
+// REORDERCHECK: scf.yield %[[RES]] : vector<1x1x4xf32>
+// REORDERCHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xf32> into f32
+// REORDERCHECK: arith.addf %{{.+}}, %{{.+}} : vector<1x4xf32>
// -----
@@ -144,17 +144,17 @@
}
}
-// CHECK: func.func @split_reduction_innermost_reduction_next_dynamic_supported()
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK: %[[RES:.+]] = arith.addi
-// CHECK: scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_next_dynamic_supported()
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK: %[[RES:.+]] = arith.addi
+// CHECK: scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
// -----
@@ -188,17 +188,17 @@
}
}
-// CHECK: func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported()
-// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for
-// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
-// CHECK: %[[RES:.+]] = arith.addi
-// CHECK: scf.yield %[[RES]] : vector<1x1x4xi32>
-// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
+// CHECK-LABEL: func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported()
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.for
+// CHECK: scf.for %[[ARG3:.+]] = %[[C0]] to %[[C64]] step %[[C1]]
+// CHECK: %[[RES:.+]] = arith.addi
+// CHECK: scf.yield %[[RES]] : vector<1x1x4xi32>
+// CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32
// -----
@@ -234,9 +234,8 @@
}
}
-// CHECK: func.func @split_reduction_innermost_dynamic_reduction_unsupported()
-// CHECK-NOT: scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT: vector.reduction
+// CHECK-LABEL: func.func @split_reduction_innermost_dynamic_reduction_unsupported()
+// CHECK-COUNT-4: vector.mask %{{.*}} { vector.reduction <add>
// -----
@@ -270,9 +269,8 @@
}
}
-// CHECK: func.func @split_reduction_innermost_imperfect_reduction_unsupported()
-// CHECK-NOT: scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT: vector.reduction
+// CHECK-LABEL: func.func @split_reduction_innermost_imperfect_reduction_unsupported()
+// CHECK-COUNT-4: vector.mask %{{.*}} { vector.reduction <add>
// -----
@@ -306,9 +304,9 @@
}
}
-// CHECK: func.func @split_reduction_not_innermost_reduction_unsupported()
-// CHECK-NOT: scf.yield %{{.+}} : vector<1x1x4xi32>
-// CHECK-NOT: vector.reduction
+// CHECK-LABEL: func.func @split_reduction_not_innermost_reduction_unsupported()
+// CHECK-NOT: scf.yield %{{.+}} : vector<1x1x4xi32>
+// CHECK-NOT: vector.reduction
// -----
@@ -342,6 +340,6 @@
}
}
-// CHECK: func.func @split_reduction_double_reduction_unsupported()
-// CHECK: vector.insertelement %{{.+}}, %{{.+}} : vector<4xi32>
-// CHECK-NOT: vector.insertelement %{{.+}}, %{{.+}} : vector<1xi32>
+// CHECK-LABEL: func.func @split_reduction_double_reduction_unsupported()
+// CHECK: vector.insertelement %{{.+}}, %{{.+}} : vector<4xi32>
+// CHECK-NOT: vector.insertelement %{{.+}}, %{{.+}} : vector<1xi32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
index 4802836..7ae64ee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
@@ -55,17 +55,68 @@
}
}
-// Masking is not applied to the main vector loop when the peeling is used.
+// Masking is applied to the main vector loop when peeling is not used.
// CHECK-LABEL: func.func @mask_dynamic_generic_add
// Main loop
-// CHECK: scf.for
-// CHECK: vector.load
-// CHECK: vector.load
-// CHECK: vector.store
-// Peel loop
-// CHECK: scf.for
-// CHECK: vector.maskedload
-// CHECK: vector.maskedload
-// CHECK: vector.maskedstore
+// CHECK: scf.for
+// CHECK-COUNT-2: vector.maskedload
+// CHECK: vector.maskedstore
+// No epilogue
+// CHECK-NOT: scf.for
+
+// -----
+
+#compilation = #iree_codegen.compilation_info<
+ lowering_config = <tile_sizes = [[127, 255], [8, 0], [0, 32]]>,
+ translation_info = <CPUDoubleTilingExpert>,
+ workgroup_size = []>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
+hal.executable private @preset_config_reduction {
+ hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+ hal.executable.export @mask_dynamic_reduction layout(#pipeline_layout) {
+ ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
+ %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+ hal.return %x, %y, %z : index, index, index
+ }
+ builtin.module {
+ func.func @mask_dynamic_reduction() {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load[0] : i32
+ %1 = hal.interface.constant.load[1] : i32
+ %6 = arith.index_cast %0 : i32 to index
+ %7 = arith.index_cast %1 : i32 to index
+ %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+ : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+ %result_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+ : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
+ %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+ : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+ %init = tensor.empty(%6) : tensor<?xf32>
+ %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?xf32>) -> tensor<?xf32>
+ %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"]}
+ ins(%lhs : tensor<?x?xf32>) outs(%fill : tensor<?xf32>) {
+ ^bb0(%in0: f32, %out: f32):
+ %add = arith.addf %out, %in0 : f32
+ linalg.yield %add: f32
+ } -> tensor<?xf32>
+ flow.dispatch.tensor.store %generic, %result_binding, offsets = [0], sizes = [%6], strides = [1]
+ : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%6}
+ return
+ }
+ }
+ }
+}
+
+// CHECK-LABEL: func.func @mask_dynamic_reduction
+// CHECK-COUNT-5: vector.maskedload
+// CHECK-COUNT-4: vector.mask %{{.*}} { vector.reduction <add>
+// CHECK: vector.maskedstore
diff --git a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
index 4d021c1..1c890f0 100644
--- a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
+++ b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
@@ -43,19 +43,46 @@
// IREE specific functions
//===----------------------------------------------------------------------===//
-/// Returns the op that contains lowering config. Returns failure if there are
-/// multiple op having lowering config.
-static FailureOr<Operation *> getRootOp(func::FuncOp funcOp) {
+/// Returns the op that carries the lowering config. If the provided op carries
+/// a lowering config, returns it directly. Otherwise, walks the enclosing
+/// function looking for one. If multiple ops carry the same lowering config,
+/// returns the first one found. Returns failure if multiple ops carry
+/// different lowering configs.
+static FailureOr<Operation *> getRootOp(Operation *op) {
+ // Check for self first.
+ if (iree_compiler::getLoweringConfig(op)) {
+ return op;
+ }
+
+ // Get the function op.
+ auto funcOp = dyn_cast<func::FuncOp>(op);
+ if (!funcOp) {
+ funcOp = op->getParentOfType<func::FuncOp>();
+ }
+
+ assert(funcOp && "Missing funcOp");
+
Operation *rootOp = nullptr;
+ mlir::iree_compiler::IREE::Codegen::LoweringConfigAttr rootLoweringConfig;
auto result = funcOp.walk([&](Operation *op) -> WalkResult {
- if (!iree_compiler::getLoweringConfig(op)) return WalkResult::advance();
- if (rootOp) {
- return WalkResult::interrupt();
+ auto loweringConfig = iree_compiler::getLoweringConfig(op);
+ if (!loweringConfig) {
+ return WalkResult::advance();
}
- rootOp = op;
+ if (rootLoweringConfig) {
+ if (rootLoweringConfig != loweringConfig) {
+ return WalkResult::interrupt();
+ }
+ } else {
+ rootOp = op;
+ rootLoweringConfig = loweringConfig;
+ }
return WalkResult::advance();
});
- if (!rootOp || result.wasInterrupted()) return failure();
+
+ if (!rootOp || result.wasInterrupted()) {
+ return failure();
+ }
return rootOp;
}
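
  A short usage sketch of the relaxed lookup (hypothetical caller shown; the real caller is getVectorSizes below, which now passes the linalg op directly):

    // Works whether `linalgOp` carries the lowering_config itself, shares an
    // identical config with the configured root, or merely lives in the same
    // func.func as it.
    FailureOr<Operation *> root = getRootOp(linalgOp);
    if (failed(root)) {
      return {};  // no config found, or conflicting configs in the function
    }
    auto loweringConfig = iree_compiler::getLoweringConfig(*root);
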
@@ -141,8 +168,7 @@
// particular linalg op within that dispatch.
static SmallVector<int64_t> getVectorSizes(
linalg::LinalgOp linalgOp, ArrayRef<int64_t> canonicalVectorShape) {
- FailureOr<Operation *> rootOp =
- getRootOp(linalgOp->getParentOfType<func::FuncOp>());
+ FailureOr<Operation *> rootOp = getRootOp(linalgOp);
if (failed(rootOp)) {
return {};
}
@@ -152,21 +178,6 @@
return {};
}
- // TODO: Support masking for static shapes.
- if (llvm::any_of(linalgOp.getStaticLoopRanges(), [](int64_t dimSize) {
- return !ShapedType::isDynamicShape(dimSize) && dimSize != 1;
- })) {
- return {};
- }
-
- // TODO: Support masking for reduction.
- if (llvm::any_of(linalgOp.getIteratorTypesArray(),
- [](utils::IteratorType iter) {
- return !linalg::isParallelIterator(iter);
- })) {
- return {};
- }
-
if (canonicalVectorShape.empty()) {
return {};
}
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
index a205824..e96141f 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_lbeta.run
@@ -1,5 +1,4 @@
-# TODO(#11923): Enable after fixing.
-# XFAIL: *
+# TODO(#11923): If this test fails again, check #11923.
# REQUIRES: llvmcpu
# RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=lbeta --artifacts_dir=%t
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
index eee6dbb..0753eb1 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_log_softmax.run
@@ -1,4 +1,3 @@
# REQUIRES: llvmcpu
-# TODO(#11196): Re-enable once the issue is resolved.
-# XFAIL: *
+# TODO(#11196): If this test fails again, check #11196.
# RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=log_softmax --artifacts_dir=%t
diff --git a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
index 0f0cde2..092b784 100644
--- a/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
+++ b/integrations/tensorflow/test/iree_tf_tests/math/llvmcpu__dynamic_dim_softmax.run
@@ -1,4 +1,3 @@
# REQUIRES: llvmcpu
-# TODO(#11196): Re-enable once the issue is resolved.
-# XFAIL: *
+# TODO(#11196): If this test fails again, check #11196.
# RUN: %PYTHON -m iree_tf_tests.math.math_test --target_backends=iree_llvmcpu --dynamic_dims=true --functions=softmax --artifacts_dir=%t