[CPU] Make VectorPreProcStrategy consider undefined behaviors (#18146)
The vectorization pass should not introduce extra undefined behavior.
In some cases, the masking strategy can cause a divide-by-zero
exception, because masked loads by default fill the lanes where the
mask is false with zero.
This patch addresses the issue by falling back to loop peeling whenever
the masking strategy could introduce extra undefined behavior.
Signed-off-by: Alan Li <me@alanli.org>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 3db67b8..f475657 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -219,6 +219,20 @@
return VectorPreProcStrategy::None;
}
+ // Walk the linalgOp body; if any operation could result in undefined
+ // behavior under the masking strategy, fall back to the peeling strategy.
+ bool usePeelingStrategy = false;
+ linalgOp.walk([&](Operation *op) -> WalkResult {
+ if (mlir::iree_compiler::mayHaveUndefinedBehaviorInMasking(op)) {
+ usePeelingStrategy = true;
+ return WalkResult::interrupt();
+ }
+ return WalkResult::advance();
+ });
+ if (usePeelingStrategy) {
+ return VectorPreProcStrategy::Peeling;
+ }
+
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(linalgOp);
bool isLinalgGeneric = isa<linalg::GenericOp>(linalgOp.getOperation());
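Note on the design: the walk interrupts at the first offending op, so
the scan stops early in the common case. As a hedged alternative (a
sketch only, assuming MLIR's `WalkResult::wasInterrupted()`; the
explicit flag in the patch reads just as well), the flag could be
folded into the walk result:

  // Sketch: since KernelDispatch.cpp is already inside namespace
  // mlir::iree_compiler, the unqualified call resolves to the helper
  // added in Utils.cpp below.
  if (linalgOp
          .walk([](Operation *op) {
            return mayHaveUndefinedBehaviorInMasking(op)
                       ? WalkResult::interrupt()
                       : WalkResult::advance();
          })
          .wasInterrupted()) {
    return VectorPreProcStrategy::Peeling;
  }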
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
index ef59e79..65bdff5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
@@ -93,4 +93,16 @@
return true;
}
+bool mayHaveUndefinedBehaviorInMasking(Operation *op) {
+ // These operations lower to division or related instructions, which can
+ // trap with divide-by-zero when an operand lane is zero-filled by masking.
+ if (isa<mlir::arith::RemSIOp, mlir::arith::RemUIOp, mlir::arith::DivSIOp,
+ mlir::arith::DivUIOp, mlir::arith::CeilDivSIOp,
+ mlir::arith::CeilDivUIOp, mlir::arith::FloorDivSIOp,
+ mlir::arith::DivFOp, mlir::arith::RemFOp>(op)) {
+ return true;
+ }
+ return false;
+}
+
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
index a674306..1308f87 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
@@ -51,6 +51,9 @@
/// argument corresponding to the input.
bool isLinalgGeneric2DTranspose(linalg::GenericOp genericOp);
+/// Returns true if the op could result in undefined behavior when vectorized with masking.
+bool mayHaveUndefinedBehaviorInMasking(Operation *op);
+
} // namespace mlir::iree_compiler
#endif // IREE_COMPILER_CODEGEN_LLVMCPU_UTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 15183ef..2e21413 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1904,3 +1904,34 @@
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+
+// Test scenario: when the trip count is not a multiple of the vector size, masked vectorization leaves inactive,
+// zero-filled lanes in the vectorized loads/stores, which can trigger undefined behavior such as divide-by-zero.
+// To avoid this, the masking strategy must not be chosen when the vectorized operation may have undefined behavior.
+// Here `arith.remsi` could divide by zero under masking, since the loop size (6) is not a multiple of the vector size (see the peeled-loop sketch after this test).
+
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {native_vector_size = 16}>
+module {
+ func.func @test_mod_vectorizing_strategy_peeling() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<6xi32>>
+ %1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<6xi32>>
+ %2 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<6xi32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi32>> -> tensor<6xi32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi32>> -> tensor<6xi32>
+ %5 = tensor.empty() : tensor<6xi32>
+ %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<6xi32>, tensor<6xi32>) outs(%5 : tensor<6xi32>) {
+ ^bb0(%in: i32, %in_0: i32, %out: i32):
+ %7 = arith.remsi %in, %in_0 : i32
+ linalg.yield %7 : i32
+ } -> tensor<6xi32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [6], strides = [1] : tensor<6xi32> -> !flow.dispatch.tensor<writeonly:tensor<6xi32>>
+ return
+ }
+}
+
+// CHECK: #translation = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>
+// CHECK-LABEL: @test_mod_vectorizing_strategy_peeling
+// CHECK-SAME: attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation}
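For contrast with the masked form, here is a hedged C++ sketch of the
loop shape the peeling strategy produces for the test above
(illustrative only; the actual epilogue is generated by the peeling
pipeline, and the function name and signature are made up):

  // Peeling splits the 6-iteration loop into a main part over the
  // largest multiple of the vector width plus a scalar remainder, so
  // every executed remainder reads a real divisor and no lane is ever
  // zero-filled.
  void remsiPeeled(const int *lhs, const int *rhs, int *out, int n,
                   int width) {
    int vecEnd = n - n % width; // e.g. 4 when n == 6 and width == 4
    for (int i = 0; i < vecEnd; i += width)
      for (int lane = 0; lane < width; ++lane) // full vector iteration
        out[i + lane] = lhs[i + lane] % rhs[i + lane];
    for (int i = vecEnd; i < n; ++i) // peeled scalar epilogue
      out[i] = lhs[i] % rhs[i];
  }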