[CPU] Make VectorPreProcStrategy consider undefined behaviors (#18146)
The vectorization pass should not introduce extra undefined behavior.
In some cases, the masking strategy can cause a divide-by-zero
exception, because masked loads by default fill the lanes where the
mask is false with zero.
This patch addresses the issue by falling back to loop peeling whenever
the masking strategy could introduce extra undefined behavior.
Signed-off-by: Alan Li <me@alanli.org>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 3db67b8..f475657 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -219,6 +219,20 @@
return VectorPreProcStrategy::None;
}
+ // Walk the linalgOp body; if any operation could result in undefined
+ // behavior under the masking strategy, fall back to the peeling strategy.
+ bool usePeelingStrategy = false;
+ linalgOp.walk([&](Operation *op) -> WalkResult {
+ if (mlir::iree_compiler::mayHaveUndefinedBehaviorInMasking(op)) {
+ usePeelingStrategy = true;
+ return WalkResult::interrupt();
+ }
+ return WalkResult::advance();
+ });
+ if (usePeelingStrategy) {
+ return VectorPreProcStrategy::Peeling;
+ }
+
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(linalgOp);
bool isLinalgGeneric = isa<linalg::GenericOp>(linalgOp.getOperation());
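Note on the design: the walk interrupts at the first offending op, so
the scan stops early in the common case. As a hedged alternative (a
sketch only, assuming MLIR's `WalkResult::wasInterrupted()`; the
explicit flag in the patch reads just as well), the flag could be
folded into the walk result:

  // Sketch: since KernelDispatch.cpp is already inside namespace
  // mlir::iree_compiler, the unqualified call resolves to the helper
  // added in Utils.cpp below.
  if (linalgOp
          .walk([](Operation *op) {
            return mayHaveUndefinedBehaviorInMasking(op)
                       ? WalkResult::interrupt()
                       : WalkResult::advance();
          })
          .wasInterrupted()) {
    return VectorPreProcStrategy::Peeling;
  }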
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
index ef59e79..65bdff5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.cpp
@@ -93,4 +93,16 @@
return true;
}
+bool mayHaveUndefinedBehaviorInMasking(Operation *op) {
+ // These operations lower to division or related instructions, which can
+ // trap with divide-by-zero when an operand lane is zero-filled by masking.
+ if (isa<mlir::arith::RemSIOp, mlir::arith::RemUIOp, mlir::arith::DivSIOp,
+ mlir::arith::DivUIOp, mlir::arith::CeilDivSIOp,
+ mlir::arith::CeilDivUIOp, mlir::arith::FloorDivSIOp,
+ mlir::arith::DivFOp, mlir::arith::RemFOp>(op)) {
+ return true;
+ }
+ return false;
+}
+
} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
index a674306..1308f87 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Utils.h
@@ -51,6 +51,9 @@
/// argument corresponding to the input.
bool isLinalgGeneric2DTranspose(linalg::GenericOp genericOp);
+/// Returns true if the op could result in undefined behavior when vectorized with masking.
+bool mayHaveUndefinedBehaviorInMasking(Operation *op);
+
} // namespace mlir::iree_compiler
#endif // IREE_COMPILER_CODEGEN_LLVMCPU_UTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 15183ef..2e21413 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1904,3 +1904,34 @@
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+
+// Test scenario: when the trip count is not a multiple of the vector size, masked vectorization leaves inactive,
+// zero-filled lanes in the vectorized loads/stores, which can trigger undefined behavior such as divide-by-zero.
+// To avoid this, the masking strategy must not be chosen when the vectorized operation may have undefined behavior.
+// Here `arith.remsi` could divide by zero under masking, since the loop size (6) is not a multiple of the vector size (see the peeled-loop sketch after this test).
+
+#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {native_vector_size = 16}>
+module {
+ func.func @test_mod_vectorizing_strategy_peeling() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<6xi32>>
+ %1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<6xi32>>
+ %2 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<6xi32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi32>> -> tensor<6xi32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xi32>> -> tensor<6xi32>
+ %5 = tensor.empty() : tensor<6xi32>
+ %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<6xi32>, tensor<6xi32>) outs(%5 : tensor<6xi32>) {
+ ^bb0(%in: i32, %in_0: i32, %out: i32):
+ %7 = arith.remsi %in, %in_0 : i32
+ linalg.yield %7 : i32
+ } -> tensor<6xi32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [6], strides = [1] : tensor<6xi32> -> !flow.dispatch.tensor<writeonly:tensor<6xi32>>
+ return
+ }
+}
+
+// CHECK: #translation = #iree_codegen.translation_info<CPUDoubleTilingExpert, {enable_loop_peeling}>
+// CHECK-LABEL: @test_mod_vectorizing_strategy_peeling
+// CHECK-SAME: attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation}
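For contrast with the masked form, here is a hedged C++ sketch of the
loop shape the peeling strategy produces for the test above
(illustrative only; the actual epilogue is generated by the peeling
pipeline, and the function name and signature are made up):

  // Peeling splits the 6-iteration loop into a main part over the
  // largest multiple of the vector width plus a scalar remainder, so
  // every executed remainder reads a real divisor and no lane is ever
  // zero-filled.
  void remsiPeeled(const int *lhs, const int *rhs, int *out, int n,
                   int width) {
    int vecEnd = n - n % width; // e.g. 4 when n == 6 and width == 4
    for (int i = 0; i < vecEnd; i += width)
      for (int lane = 0; lane < width; ++lane) // full vector iteration
        out[i + lane] = lhs[i + lane] % rhs[i + lane];
    for (int i = vecEnd; i < n; ++i) // peeled scalar epilogue
      out[i] = lhs[i] % rhs[i];
  }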