[Codegen][CPU] Eliminate all-true vector masks after vectorization (#18190)
This enables an upstream transform that eliminates all-true
`vector.create_mask` ops. This is particularly beneficial for scalable
vectors, which use dynamic tensor types and therefore produce masks that
would otherwise not fold away until much later, blocking some
optimizations.
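For example (an illustrative sketch, not IR taken from this patch; names
such as %src and %pad are made up), a mask whose size equals the scalable
vector's runtime length is provably all-true and can now be dropped right
after vectorization:

  %c0 = arith.constant 0 : index
  %c4 = arith.constant 4 : index
  %vscale = vector.vscale
  %len = arith.muli %vscale, %c4 : index
  // The mask covers the whole vector<[4]xf32>, so it is all-true.
  %mask = vector.create_mask %len : vector<[4]xi1>
  %read = vector.transfer_read %src[%c0], %pad, %mask
            : tensor<?xf32>, vector<[4]xf32>
  // After mask elimination this becomes an unmasked transfer_read with
  // in_bounds = [true].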
Depends on llvm/llvm-project#99314.
---------
Signed-off-by: Benjamin Maxwell <benjamin.maxwell@arm.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
index ebe4ef2..0175f19 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GenericVectorization.cpp
@@ -325,6 +325,14 @@
void runOnOperation() override;
};
+/// Converts from iree_compiler::VscaleRange to vector::VscaleRange.
+static std::optional<vector::VscaleRange>
+toVectorVscaleRange(std::optional<iree_compiler::VscaleRange> vscaleRange) {
+ if (!vscaleRange.has_value())
+ return std::nullopt;
+ return vector::VscaleRange{vscaleRange->min, vscaleRange->max};
+}
+
void GenericVectorizationPass::runOnOperation() {
MLIRContext *context = &getContext();
auto funcOp = getOperation();
@@ -378,6 +386,17 @@
};
{
+ // Eliminate (all-true) vector masks as early as possible (to avoid missing
+ // optimizations/folds). This is particularly beneficial for scalable
+ // vectors that use dynamic tensor shapes.
+ auto targetAttr =
+ iree_compiler::IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+ auto vscaleRange = iree_compiler::getDefaultVscaleRange(targetAttr);
+ vector::eliminateVectorMasks(rewriter, funcOp,
+ toVectorVscaleRange(vscaleRange));
+ }
+
+ {
// Canonicalize mask related ops before we lower them.
RewritePatternSet maskCanonPatterns(funcOp.getContext());
vector::CreateMaskOp::getCanonicalizationPatterns(maskCanonPatterns,
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
index 2ce6186..38e0f5e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
@@ -445,3 +445,61 @@
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>
+
+// -----
+
+#aarch64_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", target_triple = "aarch64-none-elf"}>
+#config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0, 0], [1, 4, [4], 0], [0, 0, 0, 3], [0, 0, 0, 0]]>
+#map = affine_map<()[s0] -> (-(96 mod s0) + 96)>
+#map1 = affine_map<(d0) -> (d0 * 2)>
+
+func.func @depthwise_conv_fold_away_masking(%arg0: tensor<1x68x120x96xf32>, %arg1: tensor<1x137x241x96xf32>, %arg2: tensor<3x3x96xf32>) -> tensor<1x68x120x96xf32>
+ attributes {hal.executable.target = #aarch64_sve}
+{
+ %c3 = arith.constant 3 : index
+ %c120 = arith.constant 120 : index
+ %c68 = arith.constant 68 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %c0 = arith.constant 0 : index
+ %vscale = vector.vscale
+ %c4_vscale = arith.muli %vscale, %c4 : index
+ %0 = scf.for %arg3 = %c0 to %c68 step %c1 iter_args(%arg4 = %arg0) -> (tensor<1x68x120x96xf32>) {
+ %1 = scf.for %arg5 = %c0 to %c120 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x68x120x96xf32>) {
+ %2 = affine.apply #map()[%c4_vscale]
+ %3 = scf.for %arg7 = %c0 to %2 step %c4_vscale iter_args(%arg8 = %arg6) -> (tensor<1x68x120x96xf32>) {
+ %4 = affine.apply #map1(%arg3)
+ %5 = affine.apply #map1(%arg5)
+ %extracted_slice = tensor.extract_slice %arg1[0, %4, %5, %arg7] [1, 3, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x137x241x96xf32> to tensor<1x3x9x?xf32>
+ %extracted_slice_0 = tensor.extract_slice %arg2[0, 0, %arg7] [3, 3, %c4_vscale] [1, 1, 1] : tensor<3x3x96xf32> to tensor<3x3x?xf32>
+ %extracted_slice_1 = tensor.extract_slice %arg8[0, %arg3, %arg5, %arg7] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x68x120x96xf32> to tensor<1x1x4x?xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<1x1x4x?xf32>) -> tensor<1x1x4x?xf32>
+ %7 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %6) -> (tensor<1x1x4x?xf32>) {
+ %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg9, 0, 0] [1, 1, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x3x9x?xf32> to tensor<1x1x9x?xf32>
+ %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg9, 0, 0] [1, 3, %c4_vscale] [1, 1, 1] : tensor<3x3x?xf32> to tensor<1x3x?xf32>
+ %extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> to tensor<1x1x4x?xf32>
+ %extracted_slice_5 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 1, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x9x?xf32> to tensor<1x9x?xf32>
+ %extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, 0, 0] [1, 3, %c4_vscale] [1, 1, 1] : tensor<1x3x?xf32> to tensor<3x?xf32>
+ %extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> to tensor<1x4x?xf32>
+ %8 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, lowering_config = #config, strides = dense<2> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x9x?xf32>, tensor<3x?xf32>) outs(%extracted_slice_7 : tensor<1x4x?xf32>) -> tensor<1x4x?xf32>
+ %inserted_slice_8 = tensor.insert_slice %8 into %extracted_slice_4[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x4x?xf32> into tensor<1x1x4x?xf32>
+ %inserted_slice_9 = tensor.insert_slice %inserted_slice_8 into %arg10[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x1x4x?xf32>
+ scf.yield %inserted_slice_9 : tensor<1x1x4x?xf32>
+ }
+ %inserted_slice = tensor.insert_slice %7 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x68x120x96xf32>
+ scf.yield %inserted_slice : tensor<1x68x120x96xf32>
+ }
+ scf.yield %3 : tensor<1x68x120x96xf32>
+ }
+ scf.yield %1 : tensor<1x68x120x96xf32>
+ }
+ return %0 : tensor<1x68x120x96xf32>
+}
+
+/// This checks that the masks (introduced by the vectorizer) are eliminated by
+/// the end of the iree-codegen-generic-vectorization pass.
+
+// CHECK-MASK-LABEL: func.func @depthwise_conv_fold_away_masking
+// CHECK-MASK-NOT: vector.create_mask
+// CHECK-MASK-NOT: vector.constant_mask