Disable masking on AArch64 without SVE support (#12487)

Masking is a catch-all transformation that only requires vector sizes to
be provided to the vectorizer. This patch makes sure we don't pass vector
sizes to the vectorizer when we don't want to explicitly enable vector
masking (e.g., AArch64 without SVE support, at least for now).
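
For reference, the gating boils down to two decisions: which targets opt in
to masking, and forwarding canonical vector sizes to the vectorizer only when
masking is enabled. Below is a minimal, self-contained sketch of that logic;
TargetInfo, hasFeature, and vectorSizesForVectorizer are hypothetical
stand-ins for IREE's ExecutableTargetAttr utilities and the vectorizer
options, not the actual API.

  #include <cstdint>
  #include <string>
  #include <vector>

  // Hypothetical stand-in for IREE::HAL::ExecutableTargetAttr.
  struct TargetInfo {
    std::string arch;                      // e.g. "x86_64", "riscv32", "aarch64"
    std::vector<std::string> cpuFeatures;  // e.g. {"+sve"}
  };

  static bool hasFeature(const TargetInfo &target, const std::string &feature) {
    for (const std::string &f : target.cpuFeatures)
      if (f == feature) return true;
    return false;
  }

  // Mirrors the new hasAnySVEFeature() utility added in Utils.cpp.
  static bool hasAnySVEFeature(const TargetInfo &target) {
    return hasFeature(target, "+sve") || hasFeature(target, "+sve2");
  }

  // Mirrors the enableVectorMasking computation in
  // LLVMCPULowerExecutableTarget.cpp: AArch64 only opts in when SVE/SVE2 is
  // available.
  static bool enableVectorMasking(const TargetInfo &target) {
    bool isX86 = target.arch.rfind("x86", 0) == 0;
    bool isRISCV = target.arch.rfind("riscv", 0) == 0;
    bool isAArch64 = target.arch == "aarch64" || target.arch == "arm64";
    return isX86 || isRISCV || (isAArch64 && hasAnySVEFeature(target));
  }

  // Vector sizes are what trigger masked vectorization, so they are only
  // forwarded when masking is enabled (see LinalgTensorCodegenDriver.cpp and
  // Transforms.cpp below).
  static std::vector<int64_t> vectorSizesForVectorizer(
      const TargetInfo &target, const std::vector<int64_t> &canonicalSizes) {
    if (!enableVectorMasking(target)) return {};  // empty -> unmasked vectorization
    return canonicalSizes;
  }
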
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 2a91ba7..97b9acd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -233,7 +233,8 @@
 
   // Default AArch64 specific strategies.
   if (isAArch64(targetAttr)) {
-    if (isFullyDynamicOp(linalgOp) && enableVectorPeeling) {
+    if ((linalg::isElementwise(linalgOp) || isFullyDynamicOp(linalgOp)) &&
+        enableVectorPeeling) {
       return VectorPreProcStrategy::Peeling;
     }
   }
@@ -618,15 +619,15 @@
     linalg::LinalgOp op, VectorPreProcStrategy vecPreProcStrategy,
     SmallVectorImpl<int64_t> &parallelSizes,
     SmallVectorImpl<int64_t> &reductionSizes) {
-  SmallVector<int64_t> origParallelSizes(parallelSizes.begin(),
-                                         parallelSizes.end());
-  SmallVector<int64_t> origReductionSizes(reductionSizes.begin(),
-                                          reductionSizes.end());
   // Masking doesn't need any dim set to 1.
   if (vecPreProcStrategy == VectorPreProcStrategy::Masking) {
     return;
   }
 
+  SmallVector<int64_t> origParallelSizes(parallelSizes.begin(),
+                                         parallelSizes.end());
+  SmallVector<int64_t> origReductionSizes(reductionSizes.begin(),
+                                          reductionSizes.end());
   setAlwaysVectorizeSizes(op, parallelSizes, reductionSizes);
 
   // If peeling is enabled and the 'op' is fully dynamic, we only vectorize the
@@ -1283,6 +1284,8 @@
                            maxTileSizes, vecPreProcStrategy, parallelTileSizes);
   splitParallelAndReductionTiles(genericOp, parallelTileSizes,
                                  reductionTileSizes);
+  setVectorSizesForDynamicShapes(genericOp, vecPreProcStrategy,
+                                 parallelTileSizes, reductionTileSizes);
 
   LLVM_DEBUG(KD_DBGS() << "Vectorization/unrolling tile sizes (parallel): "
                        << parallelTileSizes << "\n");
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
index 503ace2..83fb8b2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -21,7 +21,6 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Pass/PassRegistry.h"
-#include "mlir/Transforms/Passes.h"
 
 namespace mlir {
 namespace iree_compiler {
@@ -186,6 +185,9 @@
 
       auto target = variantOp.getTarget();
       bool lowerToAVX2 = hasAVX2Feature(target);
+      bool enableVectorMasking =
+          isX86(target) || isRISCV(target) ||
+          (isAArch64(target) && hasAnySVEFeature(target));
       bool enableMicrokernels = hasMicrokernels(target);
       if (!testLoweringConfiguration) {
         switch (translationInfo.value().getDispatchLoweringPassPipeline()) {
@@ -195,40 +197,44 @@
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUBufferOpsTileAndVectorize:
-            addCPUBufferOpsTileAndVectorizePipeline(executableLoweringPipeline);
+            addCPUBufferOpsTileAndVectorizePipeline(executableLoweringPipeline,
+                                                    enableVectorMasking);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUDoubleTilingExpert:
             addMultiTilingExpertPassPipeline(
                 executableLoweringPipeline,
                 static_cast<int>(StrategyTilingLevel::NumStrategyTileLevels),
-                /*enablePeeling=*/false, lowerToAVX2);
+                /*enablePeeling=*/false, enableVectorMasking, lowerToAVX2);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUDoubleTilingPadExpert:
-            addDoubleTilingPadExpertPassPipeline(executableLoweringPipeline);
+            addDoubleTilingPadExpertPassPipeline(executableLoweringPipeline,
+                                                 enableVectorMasking);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUDoubleTilingPeelingExpert:
             addMultiTilingExpertPassPipeline(
                 executableLoweringPipeline,
                 static_cast<int>(StrategyTilingLevel::NumStrategyTileLevels),
-                /*enablePeeling=*/true, lowerToAVX2);
+                /*enablePeeling=*/true, enableVectorMasking, lowerToAVX2);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUTripleTilingExpert:
             addMultiTilingExpertPassPipeline(
                 executableLoweringPipeline,
                 static_cast<int>(TripleTilingLevel::NumTileLevels),
-                /*enablePeeling=*/false);
+                /*enablePeeling=*/false, enableVectorMasking,
+                /*lowerToAVX2=*/false);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::
               CPUConvTileAndDecomposeExpert:
             addConvTileAndDecomposeExpertPassPipeline(
-                executableLoweringPipeline);
+                executableLoweringPipeline, enableVectorMasking);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::Mmt4dTilingExpert:
-            addMmt4dTilingExpertPassPipeline(executableLoweringPipeline);
+            addMmt4dTilingExpertPassPipeline(executableLoweringPipeline,
+                                             enableVectorMasking);
             break;
           case IREE::Codegen::DispatchLoweringPassPipeline::CPUDataTiling:
             addCPUDataTilingPipeline(executableLoweringPipeline);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
index 13e241b..4dd5d88 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -10,7 +10,6 @@
 #include "iree-dialects/Dialect/LinalgTransform/Passes.h"
 #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
 #include "iree/compiler/Codegen/LLVMCPU/KernelDispatch.h"
-#include "iree/compiler/Codegen/PassDetail.h"
 #include "iree/compiler/Codegen/Sandbox/Passes.h"
 #include "iree/compiler/Codegen/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
@@ -20,7 +19,6 @@
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/Func/Transforms/Passes.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Pass/PassManager.h"
@@ -334,7 +332,8 @@
 // Codegen pipelines.
 //===---------------------------------------------------------------------===//
 
-void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager) {
+void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
+                                             bool enableVectorMasking) {
   addTileAndDistributePasses(passManager);
 
   OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -346,6 +345,7 @@
         static_cast<int64_t>(StrategyTilingLevel::ParallelTiles);
     options.peel = true;
     options.vectorize = true;
+    options.enableVectorMasking = enableVectorMasking;
     nestedModulePM.addNestedPass<func::FuncOp>(
         createLinalgSingleTilingExpertPass(options));
     nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
@@ -365,7 +365,8 @@
   }
 }
 
-void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager) {
+void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
+                                          bool enableVectorMasking) {
   addTileAndDistributePasses(passManager,
                              /*useFuseTensorPadWithConsumerPass=*/false);
 
@@ -436,6 +437,7 @@
   {
     LinalgSingleTilingExpertPassOptions options;
     options.vectorize = true;
+    options.enableVectorMasking = enableVectorMasking;
     options.vectorizePadding = true;
     nestedModulePM.addNestedPass<func::FuncOp>(
         createLinalgSingleTilingExpertPass(options));
@@ -488,6 +490,7 @@
 
 void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
                                       int64_t numLevels, bool enablePeeling,
+                                      bool enableVectorMasking,
                                       bool lowerToAVX2) {
   addTileAndDistributePasses(passManager);
 
@@ -528,6 +531,7 @@
     LinalgSingleTilingExpertPassOptions options;
     options.peel = enablePeeling;
     options.vectorize = true;
+    options.enableVectorMasking = enableVectorMasking;
     nestedModulePM.addNestedPass<func::FuncOp>(
         createLinalgSingleTilingExpertPass(options));
     nestedModulePM.addNestedPass<func::FuncOp>(createCanonicalizerPass());
@@ -550,7 +554,8 @@
   }
 }
 
-void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager) {
+void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
+                                               bool enableVectorMasking) {
   addTileAndDistributePasses(passManager);
 
   OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -594,6 +599,7 @@
   {
     LinalgSingleTilingExpertPassOptions options;
     options.vectorize = true;
+    options.enableVectorMasking = enableVectorMasking;
     options.vectorizePadding = true;
     nestedModulePM.addNestedPass<func::FuncOp>(
         createLinalgSingleTilingExpertPass(options));
@@ -620,7 +626,8 @@
   }
 }
 
-void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager) {
+void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
+                                      bool enableVectorMasking) {
   addTileAndDistributePasses(passManager);
 
   OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
@@ -654,6 +661,7 @@
   {
     LinalgSingleTilingExpertPassOptions options;
     options.vectorize = true;
+    options.enableVectorMasking = enableVectorMasking;
     nestedModulePM.addNestedPass<func::FuncOp>(
         createLinalgSingleTilingExpertPass(options));
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
index b7e4ff3..6d5c4d1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_x86_64_launch_configuration.mlir
@@ -338,7 +338,10 @@
   ]>
 ]>
 hal.executable private @preset_config_matmul_tensors  {
-  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
     hal.executable.export @preset_config layout(#pipeline_layout)
     builtin.module {
       func.func @preset_config() {
@@ -383,7 +386,10 @@
   ]>
 ]>
 hal.executable @copy_op_dynamic {
-  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
     hal.executable.export @copy_op_dynamic layout(#pipeline_layout)
     builtin.module {
       func.func @copy_op_dynamic() {
@@ -425,7 +431,10 @@
   ]>
 ]>
 hal.executable private @static_1d_fft_stage2  {
-  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
     hal.executable.export @static_1d_fft_stage2 layout(#pipeline_layout)
     builtin.module {
       func.func @static_1d_fft_stage2() {
@@ -463,7 +472,10 @@
   ]>
 ]>
 hal.executable private @static_3d_fft_stage3  {
-  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
     hal.executable.export @static_3d_fft_stage3 layout(#pipeline_layout)
     builtin.module {
       func.func @static_3d_fft_stage3() {
@@ -501,7 +513,10 @@
   ]>
 ]>
 hal.executable private @outs_fusion {
-  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
     hal.executable.export @outs_fusion_fn layout(#pipeline_layout)
     builtin.module {
       func.func @outs_fusion_fn() {
@@ -545,6 +560,7 @@
     }
   }
 }
+
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 32, 0], [1, 4, 0], [0, 0, 4]{{\]}}>
 //      CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.export public @outs_fusion_fn
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
index 7ae64ee..3d60213 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_masking.mlir
@@ -13,7 +13,11 @@
   ]>
 ]>
 hal.executable private @preset_config_generic_add  {
-  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 32 : index,
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
     hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
       %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
@@ -78,7 +82,11 @@
   ]>
 ]>
 hal.executable private @preset_config_reduction  {
-  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64"> {
+  hal.executable.variant @system_elf_x86_64, target = <"llvm-cpu", "system-elf-x86_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 32 : index,
+    target_triple = "x86_64-unknown-linux-gnu"
+  }> {
     hal.executable.export @mask_dynamic_reduction layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
       %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
@@ -120,3 +128,208 @@
 // CHECK-COUNT-4:   vector.mask %{{.*}} { vector.reduction <add>
 //         CHECK:   vector.maskedstore
 
+// -----
+
+#compilation = #iree_codegen.compilation_info<
+    lowering_config = <tile_sizes = [[127, 255], [8, 32], [0, 0]]>,
+    translation_info  = <CPUDoubleTilingExpert>,
+    workgroup_size = []>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @preset_config_generic_add  {
+  hal.executable.variant @embedded_elf_rv32, target = <"llvm-cpu", "embedded-elf-riscv_32", {
+      data_layout = "e-m:e-p:32:32-i64:64-n32-S128",
+      native_vector_size = 32 : index,
+      target_triple = "riscv32-unknown-unknown-eabi-elf"
+    }> {
+    hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @mask_dynamic_generic_add() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %6 = arith.index_cast %0 : i32 to index
+        %7 = arith.index_cast %1 : i32 to index
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+            : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %init = tensor.empty(%6, %7) : tensor<?x?xf32>
+        %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
+        %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0, d1)>],
+                                   iterator_types = ["parallel", "parallel"]}
+          ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) {
+            ^bb0(%in0: f32, %in1: f32, %out: f32):
+          %add = arith.addf %in0, %in1 : f32
+          linalg.yield %add: f32
+        } -> tensor<?x?xf32>
+        flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
+        return
+      }
+    }
+  }
+}
+
+// Masking is applied to the main vector loop when peeling is not used.
+
+// CHECK-LABEL: func.func @mask_dynamic_generic_add
+// Main loop
+//         CHECK: scf.for
+// CHECK-COUNT-2:   vector.maskedload
+//         CHECK:   vector.maskedstore
+// No epilogue
+//     CHECK-NOT: scf.for
+
+// -----
+
+#compilation = #iree_codegen.compilation_info<
+    lowering_config = <tile_sizes = [[127, 255], [8, 32], [0, 0]]>,
+    translation_info  = <CPUDoubleTilingExpert>,
+    workgroup_size = []>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @preset_config_generic_add  {
+  hal.executable.variant @embedded_elf_arm_64, target = <"llvm-cpu", "embedded-elf-arm_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 16 : index,
+    target_triple = "aarch64-unknown-unknown-eabi-elf"
+  }> {
+    hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @mask_dynamic_generic_add() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %6 = arith.index_cast %0 : i32 to index
+        %7 = arith.index_cast %1 : i32 to index
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+            : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %init = tensor.empty(%6, %7) : tensor<?x?xf32>
+        %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
+        %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0, d1)>],
+                                   iterator_types = ["parallel", "parallel"]}
+          ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) {
+            ^bb0(%in0: f32, %in1: f32, %out: f32):
+          %add = arith.addf %in0, %in1 : f32
+          linalg.yield %add: f32
+        } -> tensor<?x?xf32>
+        flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
+        return
+      }
+    }
+  }
+}
+
+// Masking should not happen on aarch64 if there is no SVE support.
+
+// CHECK-LABEL: func.func @mask_dynamic_generic_add
+//   CHECK-NOT:   vector.maskedload
+
+// -----
+
+#compilation = #iree_codegen.compilation_info<
+    lowering_config = <tile_sizes = [[127, 255], [8, 32], [0, 0]]>,
+    translation_info  = <CPUDoubleTilingExpert>,
+    workgroup_size = []>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @preset_config_generic_add  {
+  hal.executable.variant @embedded_elf_arm_64, target = <"llvm-cpu", "embedded-elf-arm_64", {
+    cpu_features = "+sve",
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    native_vector_size = 16 : index,
+    target_triple = "aarch64-unknown-unknown-eabi-elf"
+  }> {
+    hal.executable.export @mask_dynamic_generic_add layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @mask_dynamic_generic_add() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %0 = hal.interface.constant.load[0] : i32
+        %1 = hal.interface.constant.load[1] : i32
+        %6 = arith.index_cast %0 : i32 to index
+        %7 = arith.index_cast %1 : i32 to index
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7}
+        %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+            : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%6, %7} -> tensor<?x?xf32>
+        %init = tensor.empty(%6, %7) : tensor<?x?xf32>
+        %fill = linalg.fill ins(%cst : f32) outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
+        %generic = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0, d1)>,
+                                                    affine_map<(d0, d1) -> (d0, d1)>],
+                                   iterator_types = ["parallel", "parallel"]}
+          ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>) outs(%fill : tensor<?x?xf32>) {
+            ^bb0(%in0: f32, %in1: f32, %out: f32):
+          %add = arith.addf %in0, %in1 : f32
+          linalg.yield %add: f32
+        } -> tensor<?x?xf32>
+        flow.dispatch.tensor.store %generic, %result_binding, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1]
+            : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%6, %7}
+        return
+      }
+    }
+  }
+}
+
+// Masking is applied to the peeled loop on aarch64 when SVE is enabled.
+
+// CHECK-LABEL: func.func @mask_dynamic_generic_add
+// Main loop
+//         CHECK: scf.for
+// Peeled loop:
+//         CHECK: scf.for
+// CHECK-COUNT-2:   vector.maskedload
+//         CHECK:   vector.maskedstore
diff --git a/compiler/src/iree/compiler/Codegen/Passes.h b/compiler/src/iree/compiler/Codegen/Passes.h
index 91de123..4401315 100644
--- a/compiler/src/iree/compiler/Codegen/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Passes.h
@@ -357,7 +357,8 @@
 /// Populates the passes to lower linalg ops on buffers. Currently this
 /// pipeline is only used for dispatches that just copy data from input
 /// interfaces to output interface.
-void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager);
+void addCPUBufferOpsTileAndVectorizePipeline(OpPassManager &passManager,
+                                             bool enableVectorMasking);
 
 /// Populates the passes needed to multi-level tile and lower linalg ops on
 /// tensors to vector operations.
@@ -376,8 +377,10 @@
     ArrayRef<int64_t> workgroupSize = {});
 void addMultiTilingExpertPassPipeline(OpPassManager &passManager,
                                       int64_t numLevels, bool enablePeeling,
-                                      bool lowerToAVX2 = false);
-void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager);
+                                      bool enableVectorMasking,
+                                      bool lowerToAVX2);
+void addDoubleTilingPadExpertPassPipeline(OpPassManager &passManager,
+                                          bool enableVectorMasking);
 
 // Populates the passes needed to tile, decompose, and vectorize the
 // convolution ops using the Codegen drivers from sandbox.
@@ -385,14 +388,16 @@
     Operation *op, IREE::Codegen::LoweringConfigAttr loweringConfig,
     IREE::Codegen::TranslationInfoAttr translationInfo,
     ArrayRef<int64_t> workgroupSize = {});
-void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager);
+void addConvTileAndDecomposeExpertPassPipeline(OpPassManager &passManager,
+                                               bool enableVectorMasking);
 
 /// Transform dialect-based common.
 void addTransformDialectPasses(OpPassManager &passManager);
 
 /// Populates the passes needed to multi-level tile, fuse, and vectorize the
 /// lowering of linalg ops on tensors to vector operations.
-void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager);
+void addMmt4dTilingExpertPassPipeline(OpPassManager &passManager,
+                                      bool enableVectorMasking);
 
 //----------------------------------------------------------------------------//
 // LLVMCPU Pass Pipelines for lowering to LLVM dialect.
diff --git a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
index 1c890f0..e013737 100644
--- a/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
+++ b/compiler/src/iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
@@ -18,10 +18,7 @@
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
-#include "mlir/Dialect/Linalg/Utils/Utils.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
-#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Dialect/X86Vector/Transforms.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -333,6 +330,7 @@
     this->hoistPaddings = options.hoistPaddings;
     this->transposePaddings = options.transposePaddings;
     this->vectorize = options.vectorize;
+    this->enableVectorMasking = options.enableVectorMasking;
     this->vectorizePadding = options.vectorizePadding;
     this->tilingLevel = options.tilingLevel;
   }
@@ -371,6 +369,7 @@
     this->decomposeToLowerDimOp = options.decomposeToLowerDimOp;
     this->peel = options.peel;
     this->vectorize = options.vectorize;
+    this->enableVectorMasking = options.enableVectorMasking;
     this->vectorizePadding = options.vectorizePadding;
     this->tilingLevel = options.tilingLevel;
   }
@@ -779,8 +778,12 @@
 
   LinalgVectorizationOptions vectorizationOptions;
   vectorizationOptions.setVectorizePadding(vectorizePadding);
-  vectorizationOptions.setCanonicalVectorSizes(getCanonicalVectorShape(funcOp));
-  vectorizationOptions.setVectorSizeComputationFunction(getVectorSizes);
+  vectorizationOptions.setEnableVectorMasking(enableVectorMasking);
+  if (enableVectorMasking) {
+    vectorizationOptions.setCanonicalVectorSizes(
+        getCanonicalVectorShape(funcOp));
+    vectorizationOptions.setVectorSizeComputationFunction(getVectorSizes);
+  }
 
   CodegenStrategy strategy;
   StringRef genericOpName = linalg::GenericOp::getOperationName();
diff --git a/compiler/src/iree/compiler/Codegen/Sandbox/Passes.h b/compiler/src/iree/compiler/Codegen/Sandbox/Passes.h
index ad578a9..9fb64f8 100644
--- a/compiler/src/iree/compiler/Codegen/Sandbox/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Sandbox/Passes.h
@@ -28,6 +28,7 @@
   SmallVector<int64_t> hoistPaddings = {};
   SmallVector<std::string> transposePaddings = {};
   bool vectorize = false;
+  bool enableVectorMasking = false;
   bool vectorizePadding = false;
   int64_t tilingLevel = -1;
 };
@@ -57,6 +58,7 @@
   bool decomposeToLowerDimOp = false;
   bool peel = false;
   bool vectorize = false;
+  bool enableVectorMasking = false;
   bool vectorizePadding = false;
   int64_t tilingLevel = -1;
 };
diff --git a/compiler/src/iree/compiler/Codegen/Sandbox/Passes.td b/compiler/src/iree/compiler/Codegen/Sandbox/Passes.td
index 7317fc4..9be525c 100644
--- a/compiler/src/iree/compiler/Codegen/Sandbox/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Sandbox/Passes.td
@@ -50,6 +50,8 @@
     // Vectorization options.
     Option<"vectorize", "vectorize", "bool", /*default=*/"false",
       "Rewrite the linalg op as a vector operation.">,
+    Option<"enableVectorMasking", "enableVectorMasking", "bool", /*default=*/"false",
+      "Enable vector masking during vectorization.">,
     Option<"vectorizePadding", "vectorize-padding", "bool", /*default=*/"false",
       "Rewrite all tensor.pad ops in the function to vector form.">,
 
@@ -136,6 +138,8 @@
     // Vectorization options.
     Option<"vectorize", "vectorize", "bool", /*default=*/"false",
       "Rewrite the linalg op as a vector operation.">,
+    Option<"enableVectorMasking", "enableVectorMasking", "bool", /*default=*/"false",
+      "Enable vector masking during vectorization.">,
     Option<"vectorizePadding", "vectorize-padding", "bool", /*default=*/"false",
       "Rewrite all tensor.pad ops in the function to vector form.">,
 
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
index c07ce92..57b4281 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
@@ -8,13 +8,10 @@
 
 #include "iree-dialects/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/Codegen/Interfaces/ProcessorOpInterfaces.h"
-#include "iree/compiler/Codegen/Utils/MarkerUtils.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
@@ -195,6 +192,10 @@
   return hasFeature(targetAttr, "+zve64x");
 }
 
+bool hasAnySVEFeature(IREE::HAL::ExecutableTargetAttr targetAttr) {
+  return hasFeature(targetAttr, "+sve") || hasFeature(targetAttr, "+sve2");
+}
+
 bool isReadOnly(Value v) {
   Operation *definingOp = v.getDefiningOp();
   if (!definingOp) return false;
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
index 3593354..2121d80 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
@@ -92,6 +92,10 @@
 /// Returns true if the 'targetAttr' contains '+zve64x' in its cpu features.
 bool hasZve64xFeature(IREE::HAL::ExecutableTargetAttr targetAttr);
 
+/// Returns true if the 'targetAttr' contains '+sve' or '+sve2' in its cpu
+/// features.
+bool hasAnySVEFeature(IREE::HAL::ExecutableTargetAttr targetAttr);
+
 /// Checks if a tensor value is generated from a read-only object, like
 /// an interface binding with a read-only attribute or from an `arith.constant`
 /// operation.
diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
index 196c7fa..7b3ecd8 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Dialect/LinalgExt/Passes/Passes.h
@@ -256,6 +256,14 @@
     std::function<SmallVector<int64_t>(linalg::LinalgOp, ArrayRef<int64_t>)>;
 
 struct LinalgVectorizationOptions {
+  /// Enable vector masking during vectorization.
+  bool enableVectorMasking = false;
+
+  LinalgVectorizationOptions &setEnableVectorMasking(bool val) {
+    enableVectorMasking = val;
+    return *this;
+  }
+
   /// Canonical vector sizes for the vector iteration space (i.e., vectorization
   /// factors). They are optional for input code with full static shapes.
   SmallVector<int64_t> canonicalVectorSizes;
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
index e2c7fc1..5201d55 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgExt/Transforms/Transforms.cpp
@@ -336,7 +336,7 @@
   if (failed(filter.checkAndNotify(rewriter, linalgOp)))
     return failure();
   SmallVector<int64_t> vectorSizes;
-  if (options.vectorSizeComputationFunction)
+  if (options.enableVectorMasking)
     vectorSizes.append(options.vectorSizeComputationFunction(
         linalgOp, options.canonicalVectorSizes));
   return vectorize(rewriter, linalgOp, vectorSizes);