[CPU] Limit vectorization tile sizes for SVE (#18846)

This prevents large vector sizes. A regression test is included with a
linalg.pooling_nchw_max operation that currently fails to compile with

error: One or more operations with large vector sizes (8192 bytes) were
found:

when SVE is enabled, even though SVE isn't used.

---------

Signed-off-by: Cullen Rhodes <cullen.rhodes@arm.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 6f99834..5111b76 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -792,11 +792,8 @@
       return 16 * 128;
     }
   } else if (isAArch64(target)) {
-    // Can't determine register space size at compile time on SVE.
-    if (hasFeature(target, "+sve") || hasFeature(target, "+sve2")) {
-      return 0;
-    }
-    // 32 NEON registers (128-bit each).
+    // 32 NEON/SVE registers (at least 128-bit each, returns the base size for
+    // SVE).
     return 32 * 128;
   } else {
     // Don't know register space size as a compile-time constant on other
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir
index 757a039..1308442 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir
@@ -28,7 +28,7 @@
   return
 }
 
-//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
+//   CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [4, [16], 0], [0, 0, 1], [0, 0, 0]]>
 //   CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //       CHECK: func.func @matmul_tensors()
 //  CHECK-SAME:     translation_info = #[[TRANSLATION]]
@@ -118,7 +118,7 @@
   return
 }
 
-//  DISABLE-ARM-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
+//  DISABLE-ARM-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [4, [16], 0], [0, 0, 1], [0, 0, 0]]>
 //  DISABLE-ARM-SME-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      DISABLE-ARM-SME: func.func @matmul_tensors()
 //  DISABLE-ARM-SME-SAME:     translation_info = #[[TRANSLATION]]
@@ -179,8 +179,8 @@
   return
 }
 
-// CHECK-DAG:  #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [8, [16]], [0, 0], [0, 0]]>
-// CHECK-DAG:  #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
+// CHECK-DAG:  #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [4, [16]], [0, 0], [0, 0]]>
+// CHECK-DAG:  #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [4, [16], 0], [0, 0, 1], [0, 0, 0]]>
 // CHECK:      #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 // CHECK:      func.func @matmul_with_fill()
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
@@ -217,3 +217,34 @@
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 // CHECK:      linalg.depthwise_conv_2d_nhwc_hwc
 // CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+// -----
+
+// Regression test. SVE isn't used (scalable vectorizaton of this op is not yet
+// supported), but used to fail to compile when SVE was enabled due to tile
+// sizes leading to large vectors.
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
+func.func @pooling_nchw_max(%arg0: !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>) attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+  %cst = arith.constant 0.0 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 64, 114, 114], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>> -> tensor<1x64x114x114xf32>
+  %3 = tensor.empty() : tensor<1x64x56x56xf32>
+  %4 = tensor.empty() : tensor<3x3xf32>
+  %5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
+  %6 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%2, %4 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
+  flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 64, 56, 56], strides = [1, 1, 1, 1] : tensor<1x64x56x56xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
+  return
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 32, 56, 8, 0, 0], [1, 2, 1, 8, 0, 0], [0, 0, 0, 0, 1, 3], [0, 0, 0, 0, 0, 0]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
+// CHECK:      func.func @pooling_nchw_max
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+// CHECK:      linalg.pooling_nchw_max
+// CHECK-SAME:     lowering_config = #[[CONFIG]]