[CPU][SVE] Enforce > 1 pow2 sizes when materializing scalable matmul lowering_configs (#15276)

The backend struggles to legalize non-power-of-two scalable vector sizes
and in many cases aborts, so where we can avoid them we should stick to
power-of-two sizes. Likewise, 1x scalable vectors (e.g. vector<[1]xty>)
are poorly supported, so we fall back to fixed vectorization in that
case.
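
For context, here is a minimal sketch of the intent behind the two checks. This is not IREE's actual getMaxVectorTileSize() heuristic; the helper names are hypothetical, the power-of-two snapping shown is only one plausible policy, and it assumes C++20:

```cpp
// Illustrative sketch only -- not the real getMaxVectorTileSize(). Assumes C++20.
#include <bit>
#include <cassert>
#include <cstdint>

// With power-of-two enforcement requested, snap a candidate scalable tile
// size down to the nearest power of two (e.g. 9 -> 8, 16 -> 16). The real
// heuristic may pick a different power-of-two size.
int64_t snapToPowerOfTwo(int64_t tileSize) {
  assert(tileSize > 0 && "tile size must be positive");
  return static_cast<int64_t>(std::bit_floor(static_cast<uint64_t>(tileSize)));
}

// Scalable vectorization is only kept for tile sizes > 1; a unit tile
// (which would become vector<[1]xty>) falls back to fixed vectorization,
// mirroring the `sz > 1 ? vecScalableDims[index] : false` change below.
bool keepScalableFlag(int64_t tileSize, bool wasScalable) {
  return tileSize > 1 && wasScalable;
}
```
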
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index a48b470..84d8649 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -865,15 +865,19 @@
     bool allowIncompleteTile =
         vecPreProcStrategy == VectorPreProcStrategy::Peeling ||
         vecPreProcStrategy == VectorPreProcStrategy::Masking;
+    // The backend struggles to legalize non-power-of-two scalable vectors.
+    bool enforcePowerOfTwo = vecScalableDims[index];
 
     if (sz != 0) {
       sz = getMaxVectorTileSize(
           /*lb=*/0, /*ub=*/shape[index],
-          /*maxTileSize=*/sz, vectorSize, allowIncompleteTile);
+          /*maxTileSize=*/sz, vectorSize, allowIncompleteTile,
+          enforcePowerOfTwo);
     }
     parallelTileSizes.push_back(sz);
-    // TODO: How to handle scalable sizes with getMaxVectorTileSize()?
-    parallelScalableFlags.push_back(vecScalableDims[index]);
+    // 1x scalable vectors, e.g. vector<[1]xty>, are also poorly supported,
+    // so fall back to fixed vectorization if they occur:
+    parallelScalableFlags.push_back(sz > 1 ? vecScalableDims[index] : false);
   }
   SmallVector<int64_t> reductionTileSizes;
   SmallVector<bool> reductionScalableFlags;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
index f3fa654..ff4cff5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
@@ -111,6 +111,88 @@
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
     #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
+hal.executable private @matmul_static_tensors_sve  {
+  hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    cpu_features = "+sve",
+    native_vector_size = 16 : index,
+    target_triple = "aarch64-none-elf"
+  }>) {
+    hal.executable.export @static_tensors_non_pow_two_sizes layout(#pipeline_layout)
+    builtin.module {
+      func.func @static_tensors_non_pow_two_sizes() {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x9xf32>>
+        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x9xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 9], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x9xf32>> -> tensor<14x9xf32>
+        %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 9], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x9xf32>> -> tensor<15x9xf32>
+        %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x9xf32>) outs(%5 : tensor<15x9xf32>) -> tensor<15x9xf32>
+        flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 9], strides = [1, 1] : tensor<15x9xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x9xf32>>
+        return
+      }
+    }
+  }
+}
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[5, 9, 0], [5, [16], 0], [0, 0, 14], [0, 0, 0]]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+//      CHECK: hal.executable.export public @static_tensors_non_pow_two_sizes
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK: linalg.matmul
+// CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
+hal.executable private @matmul_static_tensors_sve  {
+  hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
+    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+    cpu_features = "+sve",
+    native_vector_size = 16 : index,
+    target_triple = "aarch64-none-elf"
+  }>) {
+    hal.executable.export @static_tensors_1x1 layout(#pipeline_layout)
+    builtin.module {
+      func.func @static_tensors_1x1() {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+        %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+        %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+        flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+        return
+      }
+    }
+  }
+}
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [1, 1, 0], [0, 0, 1], [0, 0, 0]]>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+//      CHECK: hal.executable.export public @static_tensors_1x1
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK: linalg.matmul
+// CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
     #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>