[CPU][SVE] Enforce > 1 pow2 sizes when materializing scalable matmul lowering_configs (#15276)
The backend struggles to legalize non-power-of-two scalable vector sizes
and in many cases aborts, so we should stick to power-of-two sizes where
we can. Likewise, 1x scalable vectors (e.g. vector<[1]xty>) are poorly
supported, so we fall back to fixed vectorization in that case.
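
For illustration, a minimal sketch of the intended behaviour follows; the
helper names (largestPowerOfTwo, pickParallelTile, TileChoice) are
hypothetical and not the actual IREE signatures:

```cpp
// Sketch of the two rules this patch applies when picking scalable matmul
// tile sizes:
//   1. clamp a scalable tile size to a power of two, and
//   2. drop the scalable flag entirely when the chosen size is 1, since
//      vector<[1]xty> is poorly supported.
#include <cstdint>

// Stand-in for the new enforcePowerOfTwo behaviour: returns the largest
// power of two that is <= maxTileSize.
static int64_t largestPowerOfTwo(int64_t maxTileSize) {
  int64_t pow2 = 1;
  while (pow2 * 2 <= maxTileSize)
    pow2 *= 2;
  return pow2;
}

struct TileChoice {
  int64_t size;
  bool scalable;
};

// For one parallel dim: pick the tile size and decide whether it stays
// scalable, mirroring the logic added to KernelDispatch.cpp.
static TileChoice pickParallelTile(int64_t requestedSize, bool wantScalable) {
  int64_t sz = requestedSize;
  if (wantScalable && sz != 0)
    sz = largestPowerOfTwo(sz);                   // rule 1: power-of-two only
  bool scalable = sz > 1 ? wantScalable : false;  // rule 2: no 1x scalable
  return {sz, scalable};
}
```

The real getMaxVectorTileSize() also accounts for the loop bound and the
allowIncompleteTile flag; the sketch only shows the power-of-two and > 1
constraints.
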
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index a48b470..84d8649 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -865,15 +865,19 @@
bool allowIncompleteTile =
vecPreProcStrategy == VectorPreProcStrategy::Peeling ||
vecPreProcStrategy == VectorPreProcStrategy::Masking;
+ // The backend struggles to legalize non-power-of-two scalable vectors.
+ bool enforcePowerOfTwo = vecScalableDims[index];
if (sz != 0) {
sz = getMaxVectorTileSize(
/*lb=*/0, /*ub=*/shape[index],
- /*maxTileSize=*/sz, vectorSize, allowIncompleteTile);
+ /*maxTileSize=*/sz, vectorSize, allowIncompleteTile,
+ enforcePowerOfTwo);
}
parallelTileSizes.push_back(sz);
- // TODO: How to handle scalable sizes with getMaxVectorTileSize()?
- parallelScalableFlags.push_back(vecScalableDims[index]);
+ // 1x scalable vectors, e.g. vector<[1]xty>, are also poorly supported, so
+ // fall back to fixed vectorization if they occur:
+ parallelScalableFlags.push_back(sz > 1 ? vecScalableDims[index] : false);
}
SmallVector<int64_t> reductionTileSizes;
SmallVector<bool> reductionScalableFlags;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
index f3fa654..ff4cff5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/materialize_aarch64_launch_configuration.mlir
@@ -111,6 +111,88 @@
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
#hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>,
+ #hal.descriptor_set.binding<3, storage_buffer>
+ ]>
+]>
+hal.executable private @matmul_static_tensors_sve {
+ hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ cpu_features = "+sve",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-none-elf"
+ }>) {
+ hal.executable.export @static_tensors_non_pow_two_sizes layout(#pipeline_layout)
+ builtin.module {
+ func.func @static_tensors_non_pow_two_sizes() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x9xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x9xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 9], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x9xf32>> -> tensor<14x9xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 9], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x9xf32>> -> tensor<15x9xf32>
+ %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x9xf32>) outs(%5 : tensor<15x9xf32>) -> tensor<15x9xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 9], strides = [1, 1] : tensor<15x9xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x9xf32>>
+ return
+ }
+ }
+ }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[5, 9, 0], [5, [16], 0], [0, 0, 14], [0, 0, 0]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+// CHECK: hal.executable.export public @static_tensors_non_pow_two_sizes
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>,
+ #hal.descriptor_set.binding<3, storage_buffer>
+ ]>
+]>
+hal.executable private @matmul_static_tensors_sve {
+ hal.executable.variant @llvm target(<"llvm-cpu", "embedded-elf-arm_64", {
+ data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+ cpu_features = "+sve",
+ native_vector_size = 16 : index,
+ target_triple = "aarch64-none-elf"
+ }>) {
+ hal.executable.export @static_tensors_1x1 layout(#pipeline_layout)
+ builtin.module {
+ func.func @static_tensors_1x1() {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+ %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+ %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+ %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+ return
+ }
+ }
+ }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 0, 0], [1, 1, 0], [0, 0, 1], [0, 0, 0]]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+// CHECK: hal.executable.export public @static_tensors_1x1
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.matmul
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>