[CPU] Enable sub-byte transpose emulation (#16277)
This PR enables the emulation of sub-byte vector transposes by promoting
them to i8 vector transposes. This should ensure correctness for any kind
of sub-byte vector transpose.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp
index d5bbc65..7dd80e4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp
@@ -61,10 +61,17 @@
vectorTransformOptions.setVectorTransposeLowering(
vector::VectorTransposeLowering::Shuffle16x16);
}
+
+ constexpr unsigned kSpecializedBenefit = 10;
+ constexpr unsigned kNarrowTypeEmulationBenefit = 20;
+
RewritePatternSet patterns(ctx);
vector::populateVectorToVectorCanonicalizationPatterns(patterns);
vector::populateVectorTransposeLoweringPatterns(patterns,
vectorTransformOptions);
+ vector::populateVectorTransposeNarrowTypeRewritePatterns(
+ patterns, kNarrowTypeEmulationBenefit);
+
if (lowerVectorTransposeToAVX2) {
auto avx2LoweringOptions =
x86vector::avx2::LoweringOptions().setTransposeOptions(
@@ -72,7 +79,7 @@
.lower4x8xf32()
.lower8x8xf32());
x86vector::avx2::populateSpecializedTransposeLoweringPatterns(
- patterns, avx2LoweringOptions, /*benefit=*/10);
+ patterns, avx2LoweringOptions, kSpecializedBenefit);
}
(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
index 8876f1c..64b28c6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -60,6 +60,7 @@
"vector_contract_to_arm_asm.mlir",
"vector_contract_to_arm_intrinsics.mlir",
"vector_lowering.mlir",
+ "vector_transpose_lowering.mlir",
"vectorize_with_masking_and_hoist.mlir",
"verify_linalg_transform_legality.mlir",
],
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index b1f9faa..c508fd4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -55,6 +55,7 @@
"vector_contract_to_arm_asm.mlir"
"vector_contract_to_arm_intrinsics.mlir"
"vector_lowering.mlir"
+ "vector_transpose_lowering.mlir"
"vectorize_with_masking_and_hoist.mlir"
"verify_linalg_transform_legality.mlir"
TOOLS
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_transpose_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_transpose_lowering.mlir
new file mode 100644
index 0000000..73c7ed7
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_transpose_lowering.mlir
@@ -0,0 +1,16 @@
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmcpu-vector-transpose-lowering))' --split-input-file %s | FileCheck %s
+
+// Verify that the vector transpose lowering patterns trigger as expected. We
+// shouldn't check the pattern output in detail as that testing should happen in
+// MLIR, where the patterns are implemented.
+
+func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> {  // i4 is a sub-byte element type.
+ %0 = vector.transpose %a, [1, 0] : vector<8x16xi4> to vector<16x8xi4>  // The checks below expect this transpose to be carried out on i8.
+ return %0 : vector<16x8xi4>
+}
+
+// CHECK-LABEL: func.func @i4_transpose(
+// CHECK: arith.extsi %{{.*}} : vector<8x16xi4> to vector<8x16xi8>
+// CHECK: vector.shuffle
+// CHECK: arith.trunci %{{.*}} : vector<16x8xi8> to vector<16x8xi4>
+