[CPU] Enable sub-byte transpose emulation (#16277)

This PR enables emulation of sub-byte vector transposes by promoting
them to i8 vector transposes. This should ensure correctness for any
kind of sub-byte vector transpose.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp
index d5bbc65..7dd80e4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUVectorTransposeLowering.cpp
@@ -61,10 +61,17 @@
     vectorTransformOptions.setVectorTransposeLowering(
         vector::VectorTransposeLowering::Shuffle16x16);
   }
+
+  constexpr unsigned kSpecializedBenefit = 10;
+  constexpr unsigned kNarrowTypeEmulationBenefit = 20;
+
   RewritePatternSet patterns(ctx);
   vector::populateVectorToVectorCanonicalizationPatterns(patterns);
   vector::populateVectorTransposeLoweringPatterns(patterns,
                                                   vectorTransformOptions);
+  vector::populateVectorTransposeNarrowTypeRewritePatterns(
+      patterns, kNarrowTypeEmulationBenefit);
+
   if (lowerVectorTransposeToAVX2) {
     auto avx2LoweringOptions =
         x86vector::avx2::LoweringOptions().setTransposeOptions(
@@ -72,7 +79,7 @@
                 .lower4x8xf32()
                 .lower8x8xf32());
     x86vector::avx2::populateSpecializedTransposeLoweringPatterns(
-        patterns, avx2LoweringOptions, /*benefit=*/10);
+        patterns, avx2LoweringOptions, kSpecializedBenefit);
   }
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
index 8876f1c..64b28c6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -60,6 +60,7 @@
             "vector_contract_to_arm_asm.mlir",
             "vector_contract_to_arm_intrinsics.mlir",
             "vector_lowering.mlir",
+            "vector_transpose_lowering.mlir",
             "vectorize_with_masking_and_hoist.mlir",
             "verify_linalg_transform_legality.mlir",
         ],
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
index b1f9faa..c508fd4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -55,6 +55,7 @@
     "vector_contract_to_arm_asm.mlir"
     "vector_contract_to_arm_intrinsics.mlir"
     "vector_lowering.mlir"
+    "vector_transpose_lowering.mlir"
     "vectorize_with_masking_and_hoist.mlir"
     "verify_linalg_transform_legality.mlir"
   TOOLS
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_transpose_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_transpose_lowering.mlir
new file mode 100644
index 0000000..73c7ed7
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_transpose_lowering.mlir
@@ -0,0 +1,16 @@
+// RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmcpu-vector-transpose-lowering))' --split-input-file %s | FileCheck %s
+
+// Verify that the vector transpose lowering patterns trigger as expected. We
+// deliberately don't check the pattern output in detail: that testing belongs
+// in upstream MLIR, where the patterns are implemented.
+
+func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> {
+  %0 = vector.transpose %a, [1, 0] : vector<8x16xi4> to vector<16x8xi4>
+  return %0 : vector<16x8xi4>
+}
+
+// CHECK-LABEL: func.func @i4_transpose(
+//       CHECK:   arith.extsi %{{.*}} : vector<8x16xi4> to vector<8x16xi8>
+//       CHECK:   vector.shuffle
+//       CHECK:   arith.trunci %{{.*}} : vector<16x8xi8> to vector<16x8xi4>
+