[CPU] Add an e2e test for i4 store emulation. (#15539)

Fixes https://github.com/openxla/iree/issues/15369
diff --git a/tests/e2e/linalg/BUILD.bazel b/tests/e2e/linalg/BUILD.bazel
index a4a7e55..fa856fd 100644
--- a/tests/e2e/linalg/BUILD.bazel
+++ b/tests/e2e/linalg/BUILD.bazel
@@ -24,9 +24,16 @@
         "i4_to_f32.mlir",
     ],
     include = ["*.mlir"],
-    exclude = ["large_linalg_matmul.mlir"],
+    exclude = [
+        "large_linalg_matmul.mlir",
+        "f32_to_i4.mlir",
+    ],
 )
 
+LLVM_I4_SRCS = [
+    "f32_to_i4.mlir",
+]
+
 iree_check_single_backend_test_suite(
     name = "check_llvm-cpu_local-task",
     srcs = LLVM_SRCS,
@@ -35,6 +42,18 @@
 )
 
 iree_check_single_backend_test_suite(
+    name = "check_i4_llvm-cpu_local-task",
+    srcs = LLVM_I4_SRCS,
+    driver = "local-task",
+    tags = [
+        # TODO(#15540): RISC-V needs sub-byte emulation for vector.maskedstore
+        # ops. Enable the test after it is supported.
+        "noriscv",
+    ],
+    target_backend = "llvm-cpu",
+)
+
+iree_check_single_backend_test_suite(
     name = "check_winograd_llvm-cpu_local-task",
     srcs = LLVM_SRCS,
     compiler_flags = [
@@ -52,6 +71,7 @@
     exclude = [
         "large_linalg_matmul.mlir",
         "i4_to_f32.mlir",
+        "f32_to_i4.mlir",
     ],
 )
 
@@ -68,7 +88,10 @@
         "i4_to_f32.mlir",
     ],
     include = ["*.mlir"],
-    exclude = ["large_linalg_matmul.mlir"],
+    exclude = [
+        "large_linalg_matmul.mlir",
+        "f32_to_i4.mlir",
+    ],
 )
 
 iree_check_single_backend_test_suite(
diff --git a/tests/e2e/linalg/CMakeLists.txt b/tests/e2e/linalg/CMakeLists.txt
index 7f2e9b5..6d83754 100644
--- a/tests/e2e/linalg/CMakeLists.txt
+++ b/tests/e2e/linalg/CMakeLists.txt
@@ -24,6 +24,19 @@
 
 iree_check_single_backend_test_suite(
   NAME
+    check_i4_llvm-cpu_local-task
+  SRCS
+    "f32_to_i4.mlir"
+  TARGET_BACKEND
+    "llvm-cpu"
+  DRIVER
+    "local-task"
+  LABELS
+    "noriscv"
+)
+
+iree_check_single_backend_test_suite(
+  NAME
     check_winograd_llvm-cpu_local-task
   SRCS
     "conv2d.mlir"
diff --git a/tests/e2e/linalg/f32_to_i4.mlir b/tests/e2e/linalg/f32_to_i4.mlir
new file mode 100644
index 0000000..5abf4cc
--- /dev/null
+++ b/tests/e2e/linalg/f32_to_i4.mlir
@@ -0,0 +1,24 @@
+func.func @f32_to_i4_1d() {
+  %input = util.unfoldable_constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]> : tensor<8xf32>
+  %init0 = tensor.empty() : tensor<8xi4>
+  %res = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
+    ins(%input : tensor<8xf32>) outs(%init0 : tensor<8xi4>) {
+  ^bb0(%in: f32, %out: i4):
+    %2 = arith.fptoui %in : f32 to i32
+    %3 = arith.trunci %2 : i32 to i4
+    linalg.yield %3 : i4
+  } -> tensor<8xi4>
+
+  // TODO(#14996): Remove the sign extension and check the i4 results directly.
+  %blocker = util.optimization_barrier %res : tensor<8xi4>
+  %init1 = tensor.empty() : tensor<8xi8>
+  %exti8 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
+    ins(%blocker : tensor<8xi4>) outs(%init1 : tensor<8xi8>) {
+  ^bb0(%in: i4, %out: i8):
+    %2 = arith.extsi %in : i4 to i8
+    linalg.yield %2 : i8
+  } -> tensor<8xi8>
+
+  check.expect_eq_const(%exti8, dense<[0, 1, 2, 3, 4, 5, 6, 7]> : tensor<8xi8>) : tensor<8xi8>
+  return
+}