[CPU] Add a matmul test suite for data-tiling codegen. (#15738)

Similar to the nondt test suite, the new data-tiling (no-ukernels) test suite
disables bf16 types because of https://github.com/openxla/iree/issues/15258
diff --git a/tests/e2e/matmul/BUILD.bazel b/tests/e2e/matmul/BUILD.bazel
index 4dd98be..5a78e4a 100644
--- a/tests/e2e/matmul/BUILD.bazel
+++ b/tests/e2e/matmul/BUILD.bazel
@@ -29,6 +29,7 @@
     name = "e2e_matmul_nondt_%s_%s_%s" % (lhs_rhs_type, acc_type, size),
     compiler_flags = [
         "--iree-opt-data-tiling=false",
+        "--iree-llvmcpu-enable-ukernels=none",
     ],
     generator = ":generate_e2e_matmul_tests",
     generator_args = [
@@ -84,15 +85,17 @@
     "+avx512bf16",
 ]
 
-# LLVMCPU, data-tiling + microkernels.
-# TODO(#15241, #15215): also test data-tiling alone without microkernels. This currently
-# fails (#15241), which needs to be resolved to unblock data-tiling-by-default (#15215).
+# LLVMCPU, data-tiling, data-tiling + ukernels.
 [iree_generated_trace_runner_test(
-    name = "e2e_matmul_dt_uk_%s_%s_%s" % (lhs_rhs_type, acc_type, size),
+    name = "e2e_matmul_dt%s_%s_%s_%s" % (
+        ("_uk" if use_uk else ""),
+        lhs_rhs_type,
+        acc_type,
+        size,
+    ),
     compiler_flags = [
         "--iree-opt-data-tiling",
-        "--iree-llvmcpu-enable-ukernels=all",
-    ],
+    ] + ["--iree-llvmcpu-enable-ukernels=%s" % ("all" if use_uk else "none")],
     generator = ":generate_e2e_matmul_tests",
     generator_args = [
         "--lhs_rhs_type=%s" % lhs_rhs_type,
@@ -140,14 +143,21 @@
                                        "arm_64:bf16:+bf16",
                                    ] if lhs_rhs_type == "bf16" and acc_type == "f32" else []),
     trace_runner = "//tools:iree-e2e-matmul-test",
-) for (lhs_rhs_type, acc_type) in [
-    ("i8", "i32"),
-    ("f32", "f32"),
-    ("f16", "f16"),
-    ("f16", "f32"),
-    ("bf16", "bf16"),
-    ("bf16", "f32"),
-] for size in [
+) for use_uk in [
+    False,
+    True,
+] for (lhs_rhs_type, acc_type) in (
+    [
+        ("i8", "i32"),
+        ("f32", "f32"),
+        ("f16", "f16"),
+        ("f16", "f32"),
+    ] + ([
+        # TODO(#15258): enable bf16 tests for !use_uk when that bug is fixed.
+        ("bf16", "bf16"),
+        ("bf16", "f32"),
+    ] if use_uk else [])
+) for size in [
     "small",
     "large",
 ]]
diff --git a/tests/e2e/matmul/CMakeLists.txt b/tests/e2e/matmul/CMakeLists.txt
index 34ad477..18bf0f3 100644
--- a/tests/e2e/matmul/CMakeLists.txt
+++ b/tests/e2e/matmul/CMakeLists.txt
@@ -27,6 +27,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
 
   TARGET_CPU_FEATURES_VARIANTS
@@ -50,6 +51,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
 
   TARGET_CPU_FEATURES_VARIANTS
@@ -73,6 +75,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
 
   TARGET_CPU_FEATURES_VARIANTS
@@ -97,6 +100,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
 
   TARGET_CPU_FEATURES_VARIANTS
@@ -121,6 +125,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
     "noriscv"
     "nowasm"
@@ -146,6 +151,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
     "noriscv"
     "nowasm"
@@ -171,6 +177,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
     "noriscv"
     "nowasm"
@@ -195,6 +202,7 @@
     "local-task"
   COMPILER_FLAGS
     "--iree-opt-data-tiling=false"
+    "--iree-llvmcpu-enable-ukernels=none"
   LABELS
     "noriscv"
     "nowasm"
@@ -204,6 +212,230 @@
 
 iree_generated_trace_runner_test(
   NAME
+    e2e_matmul_dt_i8_i32_small
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=i8"
+    "--acc_type=i32"
+    "--shapes=small"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "arm_64:dotprod:+dotprod"
+    "arm_64:i8mm:+i8mm"
+    "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_dt_i8_i32_large
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=i8"
+    "--acc_type=i32"
+    "--shapes=large"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+    "noasan"
+    "notsan"
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "arm_64:dotprod:+dotprod"
+    "arm_64:i8mm:+i8mm"
+    "x86_64:avx512vnni:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq,+avx512vnni"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_dt_f32_f32_small
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f32"
+    "--acc_type=f32"
+    "--shapes=small"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "x86_64:avx2:+avx,+avx2,+fma,+f16c"
+    "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_dt_f32_f32_large
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f32"
+    "--acc_type=f32"
+    "--shapes=large"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+    "noasan"
+    "notsan"
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "x86_64:avx2:+avx,+avx2,+fma,+f16c"
+    "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_dt_f16_f16_small
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f16"
+    "--acc_type=f16"
+    "--shapes=small"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+    "noriscv"
+    "nowasm"
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "x86_64:avx2:+avx,+avx2,+fma,+f16c"
+    "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq"
+    "arm_64:fullfp16:+fullfp16"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_dt_f16_f16_large
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f16"
+    "--acc_type=f16"
+    "--shapes=large"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+    "noasan"
+    "notsan"
+    "noriscv"
+    "nowasm"
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "x86_64:avx2:+avx,+avx2,+fma,+f16c"
+    "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq"
+    "arm_64:fullfp16:+fullfp16"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_dt_f16_f32_small
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f16"
+    "--acc_type=f32"
+    "--shapes=small"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+    "noriscv"
+    "nowasm"
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "x86_64:avx2:+avx,+avx2,+fma,+f16c"
+    "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq"
+    "arm_64:fp16fml:+fp16fml"
+)
+
+iree_generated_trace_runner_test(
+  NAME
+    e2e_matmul_dt_f16_f32_large
+  GENERATOR
+    "generate_e2e_matmul_tests.py"
+  GENERATOR_ARGS
+    "--lhs_rhs_type=f16"
+    "--acc_type=f32"
+    "--shapes=large"
+  TRACE_RUNNER
+    iree-e2e-matmul-test
+  TARGET_BACKENDS
+    "llvm-cpu"
+  DRIVERS
+    "local-task"
+  COMPILER_FLAGS
+    "--iree-opt-data-tiling"
+    "--iree-llvmcpu-enable-ukernels=none"
+  LABELS
+    "noasan"
+    "notsan"
+    "noriscv"
+    "nowasm"
+  TARGET_CPU_FEATURES_VARIANTS
+    "default"
+    "x86_64:avx2:+avx,+avx2,+fma,+f16c"
+    "x86_64:avx512:+avx,+avx2,+fma,+f16c,+avx512f,+avx512vl,+avx512cd,+avx512bw,+avx512dq"
+    "arm_64:fp16fml:+fp16fml"
+)
+
+iree_generated_trace_runner_test(
+  NAME
     e2e_matmul_dt_uk_i8_i32_small
   GENERATOR
     "generate_e2e_matmul_tests.py"