# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# End-to-end matrix multiplication tests.
load("//build_tools/bazel:iree_e2e_generated_runner_test.bzl", "iree_generated_e2e_runner_test")
package(
features = ["layering_check"],
licenses = ["notice"], # Apache 2.0
)
py_binary(
name = "generate_e2e_matmul_tests",
srcs = ["generate_e2e_matmul_tests.py"],
)
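
# For local debugging, the generator can also be run by hand. A representative
# invocation (the output flag names are assumed from the script, not guaranteed):
#   python generate_e2e_matmul_tests.py --lhs_rhs_type=f32 --acc_type=f32 \
#     --output_matmuls_mlir=matmuls.mlir --output_calls_mlir=calls.mlir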
###########################################################################
##
## LLVMCPU backend
##
###########################################################################
# LLVMCPU, non-data-tiling, no microkernels
[iree_generated_e2e_runner_test(
name = "e2e_matmul_cpu_nondt_%s_%s" % (lhs_rhs_type, acc_type),
compiler_flags = [
"--iree-opt-data-tiling=false",
"--iree-llvmcpu-enable-ukernels=none",
"--iree-llvmcpu-enable-scalable-vectorization",
"--iree-llvmcpu-target-triple=aarch64-unknown-unknown",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
],
tags = [
# f16/bf16 trigger internal LLVM assertion errors on riscv and wasm.
"noriscv",
"nowasm",
] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else [],
target_backends_and_drivers = [
("llvm-cpu", "local-task"),
],
target_cpu_features_variants = ["generic"] +
# Widening matmuls fail to lower for SVE.
(["arm_64:sve:+sve"] if lhs_rhs_type == acc_type else []),
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for (lhs_rhs_type, acc_type) in [
# ("i8", "i32"), # TODO(#15800): enable once compile time is reasonable
# ("f32", "f32"), # TODO(#15800): enable once compile time is reasonable
# ("f16", "f16"), # TODO(#15800): enable once compile time is reasonable
# ("f16", "f32"), # TODO(#15800): enable once compile time is reasonable
# TODO(#15258): enable bf16 tests when that bug is fixed.
# ("bf16", "bf16"),
# ("bf16", "f32"),
]]
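
# NOTE: every (lhs_rhs_type, acc_type) pair above is currently commented out,
# so this comprehension generates no targets today. If ("f32", "f32") were
# re-enabled, the resulting target would be named e2e_matmul_cpu_nondt_f32_f32.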
PREPROCESSING_TRANSPOSE_LHS = "--iree-preprocessing-pass-pipeline=builtin.module\\(util.func\\(iree-preprocessing-transpose-matmul-pass{input=lhs}\\)\\)"
PREPROCESSING_PEEL = "--iree-llvmcpu-vector-pproc-strategy=peel"
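
# The escaped \\( ... \\) in PREPROCESSING_TRANSPOSE_LHS survive flag
# tokenization in the generated test command; the compiler ultimately sees:
#   --iree-preprocessing-pass-pipeline=builtin.module(util.func(iree-preprocessing-transpose-matmul-pass{input=lhs}))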
# LLVMCPU, non-data-tiling, no microkernels, ArmSME
[iree_generated_e2e_runner_test(
name = "e2e_matmul_cpu_arm_sme_nondt_%s_%s%s" % (
dtype,
"_transpose_lhs" if transpose_lhs else "",
"_peel" if peel else "",
),
compiler_flags = [
"--iree-opt-data-tiling=false",
"--iree-llvmcpu-enable-scalable-vectorization",
"--iree-llvmcpu-target-triple=aarch64-unknown-unknown",
] + ([PREPROCESSING_TRANSPOSE_LHS] if transpose_lhs else []) +
([PREPROCESSING_PEEL] if peel else []),
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % dtype,
"--acc_type=%s" % dtype,
],
tags = [
"requires-arm-sme",
],
target_backends_and_drivers = [
("llvm-cpu", "local-task"),
],
target_cpu_features_variants = ["arm_64:sme:+sve,+sme"],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for dtype in [
"f32",
# f64 disabled because it wasn't supported by the test generator at the time
# this was added. When adding it in the future, consider passing
# --iree-input-demote-f64-to-f32=false to the compiler.
# "f64"
] for transpose_lhs in [
True,
False,
] for peel in [
True,
False,
]]
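
# Example expanded names from the comprehension above:
#   e2e_matmul_cpu_arm_sme_nondt_f32
#   e2e_matmul_cpu_arm_sme_nondt_f32_transpose_lhs
#   e2e_matmul_cpu_arm_sme_nondt_f32_peel
#   e2e_matmul_cpu_arm_sme_nondt_f32_transpose_lhs_peel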
X86_64_AVX2 = [
"+avx",
"+avx2",
"+fma",
"+f16c",
]
X86_64_AVX512 = X86_64_AVX2 + [
"+avx512f",
"+avx512vl",
"+avx512cd",
"+avx512bw",
"+avx512dq",
]
X86_64_AVX512_VNNI = X86_64_AVX512 + [
"+avx512vnni",
]
X86_64_AVX512_BF16 = X86_64_AVX512 + [
"+avx512bf16",
]
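
# Each target_cpu_features_variants entry below follows the
# "<arch>:<variant-name>:<comma-separated target features>" convention used by
# the test rule ("generic" is the featureless baseline), e.g.:
#   "x86_64:avx2:" + ",".join(X86_64_AVX2)  # -> "x86_64:avx2:+avx,+avx2,+fma,+f16c"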
# LLVMCPU, data-tiling, data-tiling + ukernels.
[iree_generated_e2e_runner_test(
name = "e2e_matmul_cpu_dt%s_%s_%s" % (
("_uk" if use_uk else ""),
lhs_rhs_type,
acc_type,
),
    compiler_flags = [
        "--iree-opt-data-tiling",
        "--iree-llvmcpu-enable-ukernels=%s" % ("all" if use_uk else "none"),
    ] + (["--iree-input-demote-f64-to-f32=false"] if acc_type == "f64" else []),
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
] + (["--shapes=small"] if acc_type == "f64" else []),
tags = ([
# f16/bf16 trigger internal LLVM assertion errors on riscv and wasm.
"noriscv",
"nowasm",
] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else []),
target_backends_and_drivers = [
("llvm-cpu", "local-task"),
],
target_cpu_features_variants = ["generic"] +
([
"arm_64:dotprod:+dotprod",
"arm_64:i8mm:+i8mm",
"x86_64:avx512vnni:" + ",".join(X86_64_AVX512_VNNI),
] if lhs_rhs_type == "i8" and acc_type == "i32" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
] if lhs_rhs_type == "f32" and acc_type == "f32" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"arm_64:fullfp16:+fullfp16",
] if lhs_rhs_type == "f16" and acc_type == "f16" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"arm_64:fp16fml:+fp16fml",
] if lhs_rhs_type == "f16" and acc_type == "f32" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16),
"arm_64:bf16:+bf16",
] if lhs_rhs_type == "bf16" and acc_type == "bf16" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16),
"arm_64:bf16:+bf16",
] if lhs_rhs_type == "bf16" and acc_type == "f32" else []),
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for use_uk in [
False,
True,
] for (lhs_rhs_type, acc_type) in (
[
("i8", "i32"),
("f32", "f32"),
("f64", "f64"),
("f16", "f16"),
("f16", "f32"),
("bf16", "bf16"),
("bf16", "f32"),
]
)]
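
# Example expanded names: e2e_matmul_cpu_dt_f32_f32 (data-tiling, no
# microkernels) and e2e_matmul_cpu_dt_uk_f32_f32 (data-tiling + microkernels).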
# LLVMCPU, data-tiling, data-tiling + ukernels + late materialization.
[iree_generated_e2e_runner_test(
name = "e2e_matmul_cpu_experimental_dt%s_%s_%s" % (
("_uk" if use_uk else ""),
lhs_rhs_type,
acc_type,
),
    compiler_flags = [
        "--iree-opt-data-tiling",
        "--iree-global-opt-enable-early-materialization=false",
        "--iree-llvmcpu-enable-ukernels=%s" % ("all" if use_uk else "none"),
    ],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
],
tags = ([
# f16/bf16 trigger internal LLVM assertion errors on riscv and wasm.
"noriscv",
"nowasm",
] if (lhs_rhs_type == "f16" or lhs_rhs_type == "bf16") else []),
target_backends_and_drivers = [
("llvm-cpu", "local-task"),
],
target_cpu_features_variants = ["generic"] +
([
"arm_64:dotprod:+dotprod",
"arm_64:i8mm:+i8mm",
"x86_64:avx512vnni:" + ",".join(X86_64_AVX512_VNNI),
] if lhs_rhs_type == "i8" and acc_type == "i32" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
] if lhs_rhs_type == "f32" and acc_type == "f32" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"arm_64:fullfp16:+fullfp16",
] if lhs_rhs_type == "f16" and acc_type == "f16" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"arm_64:fp16fml:+fp16fml",
] if lhs_rhs_type == "f16" and acc_type == "f32" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16),
"arm_64:bf16:+bf16",
] if lhs_rhs_type == "bf16" and acc_type == "bf16" else [
"x86_64:avx2:" + ",".join(X86_64_AVX2),
"x86_64:avx512:" + ",".join(X86_64_AVX512),
"x86_64:avx512bf16:" + ",".join(X86_64_AVX512_BF16),
"arm_64:bf16:+bf16",
] if lhs_rhs_type == "bf16" and acc_type == "f32" else []),
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for use_uk in [
False,
True,
] for (lhs_rhs_type, acc_type) in (
[
("i8", "i32"),
("f32", "f32"),
("f16", "f16"),
("f16", "f32"),
("bf16", "bf16"),
("bf16", "f32"),
]
)]
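
# VMVX, data-tiling + late materialization.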
[iree_generated_e2e_runner_test(
name = "e2e_matmul_vmvx_experimental_dt%s_%s_%s" % (
("_uk" if use_uk else ""),
lhs_rhs_type,
acc_type,
),
    compiler_flags = [
        "--iree-opt-data-tiling",
        "--iree-global-opt-enable-early-materialization=false",
    ] + (["--iree-vmvx-enable-microkernels"] if use_uk else []),
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
"--shapes=small",
],
tags = [],
target_backends_and_drivers = [
("vmvx", "local-task"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for use_uk in [
False,
True,
] for (lhs_rhs_type, acc_type) in (
[
("f32", "f32"),
]
)]
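
# CUDA, data-tiling + late materialization.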
[iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_experimental_dt_%s_%s" % (
lhs_rhs_type,
acc_type,
),
compiler_flags = [
"--iree-opt-data-tiling",
"--iree-global-opt-enable-early-materialization=false",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
"--shapes=small",
],
tags = [],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for (lhs_rhs_type, acc_type) in (
[
("f32", "f32"),
]
)]
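
# SPIR-V, data-tiling + late materialization.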
[iree_generated_e2e_runner_test(
name = "e2e_matmul_spirv_experimental_dt_%s_%s" % (
lhs_rhs_type,
acc_type,
),
compiler_flags = [
"--iree-opt-data-tiling",
"--iree-global-opt-enable-early-materialization=false",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
"--shapes=small",
],
tags = [],
target_backends_and_drivers = [
("vulkan-spirv", "vulkan"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for (lhs_rhs_type, acc_type) in (
[
("f32", "f32"),
]
)]
###########################################################################
##
## VMVX backend
##
###########################################################################
# VMVX, data-tiling + microkernels.
[iree_generated_e2e_runner_test(
name = "e2e_matmul_vmvx_dt_uk_%s_small" % lhs_rhs_type,
compiler_flags = [
"--iree-vmvx-enable-microkernels",
"--iree-opt-data-tiling",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
"--shapes=small",
],
target_backends_and_drivers = [
("vmvx", "local-task"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for (lhs_rhs_type, acc_type) in [
("i8", "i32"),
("f32", "f32"),
]]
###########################################################################
##
## CUDA backend
##
###########################################################################
iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_f32_large_simt",
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=f32",
"--acc_type=f32",
"--shapes=easy_large_static",
"--compilation_info=LLVMGPUMatmulSimt",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-nvidia",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
)
# Testing Ampere + TensorCore path.
# WMMA TensorCore(F32): wmma.161616.f32.tf32
iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_f32_large_tensorcore",
compiler_flags = [
"--iree-cuda-target=sm_80",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=f32",
"--acc_type=f32",
"--shapes=easy_large_static",
"--compilation_info=LLVMGPUMatmulTensorCore",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-sm80",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
)
iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_f32",
compiler_flags = [
"--iree-cuda-target=sm_80",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=f32",
"--acc_type=f32",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-sm80",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
)
iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_f16",
compiler_flags = [
"--iree-cuda-target=sm_80",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=f16",
"--acc_type=f32",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-sm80",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
)
# MMA.SYNC TensorCore(F32): mma.sync.1688.f32.tf32
iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_f32_large_mma_sync",
compiler_flags = [
"--iree-cuda-target=sm_80",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=f32",
"--acc_type=f32",
"--shapes=easy_large_static",
"--compilation_info=LLVMGPUMatmulTensorCoreMmaSync",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-sm80",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
)
# WMMA TensorCore(F16): wmma.161616.f16.f16
iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_f16_large_tensorcore",
compiler_flags = [
"--iree-cuda-target=sm_80",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=f16",
"--acc_type=f32",
"--shapes=easy_large_static",
"--compilation_info=LLVMGPUMatmulTensorCore",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-sm80",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
)
# MMA.SYNC TensorCore(F16): mma.sync.16816.f16.f16
iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_f16_large_mma_sync",
compiler_flags = [
"--iree-cuda-target=sm_80",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=f16",
"--acc_type=f32",
"--shapes=easy_large_static",
"--compilation_info=LLVMGPUMatmulTensorCoreMmaSync",
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-sm80",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
)
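
# Split-K: --iree-dispatch-creation-split-matmul-reduction=4 splits the K
# (reduction) dimension of each matmul into 4 partial reductions that are
# combined afterwards, exercising the split-reduction path.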
[iree_generated_e2e_runner_test(
name = "e2e_matmul_cuda_%s_splitk" % lhs_rhs_type,
compiler_flags = [
"--iree-dispatch-creation-split-matmul-reduction=4",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
],
tags = [
# CUDA cuInit fails with sanitizer on.
"noasan",
"nomsan",
"notsan",
"noubsan",
"requires-gpu-nvidia",
],
target_backends_and_drivers = [
("cuda", "cuda"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for (lhs_rhs_type, acc_type) in [
("f32", "f32"),
]]
###########################################################################
##
## Vulkan backend
##
###########################################################################
[iree_generated_e2e_runner_test(
name = "e2e_matmul_vulkan_{0}_large_valhall".format(lhs_rhs_type),
compiler_flags = [
"--iree-vulkan-target=valhall",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
"--shapes=easy_large_static",
"--compilation_info=SPIRVVectorizeMali",
],
tags = [
        # Nvidia GPUs support a superset of Valhall features.
"requires-gpu-nvidia",
"vulkan_uses_vk_khr_shader_float16_int8",
],
target_backends_and_drivers = [
("vulkan-spirv", "vulkan"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for (lhs_rhs_type, acc_type) in [
("i8", "i32"),
("f16", "f32"),
("f32", "f32"),
]]
[iree_generated_e2e_runner_test(
name = "e2e_matmul_vulkan_{0}_large_ampere".format(lhs_rhs_type),
compiler_flags = [
"--iree-vulkan-target=ampere",
],
generator = ":generate_e2e_matmul_tests",
generator_args = [
"--lhs_rhs_type=%s" % lhs_rhs_type,
"--acc_type=%s" % acc_type,
"--shapes=easy_large_static",
"--compilation_info=SPIRVVectorizeNVIDIA",
],
tags = [
"requires-gpu-sm80",
"vulkan_uses_vk_khr_shader_float16_int8",
],
target_backends_and_drivers = [
("vulkan-spirv", "vulkan"),
],
test_runner = "//tools/testing/e2e:iree-e2e-matmul-test",
test_type = "matmul",
) for (lhs_rhs_type, acc_type) in [
("i8", "i32"),
("f16", "f32"),
("f32", "f32"),
]]
# TODO(#19465): add large matmul tests for rdna3.