Simplify tests/e2e/tensor_ops. (#17854)

Forked from https://github.com/iree-org/iree/pull/17766

* ~~Move `tests/e2e/tensor_ops/pack_i8.mlir` to
`tests/e2e/cpu_specific/pack_i8_vnni.mlir` and add a comment explaining
what is VNNI-specific about it~~
* Enable `pack_i8`, `tensor_cast`, and `unpack` tests on CUDA
* Add Metal and ROCm/HIP test suites to Bazel and CMake, and remove the
hand-written `check_hip_stream` suite from `CMakeLists.txt`
* Share test `srcs` lists between:
  * CPU and VMVX
  * Vulkan and Metal (both using SPIR-V codegen)
  * CUDA and ROCm (both using LLVMGPU codegen)
diff --git a/tests/e2e/tensor_ops/BUILD.bazel b/tests/e2e/tensor_ops/BUILD.bazel
index 35500dc..4b164e0 100644
--- a/tests/e2e/tensor_ops/BUILD.bazel
+++ b/tests/e2e/tensor_ops/BUILD.bazel
@@ -12,48 +12,34 @@
     licenses = ["notice"],  # Apache 2.0
 )
 
+ALL_SRCS = enforce_glob(
+    # keep sorted
+    [
+        "collapse_shape.mlir",
+        "concat.mlir",
+        "expand_shape.mlir",
+        "extract_slice.mlir",
+        "pack.mlir",
+        "pack_dynamic_inner_tiles.mlir",
+        "pack_i8.mlir",
+        "tensor_cast.mlir",
+        "tensor_insert_slice.mlir",
+        "unpack.mlir",
+    ],
+    include = ["*.mlir"],
+    exclude = [],
+)
+
 iree_check_single_backend_test_suite(
     name = "check_llvm-cpu_local-task",
-    srcs = enforce_glob(
-        # keep sorted
-        [
-            "collapse_shape.mlir",
-            "concat.mlir",
-            "expand_shape.mlir",
-            "extract_slice.mlir",
-            "pack.mlir",
-            "pack_dynamic_inner_tiles.mlir",
-            "pack_i8.mlir",
-            "tensor_cast.mlir",
-            "tensor_insert_slice.mlir",
-            "unpack.mlir",
-        ],
-        include = ["*.mlir"],
-        exclude = [],
-    ),
+    srcs = ALL_SRCS,
     driver = "local-task",
     target_backend = "llvm-cpu",
 )
 
 iree_check_single_backend_test_suite(
     name = "check_vmvx_local-task",
-    srcs = enforce_glob(
-        # keep sorted
-        [
-            "collapse_shape.mlir",
-            "concat.mlir",
-            "expand_shape.mlir",
-            "extract_slice.mlir",
-            "pack.mlir",
-            "pack_dynamic_inner_tiles.mlir",
-            "pack_i8.mlir",
-            "tensor_cast.mlir",
-            "tensor_insert_slice.mlir",
-            "unpack.mlir",
-        ],
-        include = ["*.mlir"],
-        exclude = [],
-    ),
+    srcs = ALL_SRCS,
     driver = "local-task",
     target_backend = "vmvx",
 )
@@ -79,26 +65,28 @@
     target_backend = "vmvx",
 )
 
+LLVM_GPU_SRCS = enforce_glob(
+    # keep sorted
+    [
+        "collapse_shape.mlir",
+        "concat.mlir",
+        "expand_shape.mlir",
+        "extract_slice.mlir",
+        "pack.mlir",
+        "pack_i8.mlir",
+        "tensor_cast.mlir",
+        "tensor_insert_slice.mlir",
+        "unpack.mlir",
+    ],
+    include = ["*.mlir"],
+    exclude = [
+        "pack_dynamic_inner_tiles.mlir",
+    ],
+)
+
 iree_check_single_backend_test_suite(
     name = "check_cuda",
-    srcs = enforce_glob(
-        # keep sorted
-        [
-            "collapse_shape.mlir",
-            "concat.mlir",
-            "expand_shape.mlir",
-            "extract_slice.mlir",
-            "pack.mlir",
-            "tensor_insert_slice.mlir",
-        ],
-        include = ["*.mlir"],
-        exclude = [
-            "pack_dynamic_inner_tiles.mlir",
-            "pack_i8.mlir",
-            "tensor_cast.mlir",
-            "unpack.mlir",
-        ],
-    ),
+    srcs = LLVM_GPU_SRCS,
     driver = "cuda",
     tags = [
         "noasan",
@@ -111,25 +99,41 @@
 )
 
 iree_check_single_backend_test_suite(
+    name = "check_rocm_hip",
+    srcs = LLVM_GPU_SRCS,
+    driver = "hip",
+    target_backend = "rocm",
+)
+
+SPIRV_SRCS = enforce_glob(
+    # keep sorted
+    [
+        "collapse_shape.mlir",
+        "concat.mlir",
+        "expand_shape.mlir",
+        "extract_slice.mlir",
+        "tensor_cast.mlir",
+        "tensor_insert_slice.mlir",
+    ],
+    include = ["*.mlir"],
+    exclude = [
+        "pack.mlir",
+        "pack_i8.mlir",
+        "pack_dynamic_inner_tiles.mlir",
+        "unpack.mlir",
+    ],
+)
+
+iree_check_single_backend_test_suite(
+    name = "check_metal-spirv_metal",
+    srcs = SPIRV_SRCS,
+    driver = "metal",
+    target_backend = "metal-spirv",
+)
+
+iree_check_single_backend_test_suite(
     name = "check_vulkan-spirv_vulkan",
-    srcs = enforce_glob(
-        # keep sorted
-        [
-            "collapse_shape.mlir",
-            "concat.mlir",
-            "expand_shape.mlir",
-            "extract_slice.mlir",
-            "tensor_cast.mlir",
-            "tensor_insert_slice.mlir",
-        ],
-        include = ["*.mlir"],
-        exclude = [
-            "pack.mlir",
-            "pack_dynamic_inner_tiles.mlir",
-            "pack_i8.mlir",
-            "unpack.mlir",
-        ],
-    ),
+    srcs = SPIRV_SRCS,
     driver = "vulkan",
     target_backend = "vulkan-spirv",
 )
diff --git a/tests/e2e/tensor_ops/CMakeLists.txt b/tests/e2e/tensor_ops/CMakeLists.txt
index c3c0a51..4a6f718 100644
--- a/tests/e2e/tensor_ops/CMakeLists.txt
+++ b/tests/e2e/tensor_ops/CMakeLists.txt
@@ -75,7 +75,10 @@
     "expand_shape.mlir"
     "extract_slice.mlir"
     "pack.mlir"
+    "pack_i8.mlir"
+    "tensor_cast.mlir"
     "tensor_insert_slice.mlir"
+    "unpack.mlir"
   TARGET_BACKEND
     "cuda"
   DRIVER
@@ -90,6 +93,41 @@
 
 iree_check_single_backend_test_suite(
   NAME
+    check_rocm_hip
+  SRCS
+    "collapse_shape.mlir"
+    "concat.mlir"
+    "expand_shape.mlir"
+    "extract_slice.mlir"
+    "pack.mlir"
+    "pack_i8.mlir"
+    "tensor_cast.mlir"
+    "tensor_insert_slice.mlir"
+    "unpack.mlir"
+  TARGET_BACKEND
+    "rocm"
+  DRIVER
+    "hip"
+)
+
+iree_check_single_backend_test_suite(
+  NAME
+    check_metal-spirv_metal
+  SRCS
+    "collapse_shape.mlir"
+    "concat.mlir"
+    "expand_shape.mlir"
+    "extract_slice.mlir"
+    "tensor_cast.mlir"
+    "tensor_insert_slice.mlir"
+  TARGET_BACKEND
+    "metal-spirv"
+  DRIVER
+    "metal"
+)
+
+iree_check_single_backend_test_suite(
+  NAME
     check_vulkan-spirv_vulkan
   SRCS
     "collapse_shape.mlir"
@@ -105,34 +143,3 @@
 )
 
 ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
-
-if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx")
-
-unset(IREE_HIP_TEST_COMPILER_FLAGS)
-list(APPEND IREE_HIP_TEST_COMPILER_FLAGS
-  "--iree-rocm-target-chip=${IREE_HIP_TEST_TARGET_CHIP}"
-)
-
-iree_check_single_backend_test_suite(
-  NAME
-    check_hip_stream
-  SRCS
-    "pack.mlir"
-    "pack_i8.mlir"
-  TARGET_BACKEND
-    "rocm"
-  DRIVER
-    "hip"
-  COMPILER_FLAGS
-    ${IREE_HIP_TEST_COMPILER_FLAGS}
-  RUNNER_ARGS
-    "--hip_use_streams=true"
-  LABELS
-    "noasan"
-    "nomsan"
-    "notsan"
-    "noubsan"
-    "requires-gpu-amd"
-)
-
-endif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx")
diff --git a/tests/e2e/tensor_ops/pack_i8.mlir b/tests/e2e/tensor_ops/pack_i8.mlir
index f4519f6..cd80169 100644
--- a/tests/e2e/tensor_ops/pack_i8.mlir
+++ b/tests/e2e/tensor_ops/pack_i8.mlir
@@ -1,3 +1,6 @@
+// These i8 pack ops are seen in CPU codegen with AVX-512 Vector Neural Network Instructions (VNNI).
+// Other backends should still support the tile sizes and other parameters here.
+
 func.func private @generate_2D_source(%height : index, %width : index) -> tensor<?x?xi8> {
   %init_source = tensor.empty(%height, %width) : tensor<?x?xi8>
   %source = linalg.generic {