Fix benchmark presets for CUDA and vulkan-nvidia benchmarks (#13416)

diff --git a/build_tools/benchmarks/common/benchmark_definition.py b/build_tools/benchmarks/common/benchmark_definition.py
index dffde66..0d0b5d5 100644
--- a/build_tools/benchmarks/common/benchmark_definition.py
+++ b/build_tools/benchmarks/common/benchmark_definition.py
@@ -50,15 +50,24 @@
 
 # A map from GPU name to IREE's benchmark target architecture.
 GPU_NAME_TO_TARGET_ARCH_MAP = {
-    "adreno-640": common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
-    "adreno-650": common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
-    "adreno-660": common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
-    "adreno-730": common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
-    "mali-g77": common_definitions.DeviceArchitecture.ARM_VALHALL,
-    "mali-g78": common_definitions.DeviceArchitecture.ARM_VALHALL,
-    "tesla-v100-sxm2-16gb": common_definitions.DeviceArchitecture.CUDA_SM70,
-    "nvidia-a100-sxm4-40gb": common_definitions.DeviceArchitecture.CUDA_SM80,
-    "nvidia-geforce-rtx-3090": common_definitions.DeviceArchitecture.CUDA_SM80,
+    "adreno-640":
+        common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
+    "adreno-650":
+        common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
+    "adreno-660":
+        common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
+    "adreno-730":
+        common_definitions.DeviceArchitecture.QUALCOMM_ADRENO,
+    "mali-g77":
+        common_definitions.DeviceArchitecture.ARM_VALHALL,
+    "mali-g78":
+        common_definitions.DeviceArchitecture.ARM_VALHALL,
+    "tesla-v100-sxm2-16gb":
+        common_definitions.DeviceArchitecture.NVIDIA_PASCAL,
+    "nvidia-a100-sxm4-40gb":
+        common_definitions.DeviceArchitecture.NVIDIA_AMPERE,
+    "nvidia-geforce-rtx-3090":
+        common_definitions.DeviceArchitecture.NVIDIA_AMPERE,
 }
 
 
diff --git a/build_tools/benchmarks/export_benchmark_config.py b/build_tools/benchmarks/export_benchmark_config.py
index 33b32fc..c4c091c 100755
--- a/build_tools/benchmarks/export_benchmark_config.py
+++ b/build_tools/benchmarks/export_benchmark_config.py
@@ -50,12 +50,12 @@
     "x86_64":
         lambda config: config.target_device_spec.architecture.architecture ==
         "x86_64",
-    "cuda":
-        lambda config: config.target_device_spec.architecture.architecture ==
-        "cuda" and "long-running" not in config.tags,
+    "cuda": (lambda config: "cuda" in config.tags and "long-running" not in
+             config.tags),
     "cuda-long":
-        lambda config: config.target_device_spec.architecture.architecture ==
-        "cuda" and "long-running" in config.tags,
+        lambda config: "cuda" in config.tags and "long-running" in config.tags,
+    "vulkan-nvidia":
+        lambda config: "vulkan-nvidia" in config.tags,
     # TODO(#9855): Enable benchmarks on Pixel-6-Pro and XT2201-2.
     "experimental-android-cpu":
         lambda config:
diff --git a/build_tools/github_actions/configure_ci.py b/build_tools/github_actions/configure_ci.py
index 51f91fa..07dbc42 100755
--- a/build_tools/github_actions/configure_ci.py
+++ b/build_tools/github_actions/configure_ci.py
@@ -77,7 +77,7 @@
 RUNNER_ENV_DEFAULT = "prod"
 RUNNER_ENV_OPTIONS = [RUNNER_ENV_DEFAULT, "testing"]
 
-DEFAULT_BENCHMARK_PRESETS = ["cuda", "x86_64", "comp-stats"]
+DEFAULT_BENCHMARK_PRESETS = ["cuda", "x86_64", "vulkan-nvidia", "comp-stats"]
 BENCHMARK_PRESET_OPTIONS = DEFAULT_BENCHMARK_PRESETS + [
     "experimental-android-cpu"
 ]
diff --git a/build_tools/python/benchmark_suites/iree/cuda_benchmarks.py b/build_tools/python/benchmark_suites/iree/cuda_benchmarks.py
index 2aa7b89..08c1cb4 100644
--- a/build_tools/python/benchmark_suites/iree/cuda_benchmarks.py
+++ b/build_tools/python/benchmark_suites/iree/cuda_benchmarks.py
@@ -73,18 +73,21 @@
     """Generates IREE compile and run configs."""
     gen_configs, run_configs = self._generate_configs(model_groups.CUDA_MODELS,
                                                       self.SM_80_COMPILE_CONFIG)
+    # Run configs need the `cuda` tag to be included in the `cuda` preset.
     ubench_gen_configs, ubench_run_configs = self._generate_configs(
         model_groups.MICRO_MATMUL,
         self.SM_80_UBENCH_MATMUL_COMPILE_CONFIG,
-        execution_config=module_execution_configs.CUDA_BATCH_SIZE_100_CONFIG)
+        execution_config=module_execution_configs.CUDA_BATCH_SIZE_100_CONFIG,
+        run_tags=["cuda"])
     ubench_splitk_gen_configs, ubench_splitk_run_configs = self._generate_configs(
         model_groups.MICRO_MATMUL_SPLITK,
         self.SM_80_UBENCH_MATMUL_SPLITK_COMPILE_CONFIG,
-        execution_config=module_execution_configs.CUDA_BATCH_SIZE_100_CONFIG)
+        execution_config=module_execution_configs.CUDA_BATCH_SIZE_100_CONFIG,
+        run_tags=["cuda"])
     long_running_gen_configs, long_running_module_configs = self._generate_configs(
         model_groups.CUDA_MODELS_LONG,
         self.SM_80_COMPILE_CONFIG,
-        run_tags=["long-running"])
+        run_tags=["cuda", "long-running"])
     return (gen_configs + ubench_gen_configs + ubench_splitk_gen_configs +
             long_running_gen_configs, run_configs + ubench_run_configs +
             ubench_splitk_run_configs + long_running_module_configs)
diff --git a/build_tools/python/benchmark_suites/iree/vulkan_nvidia_benchmarks.py b/build_tools/python/benchmark_suites/iree/vulkan_nvidia_benchmarks.py
index e1a3fc6..fc9c764 100644
--- a/build_tools/python/benchmark_suites/iree/vulkan_nvidia_benchmarks.py
+++ b/build_tools/python/benchmark_suites/iree/vulkan_nvidia_benchmarks.py
@@ -88,9 +88,15 @@
   ) -> Tuple[List[iree_definitions.ModuleGenerationConfig],
              List[iree_definitions.E2EModelRunConfig]]:
     """Generates IREE compile and run configs."""
+    # Run configs need the `vulkan-nvidia` tag to be included in the
+    # `vulkan-nvidia` benchmark preset.
     tensorcore_gen_configs, tensorcore_run_configs = self._generate_configs(
-        model_groups.VULKAN_MODELS, self.TENSORCORE_COMPILE_CONFIG)
+        model_groups.VULKAN_MODELS,
+        self.TENSORCORE_COMPILE_CONFIG,
+        run_tags=["vulkan-nvidia"])
     simt_gen_configs, simt_run_configs = self._generate_configs(
-        model_groups.VULKAN_MODELS, self.SIMT_COMPILE_CONFIG)
+        model_groups.VULKAN_MODELS,
+        self.SIMT_COMPILE_CONFIG,
+        run_tags=["vulkan-nvidia"])
     return (tensorcore_gen_configs + simt_gen_configs,
             tensorcore_run_configs + simt_run_configs)