Create platform-specific targets in benchmark suites (#9237)

diff --git a/benchmarks/TFLite/CMakeLists.txt b/benchmarks/TFLite/CMakeLists.txt
index bbaaa71..a425f88 100644
--- a/benchmarks/TFLite/CMakeLists.txt
+++ b/benchmarks/TFLite/CMakeLists.txt
@@ -136,670 +136,7 @@
 ################################################################################
 # Add benchmarks for all platforms.                                            #
 ################################################################################
+include(android-arm64-v8a.cmake)
+include(android-adreno.cmake)
+include(android-mali.cmake)
 include(linux-x86_64.cmake)
-
-################################################################################
-#                                                                              #
-# Default benchmark configurations                                             #
-#                                                                              #
-# Each suite benchmarks a list of modules with configurations specifying a     #
-# target architecture and runtime characteristics (e.g. threads/cores). These  #
-# benchmarks only configure IREE translation and runtime flags for the target  #
-# architecture and do *not* include any non-default flags. No non-default      #
-# flags should be added here.                                                  #
-#                                                                              #
-################################################################################
-
-set(ANDROID_CPU_TRANSLATION_FLAGS
-  "--iree-input-type=tosa"
-  "--iree-llvm-target-triple=aarch64-none-linux-android29")
-set(ANDROID_ADRENO_GPU_TRANSLATION_FLAGS
-  "--iree-input-type=tosa"
-  "--iree-vulkan-target-triple=adreno-unknown-android11"
-)
-set(ANDROID_MALI_GPU_TRANSLATION_FLAGS
-  "--iree-input-type=tosa"
-  "--iree-vulkan-target-triple=valhall-unknown-android11"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILEBERT_INT8_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "big-core,full-inference,default-flags"
-    "little-core,full-inference,default-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib-sync"
-)
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILEBERT_INT8_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "1-thread,big-core,full-inference,default-flags"
-    "1-thread,little-core,full-inference,default-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=1"
-)
-
-# TODO(#7792): Re-enable these when we are able to run different benchmarks
-# depending on use-case (presubmit, postsubmit, nightly, etc.)
-# iree_benchmark_suite(
-#   MODULES
-#     "${DEEPLABV3_FP32_MODULE}"
-#     "${MOBILESSD_FP32_MODULE}"
-#     "${POSENET_FP32_MODULE}"
-#     "${MOBILEBERT_FP32_MODULE}"
-#     "${MOBILENET_V2_MODULE}"
-#     "${MOBILENET_V3SMALL_MODULE}"
-
-#   BENCHMARK_MODES
-#     "2-thread,big-core,full-inference,default-flags"
-#     "2-thread,little-core,full-inference,default-flags"
-#   TARGET_BACKEND
-#     "dylib-llvm-aot"
-#   TARGET_ARCHITECTURE
-#     "CPU-ARM64-v8A"
-#   TRANSLATION_FLAGS
-#     ${ANDROID_CPU_TRANSLATION_FLAGS}
-#   BENCHMARK_TOOL
-#     iree-benchmark-module
-#   DRIVER
-#     "dylib"
-#   RUNTIME_FLAGS
-#     "--task_topology_group_count=2"
-# )
-
-# iree_benchmark_suite(
-#   MODULES
-#     "${DEEPLABV3_FP32_MODULE}"
-#     "${MOBILESSD_FP32_MODULE}"
-#     "${POSENET_FP32_MODULE}"
-#     "${MOBILEBERT_FP32_MODULE}"
-#     "${MOBILENET_V2_MODULE}"
-#     "${MOBILENET_V3SMALL_MODULE}"
-
-#   BENCHMARK_MODES
-#     "3-thread,big-core,full-inference,default-flags"
-#     "3-thread,little-core,full-inference,default-flags"
-#   TARGET_BACKEND
-#     "dylib-llvm-aot"
-#   TARGET_ARCHITECTURE
-#     "CPU-ARM64-v8A"
-#   TRANSLATION_FLAGS
-#     ${ANDROID_CPU_TRANSLATION_FLAGS}
-#   BENCHMARK_TOOL
-#     iree-benchmark-module
-#   DRIVER
-#     "dylib"
-#   RUNTIME_FLAGS
-#     "--task_topology_group_count=3"
-# )
-
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILEBERT_INT8_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "4-thread,big-core,full-inference,default-flags"
-    "4-thread,little-core,full-inference,default-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=4"
-)
-
-# GPU, Vulkan, Adreno, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,default-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Adreno"
-  TRANSLATION_FLAGS
-    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,default-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP16_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,default-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
-    # This isn't a special optimization flag. It's so we can reuse the same f32
-    # model file. See comments on MOBILEBERT_FP16_MODULE
-    "--iree-flow-demote-f32-to-f16"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-)
-
-################################################################################
-
-################################################################################
-#                                                                              #
-# Specialized benchmark configurations                                         #
-#                                                                              #
-# Each suite benchmarks one or more module with configurations that can vary   #
-# on model or architecture characteristics. These are intended for providing   #
-# continuous benchmarks of experimental features that cannot be turned on by   #
-# default yet. It is primarily intended for whoever is actively investigating  #
-# optimizations for a feature exemplified in a specific model or architecture. #
-# Due to our current benchmark setup, there can only be one experimental       #
-# configuration per model and other benchmark mode.                            #
-#                                                                              #
-################################################################################
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
-# At the moment we use that for fp32 models. We would change that when new
-# devices support relevant fp32 SIMD extensions beyond that (e.g. +f32mm).
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "big-core,full-inference,experimental-flags"
-    "little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-mmt4d-target-options=arch=aarch64"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib-sync"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
-# NOTE: +dotprod is only relevant to int8, not fp32.
-# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
-# kernel is currently naive, not ready for benchmarking.
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_INT8_MODULE}"
-
-  BENCHMARK_MODES
-    "big-core,full-inference,experimental-flags"
-    "little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
-    "--iree-llvm-target-cpu-features=+dotprod"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib-sync"
-)
-
-# TODO(#7792): Consider re-enabling little-core experimental-flags if we start
-# optimizing for little cores or we can just run them occasionally
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
-# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
-# At the moment we use that for fp32 models. We would change that when new
-# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "1-thread,big-core,full-inference,experimental-flags"
-    # "1-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-mmt4d-target-options=arch=aarch64"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=1"
-)
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference, +dotprod
-# NOTE: +dotprod is only relevant to int8, not fp32.
-# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
-# kernel is currently naive, not ready for benchmarking.
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_INT8_MODULE}"
-
-  BENCHMARK_MODES
-    "1-thread,big-core,full-inference,experimental-flags"
-    # "1-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
-    "--iree-llvm-target-cpu-features=+dotprod"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=1"
-)
-
-# TODO(#7792): Re-enable these when we are able to run different benchmarks
-# depending on use-case (presubmit, postsubmit, nightly, etc.)
-# iree_benchmark_suite(
-#   MODULES
-#     "${DEEPLABV3_FP32_MODULE}"
-#     "${MOBILESSD_FP32_MODULE}"
-#     "${POSENET_FP32_MODULE}"
-#     "${MOBILEBERT_FP32_MODULE}"
-#     "${MOBILENET_V2_MODULE}"
-#     "${MOBILENET_V3SMALL_MODULE}"
-
-#   BENCHMARK_MODES
-#     "2-thread,big-core,full-inference,experimental-flags"
-#     "2-thread,little-core,full-inference,experimental-flags"
-#   TARGET_BACKEND
-#     "dylib-llvm-aot"
-#   TARGET_ARCHITECTURE
-#     "CPU-ARM64-v8A"
-#   TRANSLATION_FLAGS
-#     ${ANDROID_CPU_TRANSLATION_FLAGS}
-#     "--iree-flow-mmt4d-target-options=arch=aarch64"
-#   BENCHMARK_TOOL
-#     iree-benchmark-module
-#   DRIVER
-#     "dylib"
-#   RUNTIME_FLAGS
-#     "--task_topology_group_count=2"
-# )
-
-# iree_benchmark_suite(
-#   MODULES
-#   "${DEEPLABV3_FP32_MODULE}"
-#   "${MOBILESSD_FP32_MODULE}"
-#   "${POSENET_FP32_MODULE}"
-#   "${MOBILEBERT_FP32_MODULE}"
-#   "${MOBILENET_V2_MODULE}"
-#   "${MOBILENET_V3SMALL_MODULE}"
-
-#   BENCHMARK_MODES
-#     "3-thread,big-core,full-inference,experimental-flags"
-#     "3-thread,little-core,full-inference,experimental-flags"
-#   TARGET_BACKEND
-#     "dylib-llvm-aot"
-#   TARGET_ARCHITECTURE
-#     "CPU-ARM64-v8A"
-#   TRANSLATION_FLAGS
-#     ${ANDROID_CPU_TRANSLATION_FLAGS}
-#     "--iree-flow-mmt4d-target-options=arch=aarch64"
-#   BENCHMARK_TOOL
-#     iree-benchmark-module
-#   DRIVER
-#     "dylib"
-#   RUNTIME_FLAGS
-#     "--task_topology_group_count=3"
-# )
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
-# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
-# At the moment we use that for fp32 models. We would change that when new
-# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "4-thread,big-core,full-inference,experimental-flags"
-    # "4-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-mmt4d-target-options=arch=aarch64"
-
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=4"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
-# NOTE: +dotprod is only relevant to int8, not fp32.
-# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
-# kernel is currently naive, not ready for benchmarking.
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_INT8_MODULE}"
-
-  BENCHMARK_MODES
-    "4-thread,big-core,full-inference,experimental-flags"
-    # "4-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    ${ANDROID_CPU_TRANSLATION_FLAGS}
-    "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
-    "--iree-llvm-target-cpu-features=+dotprod"
-
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=4"
-)
-
-# CPU, VMVX, 4-thread, big-core, full-inference
-# VMVX is slow and we're not optimizing perf yet. Leaving in a single max-thread
-# benchmark because it's useful to keep an eye on and helps disambiguate where a
-# performance change may be coming from (e.g. if it's in vmvx as well, it's
-# probably not a codegen issue).
-iree_benchmark_suite(
-  MODULES
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "4-thread,big-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vmvx"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    "--iree-input-type=tosa"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vmvx"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=4"
-)
-
-
-# GPU, Vulkan, Adreno, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Adreno"
-  TRANSLATION_FLAGS
-    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
-    "--iree-flow-enable-fuse-padding-into-consumer-ops"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
-    "--iree-flow-enable-fuse-padding-into-consumer-ops"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-)
-
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP16_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    "--iree-input-type=tosa"
-    "--iree-flow-demote-f32-to-f16"
-    "--iree-vulkan-target-triple=valhall-unknown-android11"
-    "--iree-flow-enable-fuse-padding-into-consumer-ops"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-)
-
-# kernel-execution
-
-# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
-# low enough that the whole dispatch completes within an OS-specific timeout.
-# Otherwise you'll get error like:
-# ```
-# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
-# hal.ex.submit_and_wait; while calling import;
-# ```
-# With current kernel performance and timeouts on Adreno Pixel 4, this means we
-# have no kernel benchmark for the DeepLabV3 and MobileBert models
-# TODO: Add kernel-execution config for DEEPLABV3_FP32_MODULE and
-# MOBILEBERT_FP32_MODULE when they can run with at least 8 repetitions.
-
-# GPU, Vulkan, Adreno, kernel-execution
-iree_benchmark_suite(
-  MODULES
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "kernel-execution,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Adreno"
-  TRANSLATION_FLAGS
-    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
-    "--iree-flow-enable-fuse-padding-into-consumer-ops"
-    "--iree-hal-benchmark-dispatch-repeat-count=16"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-  RUNTIME_FLAGS
-    "--batch_size=16"
-)
-
-# GPU, Vulkan, Mali, kernel-execution
-iree_benchmark_suite(
-  MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "kernel-execution,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
-    "--iree-flow-enable-fuse-padding-into-consumer-ops"
-    "--iree-hal-benchmark-dispatch-repeat-count=32"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-  RUNTIME_FLAGS
-    "--batch_size=32"
-)
-
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP16_MODULE}"
-
-  BENCHMARK_MODES
-    "kernel-execution,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    "--iree-input-type=tosa"
-    "--iree-flow-demote-f32-to-f16"
-    "--iree-vulkan-target-triple=valhall-unknown-android11"
-    "--iree-flow-enable-fuse-padding-into-consumer-ops"
-    "--iree-hal-benchmark-dispatch-repeat-count=32"
-  BENCHMARK_TOOL
-    iree-benchmark-module
-  DRIVER
-    "vulkan"
-  RUNTIME_FLAGS
-    "--batch_size=32"
-)
diff --git a/benchmarks/TFLite/android-adreno.cmake b/benchmarks/TFLite/android-adreno.cmake
new file mode 100644
index 0000000..1a0e0d0
--- /dev/null
+++ b/benchmarks/TFLite/android-adreno.cmake
@@ -0,0 +1,138 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+################################################################################
+#                                                                              #
+# Default benchmark configurations                                             #
+#                                                                              #
+# Each suite benchmarks a list of modules with configurations specifying a     #
+# target architecture and runtime characteristics (e.g. threads/cores). These  #
+# benchmarks only configure IREE translation and runtime flags for the target  #
+# architecture and do *not* include any non-default flags. No non-default      #
+# flags should be added here.                                                  #
+#                                                                              #
+################################################################################
+
+set(ANDROID_ADRENO_GPU_TRANSLATION_FLAGS
+  "--iree-input-type=tosa"
+  "--iree-vulkan-target-triple=adreno-unknown-android11"
+)
+
+# GPU, Vulkan, Adreno, full-inference
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-adreno"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,default-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Adreno"
+  TRANSLATION_FLAGS
+    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+)
+
+################################################################################
+
+################################################################################
+#                                                                              #
+# Specialized benchmark configurations                                         #
+#                                                                              #
+# Each suite benchmarks one or more modules with configurations that can vary  #
+# on model or architecture characteristics. These are intended for providing   #
+# continuous benchmarks of experimental features that cannot be turned on by   #
+# default yet. It is primarily intended for whoever is actively investigating  #
+# optimizations for a feature exemplified in a specific model or architecture. #
+# Due to our current benchmark setup, there can only be one experimental       #
+# configuration per model and other benchmark mode.                            #
+#                                                                              #
+################################################################################
+
+# GPU, Vulkan, Adreno, full-inference
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-adreno"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Adreno"
+  TRANSLATION_FLAGS
+    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+    "--iree-flow-enable-fuse-padding-into-consumer-ops"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+)
+
+# kernel-execution
+
+# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
+# low enough that the whole dispatch completes within an OS-specific timeout.
+# Otherwise you'll get an error like:
+# ```
+# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
+# hal.ex.submit_and_wait; while calling import;
+# ```
+# With current kernel performance and timeouts on Adreno Pixel 4, this means we
+# have no kernel benchmark for the DeepLabV3 and MobileBert models
+# TODO: Add kernel-execution config for DEEPLABV3_FP32_MODULE and
+# MOBILEBERT_FP32_MODULE when they can run with at least 8 repetitions.
+
+# GPU, Vulkan, Adreno, kernel-execution
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-adreno"
+
+  MODULES
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "kernel-execution,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Adreno"
+  TRANSLATION_FLAGS
+    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+    "--iree-flow-enable-fuse-padding-into-consumer-ops"
+    "--iree-hal-benchmark-dispatch-repeat-count=16"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+  RUNTIME_FLAGS
+    "--batch_size=16"
+)
+
+################################################################################
diff --git a/benchmarks/TFLite/android-arm64-v8a.cmake b/benchmarks/TFLite/android-arm64-v8a.cmake
new file mode 100644
index 0000000..cd49abb
--- /dev/null
+++ b/benchmarks/TFLite/android-arm64-v8a.cmake
@@ -0,0 +1,473 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+################################################################################
+#                                                                              #
+# Default benchmark configurations                                             #
+#                                                                              #
+# Each suite benchmarks a list of modules with configurations specifying a     #
+# target architecture and runtime characteristics (e.g. threads/cores). These  #
+# benchmarks only configure IREE translation and runtime flags for the target  #
+# architecture and do *not* include any non-default flags. No non-default      #
+# flags should be added here.                                                  #
+#                                                                              #
+################################################################################
+
+set(ANDROID_CPU_TRANSLATION_FLAGS
+  "--iree-input-type=tosa"
+  "--iree-llvm-target-triple=aarch64-none-linux-android29")
+
+# CPU, Dylib-Sync, big/little-core, full-inference
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILEBERT_INT8_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "big-core,full-inference,default-flags"
+    "little-core,full-inference,default-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib-sync"
+)
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILEBERT_INT8_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "1-thread,big-core,full-inference,default-flags"
+    "1-thread,little-core,full-inference,default-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=1"
+)
+
+# TODO(#7792): Re-enable these when we are able to run different benchmarks
+# depending on use-case (presubmit, postsubmit, nightly, etc.)
+# iree_benchmark_suite(
+#   GROUP_NAME
+#     "android-arm64-v8a"
+#
+#   MODULES
+#     "${DEEPLABV3_FP32_MODULE}"
+#     "${MOBILESSD_FP32_MODULE}"
+#     "${POSENET_FP32_MODULE}"
+#     "${MOBILEBERT_FP32_MODULE}"
+#     "${MOBILENET_V2_MODULE}"
+#     "${MOBILENET_V3SMALL_MODULE}"
+
+#   BENCHMARK_MODES
+#     "2-thread,big-core,full-inference,default-flags"
+#     "2-thread,little-core,full-inference,default-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=2"
+# )
+
+# iree_benchmark_suite(
+#   GROUP_NAME
+#     "android-arm64-v8a"
+#
+#   MODULES
+#     "${DEEPLABV3_FP32_MODULE}"
+#     "${MOBILESSD_FP32_MODULE}"
+#     "${POSENET_FP32_MODULE}"
+#     "${MOBILEBERT_FP32_MODULE}"
+#     "${MOBILENET_V2_MODULE}"
+#     "${MOBILENET_V3SMALL_MODULE}"
+
+#   BENCHMARK_MODES
+#     "3-thread,big-core,full-inference,default-flags"
+#     "3-thread,little-core,full-inference,default-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=3"
+# )
+
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILEBERT_INT8_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "4-thread,big-core,full-inference,default-flags"
+    "4-thread,little-core,full-inference,default-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=4"
+)
+
+################################################################################
+
+################################################################################
+#                                                                              #
+# Specialized benchmark configurations                                         #
+#                                                                              #
+# Each suite benchmarks one or more module with configurations that can vary   #
+# on model or architecture characteristics. These are intended for providing   #
+# continuous benchmarks of experimental features that cannot be turned on by   #
+# default yet. It is primarily intended for whoever is actively investigating  #
+# optimizations for a feature exemplified in a specific model or architecture. #
+# Due to our current benchmark setup, there can only be one experimental       #
+# configuration per model and other benchmark mode.                            #
+#                                                                              #
+################################################################################
+
+# CPU, Dylib-Sync, big/little-core, full-inference
+# NOTE: this is not enabling any SIMD extension beyond baseline AArch64.
+# At the moment we use that for fp32 models. We would change that when new
+# devices support relevant fp32 SIMD extensions beyond that (e.g. +f32mm).
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "big-core,full-inference,experimental-flags"
+    "little-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+    "--iree-flow-mmt4d-target-options=arch=aarch64"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib-sync"
+)
+
+# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
+# NOTE: +dotprod is only relevant to int8, not fp32.
+# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
+# kernel is currently naive, not ready for benchmarking.
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${MOBILEBERT_INT8_MODULE}"
+
+  BENCHMARK_MODES
+    "big-core,full-inference,experimental-flags"
+    "little-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+    "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
+    "--iree-llvm-target-cpu-features=+dotprod"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib-sync"
+)
+
+# TODO(#7792): Consider re-enabling little-core experimental-flags if we start
+# optimizing for little cores or we can just run them occasionally
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
+# NOTE: this is not enabling any SIMD extension beyond baseline AArch64.
+# At the moment we use that for fp32 models. We would change that when new
+# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "1-thread,big-core,full-inference,experimental-flags"
+    # "1-thread,little-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+    "--iree-flow-mmt4d-target-options=arch=aarch64"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=1"
+)
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference, +dotprod
+# NOTE: +dotprod is only relevant to int8, not fp32.
+# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
+# kernel is currently naive, not ready for benchmarking.
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${MOBILEBERT_INT8_MODULE}"
+
+  BENCHMARK_MODES
+    "1-thread,big-core,full-inference,experimental-flags"
+    # "1-thread,little-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+    "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
+    "--iree-llvm-target-cpu-features=+dotprod"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=1"
+)
+
+# TODO(#7792): Re-enable these when we are able to run different benchmarks
+# depending on use-case (presubmit, postsubmit, nightly, etc.)
+# iree_benchmark_suite(
+#  GROUP_NAME
+#    "android-arm64-v8a"
+#
+#   MODULES
+#     "${DEEPLABV3_FP32_MODULE}"
+#     "${MOBILESSD_FP32_MODULE}"
+#     "${POSENET_FP32_MODULE}"
+#     "${MOBILEBERT_FP32_MODULE}"
+#     "${MOBILENET_V2_MODULE}"
+#     "${MOBILENET_V3SMALL_MODULE}"
+
+#   BENCHMARK_MODES
+#     "2-thread,big-core,full-inference,experimental-flags"
+#     "2-thread,little-core,full-inference,experimental-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#     "--iree-flow-mmt4d-target-options=arch=aarch64"
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=2"
+# )
+
+# iree_benchmark_suite(
+#  GROUP_NAME
+#    "android-arm64-v8a"
+#
+#   MODULES
+#   "${DEEPLABV3_FP32_MODULE}"
+#   "${MOBILESSD_FP32_MODULE}"
+#   "${POSENET_FP32_MODULE}"
+#   "${MOBILEBERT_FP32_MODULE}"
+#   "${MOBILENET_V2_MODULE}"
+#   "${MOBILENET_V3SMALL_MODULE}"
+
+#   BENCHMARK_MODES
+#     "3-thread,big-core,full-inference,experimental-flags"
+#     "3-thread,little-core,full-inference,experimental-flags"
+#   TARGET_BACKEND
+#     "dylib-llvm-aot"
+#   TARGET_ARCHITECTURE
+#     "CPU-ARM64-v8A"
+#   TRANSLATION_FLAGS
+#     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#     "--iree-flow-mmt4d-target-options=arch=aarch64"
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
+#   DRIVER
+#     "dylib"
+#   RUNTIME_FLAGS
+#     "--task_topology_group_count=3"
+# )
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
+# NOTE: this is not enabling any SIMD extension beyond baseline AArch64.
+# At the moment we use that for fp32 models. We would change that when new
+# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "4-thread,big-core,full-inference,experimental-flags"
+    # "4-thread,little-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+    "--iree-flow-mmt4d-target-options=arch=aarch64"
+
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=4"
+)
+
+# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
+# NOTE: +dotprod is only relevant to int8, not fp32.
+# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
+# kernel is currently naive, not ready for benchmarking.
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${MOBILEBERT_INT8_MODULE}"
+
+  BENCHMARK_MODES
+    "4-thread,big-core,full-inference,experimental-flags"
+    # "4-thread,little-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    ${ANDROID_CPU_TRANSLATION_FLAGS}
+    "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
+    "--iree-llvm-target-cpu-features=+dotprod"
+
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "dylib"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=4"
+)
+
+# CPU, VMVX, 4-thread, big-core, full-inference
+# VMVX is slow and we're not optimizing perf yet. Leaving in a single max-thread
+# benchmark because it's useful to keep an eye on and helps disambiguate where a
+# performance change may be coming from (e.g. if it's in vmvx as well, it's
+# probably not a codegen issue).
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-arm64-v8a"
+
+  MODULES
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "4-thread,big-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "vmvx"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    "--iree-input-type=tosa"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vmvx"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=4"
+)
+
+################################################################################
diff --git a/benchmarks/TFLite/android-mali.cmake b/benchmarks/TFLite/android-mali.cmake
new file mode 100644
index 0000000..f9cdb2f
--- /dev/null
+++ b/benchmarks/TFLite/android-mali.cmake
@@ -0,0 +1,212 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+################################################################################
+#                                                                              #
+# Default benchmark configurations                                             #
+#                                                                              #
+# Each suite benchmarks a list of modules with configurations specifying a     #
+# target architecture and runtime characteristics (e.g. threads/cores). These  #
+# benchmarks only configure IREE translation and runtime flags for the target  #
+# architecture and do *not* include any non-default flags. No non-default      #
+# flags should be added here.                                                  #
+#                                                                              #
+################################################################################
+
+set(ANDROID_MALI_GPU_TRANSLATION_FLAGS
+  "--iree-input-type=tosa"
+  "--iree-vulkan-target-triple=valhall-unknown-android11"
+)
+
+# GPU, Vulkan, Mali, full-inference
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-mali"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,default-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+)
+
+# GPU, Vulkan, Mali, full-inference
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-mali"
+
+  MODULES
+    "${MOBILEBERT_FP16_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,default-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+    # This isn't a special optimization flag. It's so we can reuse the same f32
+    # model file. See comments on MOBILEBERT_FP16_MODULE
+    "--iree-flow-demote-f32-to-f16"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+)
+
+################################################################################
+
+################################################################################
+#                                                                              #
+# Specialized benchmark configurations                                         #
+#                                                                              #
+# Each suite benchmarks one or more module with configurations that can vary   #
+# on model or architecture characteristics. These are intended for providing   #
+# continuous benchmarks of experimental features that cannot be turned on by   #
+# default yet. It is primarily intended for whoever is actively investigating  #
+# optimizations for a feature exemplified in a specific model or architecture. #
+# Due to our current benchmark setup, there can only be one experimental       #
+# configuration per model and other benchmark mode.                            #
+#                                                                              #
+################################################################################
+
+# GPU, Vulkan, Mali, full-inference
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-mali"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+    "--iree-flow-enable-fuse-padding-into-consumer-ops"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+)
+
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-mali"
+
+  MODULES
+    "${MOBILEBERT_FP16_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    "--iree-input-type=tosa"
+    "--iree-flow-demote-f32-to-f16"
+    "--iree-vulkan-target-triple=valhall-unknown-android11"
+    "--iree-flow-enable-fuse-padding-into-consumer-ops"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+)
+
+# kernel-execution
+
+# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
+# low enough that the whole dispatch completes within an OS-specific timeout.
+# Otherwise you'll get an error like:
+# ```
+# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
+# hal.ex.submit_and_wait; while calling import;
+# ```
+
+# GPU, Vulkan, Mali, kernel-execution
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-mali"
+
+  MODULES
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "kernel-execution,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+    "--iree-flow-enable-fuse-padding-into-consumer-ops"
+    "--iree-hal-benchmark-dispatch-repeat-count=32"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+  RUNTIME_FLAGS
+    "--batch_size=32"
+)
+
+iree_benchmark_suite(
+  GROUP_NAME
+    "android-mali"
+
+  MODULES
+    "${MOBILEBERT_FP16_MODULE}"
+
+  BENCHMARK_MODES
+    "kernel-execution,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    "--iree-input-type=tosa"
+    "--iree-flow-demote-f32-to-f16"
+    "--iree-vulkan-target-triple=valhall-unknown-android11"
+    "--iree-flow-enable-fuse-padding-into-consumer-ops"
+    "--iree-hal-benchmark-dispatch-repeat-count=32"
+  BENCHMARK_TOOL
+    iree-benchmark-module
+  DRIVER
+    "vulkan"
+  RUNTIME_FLAGS
+    "--batch_size=32"
+)
+
+################################################################################
diff --git a/benchmarks/TFLite/linux-x86_64.cmake b/benchmarks/TFLite/linux-x86_64.cmake
index 27c4844..efe4de4 100644
--- a/benchmarks/TFLite/linux-x86_64.cmake
+++ b/benchmarks/TFLite/linux-x86_64.cmake
@@ -24,6 +24,9 @@
 
 # CPU, Dylib-Sync, x86_64, full-inference
 iree_benchmark_suite(
+  GROUP_NAME
+    "linux-x86_64"
+
   MODULES
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
@@ -48,6 +51,9 @@
 
 # CPU, Dylib, 1 thread, x86_64, full-inference
 iree_benchmark_suite(
+  GROUP_NAME
+    "linux-x86_64"
+
   MODULES
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
@@ -74,6 +80,9 @@
 
 # CPU, Dylib, 4 threads, x86_64, full-inference
 iree_benchmark_suite(
+  GROUP_NAME
+    "linux-x86_64"
+
   MODULES
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
@@ -100,6 +109,9 @@
 
 # CPU, Dylib, 8 threads, x86_64, full-inference
 iree_benchmark_suite(
+  GROUP_NAME
+    "linux-x86_64"
+
   MODULES
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
diff --git a/build_tools/cmake/benchmark_compilation_flagfile.in b/build_tools/cmake/benchmark_compilation_flagfile.in
deleted file mode 100644
index cf37cf9..0000000
--- a/build_tools/cmake/benchmark_compilation_flagfile.in
+++ /dev/null
@@ -1 +0,0 @@
-@IREE_BENCHMARK_COMPILATION_FLAGS@
diff --git a/build_tools/cmake/build_android_benchmark.sh b/build_tools/cmake/build_android_benchmark.sh
index 4bc1476..6481fae 100755
--- a/build_tools/cmake/build_android_benchmark.sh
+++ b/build_tools/cmake/build_android_benchmark.sh
@@ -69,7 +69,11 @@
 
 "${CMAKE_BIN}" --build . --target install -- -k 0
 # Also generate artifacts for benchmarking on Android.
-"${CMAKE_BIN}" --build . --target iree-benchmark-suites -- -k 0
+"${CMAKE_BIN}" --build . --target \
+  iree-benchmark-suites-android-arm64-v8a \
+  iree-benchmark-suites-android-adreno \
+  iree-benchmark-suites-android-mali \
+  -- -k 0
 "${CMAKE_BIN}" --build . --target iree-microbenchmark-suites -- -k 0
 # --------------------------------------------------------------------------- #
 
diff --git a/build_tools/cmake/build_linux_benchmark.sh b/build_tools/cmake/build_linux_benchmark.sh
index cd3057a..3e92137 100755
--- a/build_tools/cmake/build_linux_benchmark.sh
+++ b/build_tools/cmake/build_linux_benchmark.sh
@@ -60,7 +60,7 @@
   -DIREE_BUILD_SAMPLES=OFF
 
 "${CMAKE_BIN}" --build . --target install -- -k 0
-"${CMAKE_BIN}" --build . --target iree-benchmark-suites -- -k 0
+"${CMAKE_BIN}" --build . --target iree-benchmark-suites-linux-x86_64 -- -k 0
 "${CMAKE_BIN}" --build . --target iree-microbenchmark-suites -- -k 0
 # --------------------------------------------------------------------------- #
 
diff --git a/build_tools/cmake/iree_benchmark_suite.cmake b/build_tools/cmake/iree_benchmark_suite.cmake
index cdcb14a..f5dfd74 100644
--- a/build_tools/cmake/iree_benchmark_suite.cmake
+++ b/build_tools/cmake/iree_benchmark_suite.cmake
@@ -13,6 +13,8 @@
 # `iree-benchmark-module`.
 #
 # Parameters:
+#   GROUP_NAME: A group name this benchmark will join. Each group has its own
+#       CMake's benchmark suite target: "iree-benchmark-suites-<GROUP_NAME>".
 #   MODULES: A list for model specification. Due to CMake's lack of data
 #       structures, each module is represented as a list suitable to be parsed
 #       by cmake_parse_arguments:
@@ -68,18 +70,24 @@
     PARSE_ARGV 0
     _RULE
     ""
-    "DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
+    "GROUP_NAME;DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
     "BENCHMARK_MODES;BENCHMARK_TOOL;MODULES;TRANSLATION_FLAGS;RUNTIME_FLAGS"
   )
 
   iree_validate_required_arguments(
     _RULE
-    "DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
+    "GROUP_NAME;DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
     "BENCHMARK_MODES;BENCHMARK_TOOL;MODULES"
   )
 
   iree_package_name(PACKAGE_NAME)
 
+  # Add the benchmark suite target.
+  set(SUITE_SUB_TARGET "iree-benchmark-suites-${_RULE_GROUP_NAME}")
+  if(NOT TARGET "${SUITE_SUB_TARGET}")
+    add_custom_target("${SUITE_SUB_TARGET}")
+  endif()
+
   foreach(_MODULE IN LISTS _RULE_MODULES)
     cmake_parse_arguments(
       _MODULE
@@ -234,6 +242,7 @@
 
         # Mark dependency so that we have one target to drive them all.
         add_dependencies(iree-benchmark-suites "${_TRANSLATION_TARGET_NAME}")
+        add_dependencies("${SUITE_SUB_TARGET}" "${_TRANSLATION_TARGET_NAME}")
       endif(NOT TARGET "${_TRANSLATION_TARGET_NAME}")
 
       set(_COMPILE_STATS_TRANSLATION_TARGET_NAME
@@ -272,6 +281,9 @@
         add_dependencies(iree-benchmark-suites
           "${_COMPILE_STATS_TRANSLATION_TARGET_NAME}"
         )
+        add_dependencies("${SUITE_SUB_TARGET}"
+          "${_COMPILE_STATS_TRANSLATION_TARGET_NAME}"
+        )
       endif()
 
       if(NOT TARGET "${_FRIENDLY_TARGET_NAME}")
@@ -284,6 +296,7 @@
       endif()
 
       set(_RUN_SPEC_DIR "${_ROOT_ARTIFACTS_DIR}/${_MODULE_DIR_NAME}/${_BENCHMARK_DIR_NAME}")
+      list(JOIN _COMMON_NAME_SEGMENTS "__" _RUN_SPEC_TARGET_SUFFIX)
 
       # Create the command and target for the flagfile spec used to execute
       # the generated artifacts.
@@ -306,11 +319,8 @@
         COMMENT "Generating ${_FLAG_FILE}"
       )
 
-      set(_FLAGFILE_GEN_TARGET_NAME_LIST "iree-generate-benchmark-flagfile")
-      list(APPEND _FLAGFILE_GEN_TARGET_NAME_LIST ${_COMMON_NAME_SEGMENTS})
-      list(JOIN _FLAGFILE_GEN_TARGET_NAME_LIST "__" _FLAGFILE_GEN_TARGET_NAME)
-      set(_FLAGFILE_GEN_TARGET_NAME "${PACKAGE_NAME}_${_FLAGFILE_GEN_TARGET_NAME}")
-
+      set(_FLAGFILE_GEN_TARGET_NAME
+        "${PACKAGE_NAME}_iree-generate-benchmark-flagfile__${_RUN_SPEC_TARGET_SUFFIX}")
       add_custom_target("${_FLAGFILE_GEN_TARGET_NAME}"
         DEPENDS "${_FLAG_FILE}"
       )
@@ -325,23 +335,42 @@
         COMMENT "Generating ${_TOOL_FILE}"
       )
 
-      set(_TOOLFILE_GEN_TARGET_NAME_LIST "iree-generate-benchmark-toolfile")
-      list(APPEND _TOOLFILE_GEN_TARGET_NAME_LIST ${_COMMON_NAME_SEGMENTS})
-      list(JOIN _TOOLFILE_GEN_TARGET_NAME_LIST "__" _TOOLFILE_GEN_TARGET_NAME)
+      set(_TOOLFILE_GEN_TARGET_NAME
+        "${PACKAGE_NAME}_iree-generate-benchmark-toolfile__${_RUN_SPEC_TARGET_SUFFIX}")
       add_custom_target("${_TOOLFILE_GEN_TARGET_NAME}"
         DEPENDS "${_TOOL_FILE}"
       )
 
       # Generate a flagfile containing command-line options used to compile the
       # generated artifacts.
-      set(_COMPOPT_FILE "${_RUN_SPEC_DIR}/compilation_flagfile")
-      string(REPLACE ";" "\n" IREE_BENCHMARK_COMPILATION_FLAGS "${_TRANSLATION_ARGS}")
-      configure_file(
-        ${PROJECT_SOURCE_DIR}/build_tools/cmake/benchmark_compilation_flagfile.in
-        ${_COMPOPT_FILE})
+      set(_COMPILATION_FLAGFILE "${_RUN_SPEC_DIR}/compilation_flagfile")
+      # Generate the flagfile with python command. We can't use "file" because
+      # it can't be part of a target's dependency and generated lazily. And
+      # "cmake -E echo" doesn't work with newlines.
+      add_custom_command(
+        OUTPUT "${_COMPILATION_FLAGFILE}"
+        COMMAND
+          "${Python3_EXECUTABLE}" "${IREE_ROOT_DIR}/build_tools/scripts/generate_compilation_flagfile.py"
+            --output "${_COMPILATION_FLAGFILE}"
+            -- ${_TRANSLATION_ARGS}
+        WORKING_DIRECTORY "${_RUN_SPEC_DIR}"
+        COMMENT "Generating ${_COMPILATION_FLAGFILE}"
+      )
+
+      set(_COMPILATION_FLAGFILE_GEN_TARGET_NAME
+        "${PACKAGE_NAME}_iree-generate-benchmark-compilation-flagfile__${_RUN_SPEC_TARGET_SUFFIX}")
+      add_custom_target("${_COMPILATION_FLAGFILE_GEN_TARGET_NAME}"
+        DEPENDS "${_COMPILATION_FLAGFILE}"
+      )
 
       # Mark dependency so that we have one target to drive them all.
       add_dependencies(iree-benchmark-suites
+        "${_COMPILATION_FLAGFILE_GEN_TARGET_NAME}"
+        "${_FLAGFILE_GEN_TARGET_NAME}"
+        "${_TOOLFILE_GEN_TARGET_NAME}"
+      )
+      add_dependencies("${SUITE_SUB_TARGET}"
+        "${_COMPILATION_FLAGFILE_GEN_TARGET_NAME}"
         "${_FLAGFILE_GEN_TARGET_NAME}"
         "${_TOOLFILE_GEN_TARGET_NAME}"
       )
diff --git a/build_tools/scripts/generate_compilation_flagfile.py b/build_tools/scripts/generate_compilation_flagfile.py
new file mode 100755
index 0000000..cf0cb13
--- /dev/null
+++ b/build_tools/scripts/generate_compilation_flagfile.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Generates a compilation flagfile for iree-compiler.
+
+This tool exists because CMake cannot generate a multi-line file from a
+build rule. CMake's configure_file doesn't work in our case as it can't be
+triggered from a target.
+"""
+
+import argparse
+
+
+def parse_arguments():
+  """Parses command line arguments."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--output",
+                      type=str,
+                      required=True,
+                      help="output file to write to")
+  parser.add_argument("compilation_flags",
+                      metavar="<compilation-flags>",
+                      nargs="*",
+                      help="list of compilation flags")
+  return parser.parse_args()
+
+
+def main(args):
+  with open(args.output, "w") as f:
+    f.write("\n".join(args.compilation_flags) + "\n")
+
+
+if __name__ == "__main__":
+  main(parse_arguments())