Create platform-specific targets in benchmark suites (#9237)
diff --git a/benchmarks/TFLite/CMakeLists.txt b/benchmarks/TFLite/CMakeLists.txt
index bbaaa71..a425f88 100644
--- a/benchmarks/TFLite/CMakeLists.txt
+++ b/benchmarks/TFLite/CMakeLists.txt
@@ -136,670 +136,7 @@
################################################################################
# Add benchmarks for all platforms. #
################################################################################
+include(android-arm64-v8a.cmake)
+include(android-adreno.cmake)
+include(android-mali.cmake)
include(linux-x86_64.cmake)
-
-################################################################################
-# #
-# Default benchmark configurations #
-# #
-# Each suite benchmarks a list of modules with configurations specifying a #
-# target architecture and runtime characteristics (e.g. threads/cores). These #
-# benchmarks only configure IREE translation and runtime flags for the target #
-# architecture and do *not* include any non-default flags. No non-default #
-# flags should be added here. #
-# #
-################################################################################
-
-set(ANDROID_CPU_TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-llvm-target-triple=aarch64-none-linux-android29")
-set(ANDROID_ADRENO_GPU_TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-vulkan-target-triple=adreno-unknown-android11"
-)
-set(ANDROID_MALI_GPU_TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILEBERT_INT8_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "big-core,full-inference,default-flags"
- "little-core,full-inference,default-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib-sync"
-)
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILEBERT_INT8_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "1-thread,big-core,full-inference,default-flags"
- "1-thread,little-core,full-inference,default-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=1"
-)
-
-# TODO(#7792): Re-enable these when we are able to run different benchmarks
-# depending on use-case (presubmit, postsubmit, nightly, etc.)
-# iree_benchmark_suite(
-# MODULES
-# "${DEEPLABV3_FP32_MODULE}"
-# "${MOBILESSD_FP32_MODULE}"
-# "${POSENET_FP32_MODULE}"
-# "${MOBILEBERT_FP32_MODULE}"
-# "${MOBILENET_V2_MODULE}"
-# "${MOBILENET_V3SMALL_MODULE}"
-
-# BENCHMARK_MODES
-# "2-thread,big-core,full-inference,default-flags"
-# "2-thread,little-core,full-inference,default-flags"
-# TARGET_BACKEND
-# "dylib-llvm-aot"
-# TARGET_ARCHITECTURE
-# "CPU-ARM64-v8A"
-# TRANSLATION_FLAGS
-# ${ANDROID_CPU_TRANSLATION_FLAGS}
-# BENCHMARK_TOOL
-# iree-benchmark-module
-# DRIVER
-# "dylib"
-# RUNTIME_FLAGS
-# "--task_topology_group_count=2"
-# )
-
-# iree_benchmark_suite(
-# MODULES
-# "${DEEPLABV3_FP32_MODULE}"
-# "${MOBILESSD_FP32_MODULE}"
-# "${POSENET_FP32_MODULE}"
-# "${MOBILEBERT_FP32_MODULE}"
-# "${MOBILENET_V2_MODULE}"
-# "${MOBILENET_V3SMALL_MODULE}"
-
-# BENCHMARK_MODES
-# "3-thread,big-core,full-inference,default-flags"
-# "3-thread,little-core,full-inference,default-flags"
-# TARGET_BACKEND
-# "dylib-llvm-aot"
-# TARGET_ARCHITECTURE
-# "CPU-ARM64-v8A"
-# TRANSLATION_FLAGS
-# ${ANDROID_CPU_TRANSLATION_FLAGS}
-# BENCHMARK_TOOL
-# iree-benchmark-module
-# DRIVER
-# "dylib"
-# RUNTIME_FLAGS
-# "--task_topology_group_count=3"
-# )
-
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILEBERT_INT8_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "4-thread,big-core,full-inference,default-flags"
- "4-thread,little-core,full-inference,default-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=4"
-)
-
-# GPU, Vulkan, Adreno, full-inference
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,default-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Adreno"
- TRANSLATION_FLAGS
- ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,default-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP16_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,default-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
- # This isn't a special optimization flag. It's so we can reuse the same f32
- # model file. See comments on MOBILEBERT_FP16_MODULE
- "--iree-flow-demote-f32-to-f16"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
-)
-
-################################################################################
-
-################################################################################
-# #
-# Specialized benchmark configurations #
-# #
-# Each suite benchmarks one or more module with configurations that can vary #
-# on model or architecture characteristics. These are intended for providing #
-# continuous benchmarks of experimental features that cannot be turned on by #
-# default yet. It is primarily intended for whoever is actively investigating #
-# optimizations for a feature exemplified in a specific model or architecture. #
-# Due to our current benchmark setup, there can only be one experimental #
-# configuration per model and other benchmark mode. #
-# #
-################################################################################
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
-# At the moment we use that for fp32 models. We would change that when new
-# devices support relevant fp32 SIMD extensions beyond that (e.g. +f32mm).
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "big-core,full-inference,experimental-flags"
- "little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- "--iree-flow-mmt4d-target-options=arch=aarch64"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib-sync"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
-# NOTE: +dotprod is only relevant to int8, not fp32.
-# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
-# kernel is currently naive, not ready for benchmarking.
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_INT8_MODULE}"
-
- BENCHMARK_MODES
- "big-core,full-inference,experimental-flags"
- "little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
- "--iree-llvm-target-cpu-features=+dotprod"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib-sync"
-)
-
-# TODO(#7792): Consider re-enabling little-core experimental-flags if we start
-# optimizing for little cores or we can just run them occasionally
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
-# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
-# At the moment we use that for fp32 models. We would change that when new
-# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "1-thread,big-core,full-inference,experimental-flags"
- # "1-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- "--iree-flow-mmt4d-target-options=arch=aarch64"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=1"
-)
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference, +dotprod
-# NOTE: +dotprod is only relevant to int8, not fp32.
-# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
-# kernel is currently naive, not ready for benchmarking.
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_INT8_MODULE}"
-
- BENCHMARK_MODES
- "1-thread,big-core,full-inference,experimental-flags"
- # "1-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
- "--iree-llvm-target-cpu-features=+dotprod"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=1"
-)
-
-# TODO(#7792): Re-enable these when we are able to run different benchmarks
-# depending on use-case (presubmit, postsubmit, nightly, etc.)
-# iree_benchmark_suite(
-# MODULES
-# "${DEEPLABV3_FP32_MODULE}"
-# "${MOBILESSD_FP32_MODULE}"
-# "${POSENET_FP32_MODULE}"
-# "${MOBILEBERT_FP32_MODULE}"
-# "${MOBILENET_V2_MODULE}"
-# "${MOBILENET_V3SMALL_MODULE}"
-
-# BENCHMARK_MODES
-# "2-thread,big-core,full-inference,experimental-flags"
-# "2-thread,little-core,full-inference,experimental-flags"
-# TARGET_BACKEND
-# "dylib-llvm-aot"
-# TARGET_ARCHITECTURE
-# "CPU-ARM64-v8A"
-# TRANSLATION_FLAGS
-# ${ANDROID_CPU_TRANSLATION_FLAGS}
-# "--iree-flow-mmt4d-target-options=arch=aarch64"
-# BENCHMARK_TOOL
-# iree-benchmark-module
-# DRIVER
-# "dylib"
-# RUNTIME_FLAGS
-# "--task_topology_group_count=2"
-# )
-
-# iree_benchmark_suite(
-# MODULES
-# "${DEEPLABV3_FP32_MODULE}"
-# "${MOBILESSD_FP32_MODULE}"
-# "${POSENET_FP32_MODULE}"
-# "${MOBILEBERT_FP32_MODULE}"
-# "${MOBILENET_V2_MODULE}"
-# "${MOBILENET_V3SMALL_MODULE}"
-
-# BENCHMARK_MODES
-# "3-thread,big-core,full-inference,experimental-flags"
-# "3-thread,little-core,full-inference,experimental-flags"
-# TARGET_BACKEND
-# "dylib-llvm-aot"
-# TARGET_ARCHITECTURE
-# "CPU-ARM64-v8A"
-# TRANSLATION_FLAGS
-# ${ANDROID_CPU_TRANSLATION_FLAGS}
-# "--iree-flow-mmt4d-target-options=arch=aarch64"
-# BENCHMARK_TOOL
-# iree-benchmark-module
-# DRIVER
-# "dylib"
-# RUNTIME_FLAGS
-# "--task_topology_group_count=3"
-# )
-
-# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
-# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
-# At the moment we use that for fp32 models. We would change that when new
-# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "4-thread,big-core,full-inference,experimental-flags"
- # "4-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- "--iree-flow-mmt4d-target-options=arch=aarch64"
-
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=4"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
-# NOTE: +dotprod is only relevant to int8, not fp32.
-# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
-# kernel is currently naive, not ready for benchmarking.
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_INT8_MODULE}"
-
- BENCHMARK_MODES
- "4-thread,big-core,full-inference,experimental-flags"
- # "4-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- ${ANDROID_CPU_TRANSLATION_FLAGS}
- "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
- "--iree-llvm-target-cpu-features=+dotprod"
-
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=4"
-)
-
-# CPU, VMVX, 4-thread, big-core, full-inference
-# VMVX is slow and we're not optimizing perf yet. Leaving in a single max-thread
-# benchmark because it's useful to keep an eye on and helps disambiguate where a
-# performance change may be coming from (e.g. if it's in vmvx as well, it's
-# probably not a codegen issue).
-iree_benchmark_suite(
- MODULES
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "4-thread,big-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "vmvx"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vmvx"
- RUNTIME_FLAGS
- "--task_topology_group_count=4"
-)
-
-
-# GPU, Vulkan, Adreno, full-inference
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Adreno"
- TRANSLATION_FLAGS
- ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
- "--iree-flow-enable-fuse-padding-into-consumer-ops"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
- "--iree-flow-enable-fuse-padding-into-consumer-ops"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
-)
-
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP16_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-flow-demote-f32-to-f16"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
- "--iree-flow-enable-fuse-padding-into-consumer-ops"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
-)
-
-# kernel-execution
-
-# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
-# low enough that the whole dispatch completes within an OS-specific timeout.
-# Otherwise you'll get error like:
-# ```
-# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
-# hal.ex.submit_and_wait; while calling import;
-# ```
-# With current kernel performance and timeouts on Adreno Pixel 4, this means we
-# have no kernel benchmark for the DeepLabV3 and MobileBert models
-# TODO: Add kernel-execution config for DEEPLABV3_FP32_MODULE and
-# MOBILEBERT_FP32_MODULE when they can run with at least 8 repetitions.
-
-# GPU, Vulkan, Adreno, kernel-execution
-iree_benchmark_suite(
- MODULES
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "kernel-execution,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Adreno"
- TRANSLATION_FLAGS
- ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
- "--iree-flow-enable-fuse-padding-into-consumer-ops"
- "--iree-hal-benchmark-dispatch-repeat-count=16"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
- RUNTIME_FLAGS
- "--batch_size=16"
-)
-
-# GPU, Vulkan, Mali, kernel-execution
-iree_benchmark_suite(
- MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "kernel-execution,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
- "--iree-flow-enable-fuse-padding-into-consumer-ops"
- "--iree-hal-benchmark-dispatch-repeat-count=32"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
- RUNTIME_FLAGS
- "--batch_size=32"
-)
-
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP16_MODULE}"
-
- BENCHMARK_MODES
- "kernel-execution,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-flow-demote-f32-to-f16"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
- "--iree-flow-enable-fuse-padding-into-consumer-ops"
- "--iree-hal-benchmark-dispatch-repeat-count=32"
- BENCHMARK_TOOL
- iree-benchmark-module
- DRIVER
- "vulkan"
- RUNTIME_FLAGS
- "--batch_size=32"
-)
diff --git a/benchmarks/TFLite/android-adreno.cmake b/benchmarks/TFLite/android-adreno.cmake
new file mode 100644
index 0000000..1a0e0d0
--- /dev/null
+++ b/benchmarks/TFLite/android-adreno.cmake
@@ -0,0 +1,138 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+################################################################################
+# #
+# Default benchmark configurations #
+# #
+# Each suite benchmarks a list of modules with configurations specifying a #
+# target architecture and runtime characteristics (e.g. threads/cores). These #
+# benchmarks only configure IREE translation and runtime flags for the target #
+# architecture and do *not* include any non-default flags. No non-default #
+# flags should be added here. #
+# #
+################################################################################
+
+set(ANDROID_ADRENO_GPU_TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-vulkan-target-triple=adreno-unknown-android11"
+)
+
+# GPU, Vulkan, Adreno, full-inference
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-adreno"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,default-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Adreno"
+ TRANSLATION_FLAGS
+ ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+)
+
+################################################################################
+
+################################################################################
+# #
+# Specialized benchmark configurations #
+# #
+# Each suite benchmarks one or more module with configurations that can vary #
+# on model or architecture characteristics. These are intended for providing #
+# continuous benchmarks of experimental features that cannot be turned on by #
+# default yet. It is primarily intended for whoever is actively investigating #
+# optimizations for a feature exemplified in a specific model or architecture. #
+# Due to our current benchmark setup, there can only be one experimental #
+# configuration per model and other benchmark mode. #
+# #
+################################################################################
+
+# GPU, Vulkan, Adreno, full-inference (experimental flags)
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-adreno"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Adreno"
+ TRANSLATION_FLAGS
+ ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+ "--iree-flow-enable-fuse-padding-into-consumer-ops"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+)
+
+# kernel-execution
+
+# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
+# low enough that the whole dispatch completes within an OS-specific timeout.
+# Otherwise you'll get an error like:
+# ```
+# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
+# hal.ex.submit_and_wait; while calling import;
+# ```
+# With current kernel performance and timeouts on Adreno Pixel 4, this means we
+# have no kernel benchmark for the DeepLabV3 and MobileBert models
+# TODO: Add kernel-execution config for DEEPLABV3_FP32_MODULE and
+# MOBILEBERT_FP32_MODULE when they can run with at least 8 repetitions.
+
+# GPU, Vulkan, Adreno, kernel-execution
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-adreno"
+
+ MODULES
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "kernel-execution,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Adreno"
+ TRANSLATION_FLAGS
+ ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+ "--iree-flow-enable-fuse-padding-into-consumer-ops"
+ "--iree-hal-benchmark-dispatch-repeat-count=16"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+ RUNTIME_FLAGS
+ "--batch_size=16"
+)
+
+################################################################################
diff --git a/benchmarks/TFLite/android-arm64-v8a.cmake b/benchmarks/TFLite/android-arm64-v8a.cmake
new file mode 100644
index 0000000..cd49abb
--- /dev/null
+++ b/benchmarks/TFLite/android-arm64-v8a.cmake
@@ -0,0 +1,473 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+################################################################################
+# #
+# Default benchmark configurations #
+# #
+# Each suite benchmarks a list of modules with configurations specifying a #
+# target architecture and runtime characteristics (e.g. threads/cores). These #
+# benchmarks only configure IREE translation and runtime flags for the target #
+# architecture and do *not* include any non-default flags. No non-default #
+# flags should be added here. #
+# #
+################################################################################
+
+set(ANDROID_CPU_TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-llvm-target-triple=aarch64-none-linux-android29")
+
+# CPU, Dylib-Sync, big/little-core, full-inference
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILEBERT_INT8_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "big-core,full-inference,default-flags"
+ "little-core,full-inference,default-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib-sync"
+)
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILEBERT_INT8_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "1-thread,big-core,full-inference,default-flags"
+ "1-thread,little-core,full-inference,default-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=1"
+)
+
+# TODO(#7792): Re-enable these when we are able to run different benchmarks
+# depending on use-case (presubmit, postsubmit, nightly, etc.)
+# iree_benchmark_suite(
+# GROUP_NAME
+# "android-arm64-v8a"
+#
+# MODULES
+# "${DEEPLABV3_FP32_MODULE}"
+# "${MOBILESSD_FP32_MODULE}"
+# "${POSENET_FP32_MODULE}"
+# "${MOBILEBERT_FP32_MODULE}"
+# "${MOBILENET_V2_MODULE}"
+# "${MOBILENET_V3SMALL_MODULE}"
+
+# BENCHMARK_MODES
+# "2-thread,big-core,full-inference,default-flags"
+# "2-thread,little-core,full-inference,default-flags"
+# TARGET_BACKEND
+# "dylib-llvm-aot"
+# TARGET_ARCHITECTURE
+# "CPU-ARM64-v8A"
+# TRANSLATION_FLAGS
+# ${ANDROID_CPU_TRANSLATION_FLAGS}
+# BENCHMARK_TOOL
+# iree-benchmark-module
+# DRIVER
+# "dylib"
+# RUNTIME_FLAGS
+# "--task_topology_group_count=2"
+# )
+
+# iree_benchmark_suite(
+# GROUP_NAME
+# "android-arm64-v8a"
+#
+# MODULES
+# "${DEEPLABV3_FP32_MODULE}"
+# "${MOBILESSD_FP32_MODULE}"
+# "${POSENET_FP32_MODULE}"
+# "${MOBILEBERT_FP32_MODULE}"
+# "${MOBILENET_V2_MODULE}"
+# "${MOBILENET_V3SMALL_MODULE}"
+
+# BENCHMARK_MODES
+# "3-thread,big-core,full-inference,default-flags"
+# "3-thread,little-core,full-inference,default-flags"
+# TARGET_BACKEND
+# "dylib-llvm-aot"
+# TARGET_ARCHITECTURE
+# "CPU-ARM64-v8A"
+# TRANSLATION_FLAGS
+# ${ANDROID_CPU_TRANSLATION_FLAGS}
+# BENCHMARK_TOOL
+# iree-benchmark-module
+# DRIVER
+# "dylib"
+# RUNTIME_FLAGS
+# "--task_topology_group_count=3"
+# )
+
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILEBERT_INT8_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "4-thread,big-core,full-inference,default-flags"
+ "4-thread,little-core,full-inference,default-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=4"
+)
+
+################################################################################
+
+################################################################################
+# #
+# Specialized benchmark configurations #
+# #
+# Each suite benchmarks one or more module with configurations that can vary #
+# on model or architecture characteristics. These are intended for providing #
+# continuous benchmarks of experimental features that cannot be turned on by #
+# default yet. It is primarily intended for whoever is actively investigating #
+# optimizations for a feature exemplified in a specific model or architecture. #
+# Due to our current benchmark setup, there can only be one experimental #
+# configuration per model and other benchmark mode. #
+# #
+################################################################################
+
+# CPU, Dylib-Sync, big/little-core, full-inference
+# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
+# At the moment we use that for fp32 models. We would change that when new
+# devices support relevant fp32 SIMD extensions beyond that (e.g. +f32mm).
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "big-core,full-inference,experimental-flags"
+ "little-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ "--iree-flow-mmt4d-target-options=arch=aarch64"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib-sync"
+)
+
+# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
+# NOTE: +dotprod is only relevant to int8, not fp32.
+# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
+# kernel is currently naive, not ready for benchmarking.
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${MOBILEBERT_INT8_MODULE}"
+
+ BENCHMARK_MODES
+ "big-core,full-inference,experimental-flags"
+ "little-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
+ "--iree-llvm-target-cpu-features=+dotprod"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib-sync"
+)
+
+# TODO(#7792): Consider re-enabling little-core experimental-flags if we start
+# optimizing for little cores or we can just run them occasionally
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
+# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
+# At the moment we use that for fp32 models. We would change that when new
+# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "1-thread,big-core,full-inference,experimental-flags"
+ # "1-thread,little-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ "--iree-flow-mmt4d-target-options=arch=aarch64"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=1"
+)
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference, +dotprod
+# NOTE: +dotprod is only relevant to int8, not fp32.
+# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
+# kernel is currently naive, not ready for benchmarking.
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${MOBILEBERT_INT8_MODULE}"
+
+ BENCHMARK_MODES
+ "1-thread,big-core,full-inference,experimental-flags"
+ # "1-thread,little-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
+ "--iree-llvm-target-cpu-features=+dotprod"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=1"
+)
+
+# TODO(#7792): Re-enable these when we are able to run different benchmarks
+# depending on use-case (presubmit, postsubmit, nightly, etc.)
+# iree_benchmark_suite(
+# GROUP_NAME
+# "android-arm64-v8a"
+#
+# MODULES
+# "${DEEPLABV3_FP32_MODULE}"
+# "${MOBILESSD_FP32_MODULE}"
+# "${POSENET_FP32_MODULE}"
+# "${MOBILEBERT_FP32_MODULE}"
+# "${MOBILENET_V2_MODULE}"
+# "${MOBILENET_V3SMALL_MODULE}"
+
+# BENCHMARK_MODES
+# "2-thread,big-core,full-inference,experimental-flags"
+# "2-thread,little-core,full-inference,experimental-flags"
+# TARGET_BACKEND
+# "dylib-llvm-aot"
+# TARGET_ARCHITECTURE
+# "CPU-ARM64-v8A"
+# TRANSLATION_FLAGS
+# ${ANDROID_CPU_TRANSLATION_FLAGS}
+# "--iree-flow-mmt4d-target-options=arch=aarch64"
+# BENCHMARK_TOOL
+# iree-benchmark-module
+# DRIVER
+# "dylib"
+# RUNTIME_FLAGS
+# "--task_topology_group_count=2"
+# )
+
+# iree_benchmark_suite(
+# GROUP_NAME
+# "android-arm64-v8a"
+#
+# MODULES
+# "${DEEPLABV3_FP32_MODULE}"
+# "${MOBILESSD_FP32_MODULE}"
+# "${POSENET_FP32_MODULE}"
+# "${MOBILEBERT_FP32_MODULE}"
+# "${MOBILENET_V2_MODULE}"
+# "${MOBILENET_V3SMALL_MODULE}"
+
+# BENCHMARK_MODES
+# "3-thread,big-core,full-inference,experimental-flags"
+# "3-thread,little-core,full-inference,experimental-flags"
+# TARGET_BACKEND
+# "dylib-llvm-aot"
+# TARGET_ARCHITECTURE
+# "CPU-ARM64-v8A"
+# TRANSLATION_FLAGS
+# ${ANDROID_CPU_TRANSLATION_FLAGS}
+# "--iree-flow-mmt4d-target-options=arch=aarch64"
+# BENCHMARK_TOOL
+# iree-benchmark-module
+# DRIVER
+# "dylib"
+# RUNTIME_FLAGS
+# "--task_topology_group_count=3"
+# )
+
+# CPU, Dylib, 1 through 4 threads, big/little-core, full-inference.
+# NOTE: this is not enabling any SIMD extension beyond baseline Aarch64.
+# At the moment we use that for fp32 models. We would change that when new
+# devices support relevant fp32 SIMD extensions beyond that (e.g. f32mm).
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "4-thread,big-core,full-inference,experimental-flags"
+ # "4-thread,little-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ "--iree-flow-mmt4d-target-options=arch=aarch64"
+
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=4"
+)
+
+# CPU, Dylib-Sync, big/little-core, full-inference, +dotprod
+# NOTE: +dotprod is only relevant to int8, not fp32.
+# TODO: add a +i8mm variant, supported by new devices already. No rush: our i8mm
+# kernel is currently naive, not ready for benchmarking.
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${MOBILEBERT_INT8_MODULE}"
+
+ BENCHMARK_MODES
+ "4-thread,big-core,full-inference,experimental-flags"
+ # "4-thread,little-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "dylib-llvm-aot"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ ${ANDROID_CPU_TRANSLATION_FLAGS}
+ "--iree-flow-mmt4d-target-options=arch=aarch64 features=+dotprod"
+ "--iree-llvm-target-cpu-features=+dotprod"
+
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "dylib"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=4"
+)
+
+# CPU, VMVX, 4-thread, big-core, full-inference
+# VMVX is slow and we're not optimizing perf yet. Leaving in a single max-thread
+# benchmark because it's useful to keep an eye on and helps disambiguate where a
+# performance change may be coming from (e.g. if it's in vmvx as well, it's
+# probably not a codegen issue).
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-arm64-v8a"
+
+ MODULES
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "4-thread,big-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "vmvx"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vmvx"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=4"
+)
+
+################################################################################
diff --git a/benchmarks/TFLite/android-mali.cmake b/benchmarks/TFLite/android-mali.cmake
new file mode 100644
index 0000000..f9cdb2f
--- /dev/null
+++ b/benchmarks/TFLite/android-mali.cmake
@@ -0,0 +1,212 @@
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+################################################################################
+# #
+# Default benchmark configurations #
+# #
+# Each suite benchmarks a list of modules with configurations specifying a #
+# target architecture and runtime characteristics (e.g. threads/cores). These #
+# benchmarks only configure IREE translation and runtime flags for the target #
+# architecture and do *not* include any non-default flags. No non-default #
+# flags should be added here. #
+# #
+################################################################################
+
+set(ANDROID_MALI_GPU_TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-vulkan-target-triple=valhall-unknown-android11"
+)
+
+# GPU, Vulkan, Mali, full-inference
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-mali"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,default-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+)
+
+# GPU, Vulkan, Mali, full-inference (FP16 via f32-to-f16 demotion)
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-mali"
+
+ MODULES
+ "${MOBILEBERT_FP16_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,default-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ # This isn't a special optimization flag. It's so we can reuse the same f32
+ # model file. See comments on MOBILEBERT_FP16_MODULE
+ "--iree-flow-demote-f32-to-f16"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+)
+
+################################################################################
+
+################################################################################
+# #
+# Specialized benchmark configurations #
+# #
+# Each suite benchmarks one or more module with configurations that can vary #
+# on model or architecture characteristics. These are intended for providing #
+# continuous benchmarks of experimental features that cannot be turned on by #
+# default yet. It is primarily intended for whoever is actively investigating #
+# optimizations for a feature exemplified in a specific model or architecture. #
+# Due to our current benchmark setup, there can only be one experimental #
+# configuration per model and other benchmark mode. #
+# #
+################################################################################
+
+# GPU, Vulkan, Mali, full-inference
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-mali"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ "--iree-flow-enable-fuse-padding-into-consumer-ops"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+)
+
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-mali"
+
+ MODULES
+ "${MOBILEBERT_FP16_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-flow-demote-f32-to-f16"
+ "--iree-vulkan-target-triple=valhall-unknown-android11"
+ "--iree-flow-enable-fuse-padding-into-consumer-ops"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+)
+
+# kernel-execution
+
+# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
+# low enough that the whole dispatch completes within an OS-specific timeout.
+# Otherwise you'll get error like:
+# ```
+# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
+# hal.ex.submit_and_wait; while calling import;
+# ```
+
+# GPU, Vulkan, Mali, kernel-execution
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-mali"
+
+ MODULES
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "kernel-execution,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ "--iree-flow-enable-fuse-padding-into-consumer-ops"
+ "--iree-hal-benchmark-dispatch-repeat-count=32"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+ RUNTIME_FLAGS
+ "--batch_size=32"
+)
+
+iree_benchmark_suite(
+ GROUP_NAME
+ "android-mali"
+
+ MODULES
+ "${MOBILEBERT_FP16_MODULE}"
+
+ BENCHMARK_MODES
+ "kernel-execution,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-flow-demote-f32-to-f16"
+ "--iree-vulkan-target-triple=valhall-unknown-android11"
+ "--iree-flow-enable-fuse-padding-into-consumer-ops"
+ "--iree-hal-benchmark-dispatch-repeat-count=32"
+ BENCHMARK_TOOL
+ iree-benchmark-module
+ DRIVER
+ "vulkan"
+ RUNTIME_FLAGS
+ "--batch_size=32"
+)
+
+################################################################################
diff --git a/benchmarks/TFLite/linux-x86_64.cmake b/benchmarks/TFLite/linux-x86_64.cmake
index 27c4844..efe4de4 100644
--- a/benchmarks/TFLite/linux-x86_64.cmake
+++ b/benchmarks/TFLite/linux-x86_64.cmake
@@ -24,6 +24,9 @@
# CPU, Dylib-Sync, x86_64, full-inference
iree_benchmark_suite(
+ GROUP_NAME
+ "linux-x86_64"
+
MODULES
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
@@ -48,6 +51,9 @@
# CPU, Dylib, 1 thread, x86_64, full-inference
iree_benchmark_suite(
+ GROUP_NAME
+ "linux-x86_64"
+
MODULES
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
@@ -74,6 +80,9 @@
# CPU, Dylib, 4 threads, x86_64, full-inference
iree_benchmark_suite(
+ GROUP_NAME
+ "linux-x86_64"
+
MODULES
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
@@ -100,6 +109,9 @@
# CPU, Dylib, 8 threads, x86_64, full-inference
iree_benchmark_suite(
+ GROUP_NAME
+ "linux-x86_64"
+
MODULES
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
diff --git a/build_tools/cmake/benchmark_compilation_flagfile.in b/build_tools/cmake/benchmark_compilation_flagfile.in
deleted file mode 100644
index cf37cf9..0000000
--- a/build_tools/cmake/benchmark_compilation_flagfile.in
+++ /dev/null
@@ -1 +0,0 @@
-@IREE_BENCHMARK_COMPILATION_FLAGS@
diff --git a/build_tools/cmake/build_android_benchmark.sh b/build_tools/cmake/build_android_benchmark.sh
index 4bc1476..6481fae 100755
--- a/build_tools/cmake/build_android_benchmark.sh
+++ b/build_tools/cmake/build_android_benchmark.sh
@@ -69,7 +69,11 @@
"${CMAKE_BIN}" --build . --target install -- -k 0
# Also generate artifacts for benchmarking on Android.
-"${CMAKE_BIN}" --build . --target iree-benchmark-suites -- -k 0
+"${CMAKE_BIN}" --build . --target \
+ iree-benchmark-suites-android-arm64-v8a \
+ iree-benchmark-suites-android-adreno \
+ iree-benchmark-suites-android-mali \
+ -- -k 0
"${CMAKE_BIN}" --build . --target iree-microbenchmark-suites -- -k 0
# --------------------------------------------------------------------------- #
diff --git a/build_tools/cmake/build_linux_benchmark.sh b/build_tools/cmake/build_linux_benchmark.sh
index cd3057a..3e92137 100755
--- a/build_tools/cmake/build_linux_benchmark.sh
+++ b/build_tools/cmake/build_linux_benchmark.sh
@@ -60,7 +60,7 @@
-DIREE_BUILD_SAMPLES=OFF
"${CMAKE_BIN}" --build . --target install -- -k 0
-"${CMAKE_BIN}" --build . --target iree-benchmark-suites -- -k 0
+"${CMAKE_BIN}" --build . --target iree-benchmark-suites-linux-x86_64 -- -k 0
"${CMAKE_BIN}" --build . --target iree-microbenchmark-suites -- -k 0
# --------------------------------------------------------------------------- #
diff --git a/build_tools/cmake/iree_benchmark_suite.cmake b/build_tools/cmake/iree_benchmark_suite.cmake
index cdcb14a..f5dfd74 100644
--- a/build_tools/cmake/iree_benchmark_suite.cmake
+++ b/build_tools/cmake/iree_benchmark_suite.cmake
@@ -13,6 +13,8 @@
# `iree-benchmark-module`.
#
# Parameters:
+# GROUP_NAME: The name of the group this benchmark joins. Each group gets its
+# own CMake benchmark suite target: "iree-benchmark-suites-<GROUP_NAME>".
# MODULES: A list for model specification. Due to CMake's lack of data
# structures, each module is represented as a list suitable to be parsed
# by cmake_parse_arguments:
@@ -68,18 +70,24 @@
PARSE_ARGV 0
_RULE
""
- "DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
+ "GROUP_NAME;DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
"BENCHMARK_MODES;BENCHMARK_TOOL;MODULES;TRANSLATION_FLAGS;RUNTIME_FLAGS"
)
iree_validate_required_arguments(
_RULE
- "DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
+ "GROUP_NAME;DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
"BENCHMARK_MODES;BENCHMARK_TOOL;MODULES"
)
iree_package_name(PACKAGE_NAME)
+ # Add the benchmark suite target.
+ set(SUITE_SUB_TARGET "iree-benchmark-suites-${_RULE_GROUP_NAME}")
+ if(NOT TARGET "${SUITE_SUB_TARGET}")
+ add_custom_target("${SUITE_SUB_TARGET}")
+ endif()
+
foreach(_MODULE IN LISTS _RULE_MODULES)
cmake_parse_arguments(
_MODULE
@@ -234,6 +242,7 @@
# Mark dependency so that we have one target to drive them all.
add_dependencies(iree-benchmark-suites "${_TRANSLATION_TARGET_NAME}")
+ add_dependencies("${SUITE_SUB_TARGET}" "${_TRANSLATION_TARGET_NAME}")
endif(NOT TARGET "${_TRANSLATION_TARGET_NAME}")
set(_COMPILE_STATS_TRANSLATION_TARGET_NAME
@@ -272,6 +281,9 @@
add_dependencies(iree-benchmark-suites
"${_COMPILE_STATS_TRANSLATION_TARGET_NAME}"
)
+ add_dependencies("${SUITE_SUB_TARGET}"
+ "${_COMPILE_STATS_TRANSLATION_TARGET_NAME}"
+ )
endif()
if(NOT TARGET "${_FRIENDLY_TARGET_NAME}")
@@ -284,6 +296,7 @@
endif()
set(_RUN_SPEC_DIR "${_ROOT_ARTIFACTS_DIR}/${_MODULE_DIR_NAME}/${_BENCHMARK_DIR_NAME}")
+ list(JOIN _COMMON_NAME_SEGMENTS "__" _RUN_SPEC_TARGET_SUFFIX)
# Create the command and target for the flagfile spec used to execute
# the generated artifacts.
@@ -306,11 +319,8 @@
COMMENT "Generating ${_FLAG_FILE}"
)
- set(_FLAGFILE_GEN_TARGET_NAME_LIST "iree-generate-benchmark-flagfile")
- list(APPEND _FLAGFILE_GEN_TARGET_NAME_LIST ${_COMMON_NAME_SEGMENTS})
- list(JOIN _FLAGFILE_GEN_TARGET_NAME_LIST "__" _FLAGFILE_GEN_TARGET_NAME)
- set(_FLAGFILE_GEN_TARGET_NAME "${PACKAGE_NAME}_${_FLAGFILE_GEN_TARGET_NAME}")
-
+ set(_FLAGFILE_GEN_TARGET_NAME
+ "${PACKAGE_NAME}_iree-generate-benchmark-flagfile__${_RUN_SPEC_TARGET_SUFFIX}")
add_custom_target("${_FLAGFILE_GEN_TARGET_NAME}"
DEPENDS "${_FLAG_FILE}"
)
@@ -325,23 +335,42 @@
COMMENT "Generating ${_TOOL_FILE}"
)
- set(_TOOLFILE_GEN_TARGET_NAME_LIST "iree-generate-benchmark-toolfile")
- list(APPEND _TOOLFILE_GEN_TARGET_NAME_LIST ${_COMMON_NAME_SEGMENTS})
- list(JOIN _TOOLFILE_GEN_TARGET_NAME_LIST "__" _TOOLFILE_GEN_TARGET_NAME)
+ set(_TOOLFILE_GEN_TARGET_NAME
+ "${PACKAGE_NAME}_iree-generate-benchmark-toolfile__${_RUN_SPEC_TARGET_SUFFIX}")
add_custom_target("${_TOOLFILE_GEN_TARGET_NAME}"
DEPENDS "${_TOOL_FILE}"
)
# Generate a flagfile containing command-line options used to compile the
# generated artifacts.
- set(_COMPOPT_FILE "${_RUN_SPEC_DIR}/compilation_flagfile")
- string(REPLACE ";" "\n" IREE_BENCHMARK_COMPILATION_FLAGS "${_TRANSLATION_ARGS}")
- configure_file(
- ${PROJECT_SOURCE_DIR}/build_tools/cmake/benchmark_compilation_flagfile.in
- ${_COMPOPT_FILE})
+ set(_COMPILATION_FLAGFILE "${_RUN_SPEC_DIR}/compilation_flagfile")
+  # Generate the flagfile with a python command. We can't use file() because
+  # its output can't be a target dependency that is generated lazily, and
+  # "cmake -E echo" doesn't handle newlines.
+ add_custom_command(
+ OUTPUT "${_COMPILATION_FLAGFILE}"
+ COMMAND
+ "${Python3_EXECUTABLE}" "${IREE_ROOT_DIR}/build_tools/scripts/generate_compilation_flagfile.py"
+ --output "${_COMPILATION_FLAGFILE}"
+ -- ${_TRANSLATION_ARGS}
+ WORKING_DIRECTORY "${_RUN_SPEC_DIR}"
+ COMMENT "Generating ${_COMPILATION_FLAGFILE}"
+ )
+
+ set(_COMPILATION_FLAGFILE_GEN_TARGET_NAME
+ "${PACKAGE_NAME}_iree-generate-benchmark-compilation-flagfile__${_RUN_SPEC_TARGET_SUFFIX}")
+ add_custom_target("${_COMPILATION_FLAGFILE_GEN_TARGET_NAME}"
+ DEPENDS "${_COMPILATION_FLAGFILE}"
+ )
# Mark dependency so that we have one target to drive them all.
add_dependencies(iree-benchmark-suites
+ "${_COMPILATION_FLAGFILE_GEN_TARGET_NAME}"
+ "${_FLAGFILE_GEN_TARGET_NAME}"
+ "${_TOOLFILE_GEN_TARGET_NAME}"
+ )
+ add_dependencies("${SUITE_SUB_TARGET}"
+ "${_COMPILATION_FLAGFILE_GEN_TARGET_NAME}"
"${_FLAGFILE_GEN_TARGET_NAME}"
"${_TOOLFILE_GEN_TARGET_NAME}"
)
diff --git a/build_tools/scripts/generate_compilation_flagfile.py b/build_tools/scripts/generate_compilation_flagfile.py
new file mode 100755
index 0000000..cf0cb13
--- /dev/null
+++ b/build_tools/scripts/generate_compilation_flagfile.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Generates a compilation flagfile for iree-compiler.
+
+This tool exists because CMake cannot easily generate files with multiple
+lines: configure_file doesn't work in our case since it can't be triggered
+from a target.
+"""
+
+import argparse
+
+
+def parse_arguments():
+ """Parses command line arguments."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output",
+ type=str,
+ required=True,
+ help="output file to write to")
+ parser.add_argument("compilation_flags",
+ metavar="<compilation-flags>",
+ nargs="*",
+ help="list of compilation flags")
+ return parser.parse_args()
+
+
+def main(args):
+ with open(args.output, "w") as f:
+ f.write("\n".join(args.compilation_flags) + "\n")
+
+
+if __name__ == "__main__":
+ main(parse_arguments())