Add TFLite benchmarks for formerly TF models (#7645)
These benchmarks align very well with the existing TF source benchmarks
but don't involve going through our less-well-supported TF source
integration or storing unstable MLIR artifacts.
I did a comparison of benchmarks before and after on my Pixel 4 dev
phone, manually rewriting the model source in the former so that a
direct comparison was possible with existing tooling. No benchmarks
have significant changes:
https://gist.github.com/GMNGeoffrey/bce029bf4697f9b3deda3bb217b0c6b3
diff --git a/benchmarks/TFLite/CMakeLists.txt b/benchmarks/TFLite/CMakeLists.txt
index 3af45c6..dca4b35 100644
--- a/benchmarks/TFLite/CMakeLists.txt
+++ b/benchmarks/TFLite/CMakeLists.txt
@@ -22,7 +22,7 @@
TAGS
"fp32"
SOURCE
- # No significant compression of tflite files by gzip
+ # Mirror of https://tfhub.dev/tensorflow/lite-model/deeplabv3/1/default/1
"https://storage.googleapis.com/iree-model-artifacts/deeplabv3.tflite"
ENTRY_FUNCTION
"main"
@@ -36,7 +36,7 @@
TAGS
"fp32"
SOURCE
- # No significant compression of tflite files by gzip
+ # Mirror of https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/mobile_ssd_v2_float_coco.tflite
"https://storage.googleapis.com/iree-model-artifacts/mobile_ssd_v2_float_coco.tflite"
ENTRY_FUNCTION
"main"
@@ -50,7 +50,7 @@
TAGS
"fp32"
SOURCE
- # No significant compression of tflite files by gzip
+ # Mirror of https://tfhub.dev/tensorflow/lite-model/posenet/mobilenet/float/075/1/default/1
"https://storage.googleapis.com/iree-model-artifacts/posenet.tflite"
ENTRY_FUNCTION
"main"
@@ -58,6 +58,68 @@
"1x353x257x3xf32"
)
+set(MOBILEBERT_FP32_MODULE
+ NAME
+ "MobileBertSquad"
+ TAGS
+ "fp32"
+ SOURCE
+ # Mirror of https://tfhub.dev/tensorflow/lite-model/mobilebert/1/default/1
+ "https://storage.googleapis.com/iree-model-artifacts/mobilebertsquad.tflite"
+ ENTRY_FUNCTION
+ "main"
+ FUNCTION_INPUTS
+ "1x384xi32,1x384xi32,1x384xi32"
+)
+
+set(MOBILEBERT_FP16_MODULE
+ NAME
+ "MobileBertSquad"
+ TAGS
+ "fp16"
+ # This uses the same input MLIR source as fp32 to save download time.
+ # It requires users to have "--iree-flow-demote-f32-to-f16".
+ SOURCE
+ # Mirror of https://tfhub.dev/tensorflow/lite-model/mobilebert/1/default/1
+ "https://storage.googleapis.com/iree-model-artifacts/mobilebertsquad.tflite"
+ ENTRY_FUNCTION
+ "main"
+ # The conversion done by "--iree-flow-demote-f32-to-f16" won't change the
+ # original input signature.
+ FUNCTION_INPUTS
+ "1x384xi32,1x384xi32,1x384xi32"
+)
+
+set(MOBILENET_V2_MODULE
+ NAME
+ "MobileNetV2"
+ TAGS
+ "fp32,imagenet"
+ SOURCE
+  # Mirror of https://github.com/tensorflow/tflite-support/blob/master/tensorflow_lite_support/metadata/python/tests/testdata/image_classifier/mobilenet_v2_1.0_224.tflite
+ "https://storage.googleapis.com/iree-model-artifacts/mobilenet_v2_1.0_224.tflite"
+ ENTRY_FUNCTION
+ "main"
+ FUNCTION_INPUTS
+ "1x224x224x3xf32"
+)
+
+set(MOBILENET_V3SMALL_MODULE
+ NAME
+ "MobileNetV3Small"
+ TAGS
+ "fp32,imagenet"
+ SOURCE
+ # https://tfhub.dev/google/imagenet/mobilenet_v3_small_100_224/classification/5
+ # Manually exported to tflite with static batch dimension
+ "https://storage.googleapis.com/iree-model-artifacts/MobileNetV3SmallStaticBatch.tflite"
+ ENTRY_FUNCTION
+ "main"
+ FUNCTION_INPUTS
+ "1x224x224x3xf32"
+)
+
+
################################################################################
# #
# Default benchmark configurations #
@@ -71,8 +133,16 @@
################################################################################
set(ANDROID_CPU_TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-llvm-target-triple=aarch64-none-linux-android29")
+ "--iree-input-type=tosa"
+ "--iree-llvm-target-triple=aarch64-none-linux-android29")
+set(ANDROID_ADRENO_GPU_TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-vulkan-target-triple=adreno-unknown-android11"
+)
+set(ANDROID_MALI_GPU_TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-vulkan-target-triple=valhall-unknown-android11"
+)
# CPU, Dylib-Sync, big/little-core, full-inference
iree_benchmark_suite(
@@ -80,6 +150,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"big-core,full-inference,default-flags"
@@ -100,6 +173,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"1-thread,big-core,full-inference,default-flags"
@@ -122,6 +198,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"2-thread,little-core,full-inference,default-flags"
@@ -142,6 +221,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"3-thread,little-core,full-inference,default-flags"
@@ -162,6 +244,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"4-thread,little-core,full-inference,default-flags"
@@ -183,6 +268,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"full-inference,default-flags"
@@ -191,8 +279,7 @@
TARGET_ARCHITECTURE
"GPU-Adreno"
TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-vulkan-target-triple=adreno-unknown-android11"
+ ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
DRIVER
"vulkan"
)
@@ -203,6 +290,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"full-inference,default-flags"
@@ -211,8 +301,27 @@
TARGET_ARCHITECTURE
"GPU-Mali-Valhall"
TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ DRIVER
+ "vulkan"
+)
+
+# GPU, Vulkan, Mali, full-inference
+iree_benchmark_suite(
+ MODULES
+ "${MOBILEBERT_FP16_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,default-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ # This isn't a special optimization flag. It's so we can reuse the same f32
+  # model file. See comments on MOBILEBERT_FP16_MODULE.
+ "--iree-flow-demote-f32-to-f16"
DRIVER
"vulkan"
)
@@ -239,6 +348,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"big-core,full-inference,experimental-flags"
@@ -261,6 +373,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"1-thread,big-core,full-inference,experimental-flags"
@@ -285,6 +400,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"2-thread,little-core,full-inference,experimental-flags"
@@ -304,9 +422,12 @@
iree_benchmark_suite(
MODULES
- "${DEEPLABV3_FP32_MODULE}"
- "${MOBILESSD_FP32_MODULE}"
- "${POSENET_FP32_MODULE}"
+ "${DEEPLABV3_FP32_MODULE}"
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"3-thread,little-core,full-inference,experimental-flags"
@@ -329,6 +450,9 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"4-thread,little-core,full-inference,experimental-flags"
@@ -346,12 +470,42 @@
"--task_topology_group_count=4"
)
+
+# CPU, VMVX, 4-thread, little-core, full-inference
+# VMVX is slow and we're not optimizing perf yet. Leaving in a single max-thread
+# benchmark because it's useful to keep an eye on and helps disambiguate where a
+# performance change may be coming from (e.g. if it's in vmvx as well, it's
+# probably not a codegen issue).
+iree_benchmark_suite(
+ MODULES
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "4-thread,little-core,full-inference,experimental-flags"
+ TARGET_BACKEND
+ "vmvx"
+ TARGET_ARCHITECTURE
+ "CPU-ARM64-v8A"
+ TRANSLATION_FLAGS
+ "--iree-input-type=tosa"
+ "--iree-flow-inline-constants-max-byte-length=2048"
+ DRIVER
+ "vmvx"
+ RUNTIME_FLAGS
+ "--task_topology_group_count=4"
+)
+
+
# GPU, Vulkan, Adreno, full-inference
iree_benchmark_suite(
MODULES
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
BENCHMARK_MODES
"full-inference,experimental-flags"
@@ -360,8 +514,7 @@
TARGET_ARCHITECTURE
"GPU-Adreno"
TRANSLATION_FLAGS
- "--iree-input-type=tosa"
- "--iree-vulkan-target-triple=adreno-unknown-android11"
+ ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
"--iree-flow-inline-constants-max-byte-length=2048"
"--iree-enable-fusion-with-reduction-ops"
DRIVER
@@ -374,6 +527,27 @@
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "full-inference,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ "--iree-flow-inline-constants-max-byte-length=16"
+ "--iree-enable-fusion-with-reduction-ops"
+ DRIVER
+ "vulkan"
+)
+
+iree_benchmark_suite(
+ MODULES
+ "${MOBILEBERT_FP16_MODULE}"
BENCHMARK_MODES
"full-inference,experimental-flags"
@@ -383,6 +557,7 @@
"GPU-Mali-Valhall"
TRANSLATION_FLAGS
"--iree-input-type=tosa"
+ "--iree-flow-demote-f32-to-f16"
"--iree-vulkan-target-triple=valhall-unknown-android11"
"--iree-flow-inline-constants-max-byte-length=16"
"--iree-enable-fusion-with-reduction-ops"
@@ -390,12 +565,75 @@
"vulkan"
)
+# kernel-execution
+
+# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
+# low enough that the whole dispatch completes within an OS-specific timeout.
+# Otherwise you'll get error like:
+# ```
+# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
+# hal.ex.submit_and_wait; while calling import;
+# ```
+# With current kernel performance and timeouts on Adreno Pixel 4, this means we
+# have no kernel benchmark for the DeepLabV3 and MobileBert models.
+# TODO: Add kernel-execution config for DEEPLABV3_FP32_MODULE and
+# MOBILEBERT_FP32_MODULE when they can run with at least 8 repetitions.
+
+# GPU, Vulkan, Adreno, kernel-execution
+iree_benchmark_suite(
+ MODULES
+ "${MOBILESSD_FP32_MODULE}"
+ "${POSENET_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "kernel-execution,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Adreno"
+ TRANSLATION_FLAGS
+ ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+ "--iree-flow-inline-constants-max-byte-length=2048"
+ "--iree-enable-fusion-with-reduction-ops"
+ "--iree-hal-benchmark-dispatch-repeat-count=16"
+ DRIVER
+ "vulkan"
+ RUNTIME_FLAGS
+ "--batch_size=16"
+)
+
# GPU, Vulkan, Mali, kernel-execution
iree_benchmark_suite(
MODULES
"${DEEPLABV3_FP32_MODULE}"
"${MOBILESSD_FP32_MODULE}"
"${POSENET_FP32_MODULE}"
+ "${MOBILEBERT_FP32_MODULE}"
+ "${MOBILENET_V2_MODULE}"
+ "${MOBILENET_V3SMALL_MODULE}"
+
+ BENCHMARK_MODES
+ "kernel-execution,experimental-flags"
+ TARGET_BACKEND
+ "vulkan-spirv"
+ TARGET_ARCHITECTURE
+ "GPU-Mali-Valhall"
+ TRANSLATION_FLAGS
+ ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+ "--iree-flow-inline-constants-max-byte-length=16"
+ "--iree-enable-fusion-with-reduction-ops"
+ "--iree-hal-benchmark-dispatch-repeat-count=32"
+ DRIVER
+ "vulkan"
+ RUNTIME_FLAGS
+ "--batch_size=32"
+)
+
+iree_benchmark_suite(
+ MODULES
+ "${MOBILEBERT_FP16_MODULE}"
BENCHMARK_MODES
"kernel-execution,experimental-flags"
@@ -405,6 +643,7 @@
"GPU-Mali-Valhall"
TRANSLATION_FLAGS
"--iree-input-type=tosa"
+ "--iree-flow-demote-f32-to-f16"
"--iree-vulkan-target-triple=valhall-unknown-android11"
"--iree-flow-inline-constants-max-byte-length=16"
"--iree-enable-fusion-with-reduction-ops"
diff --git a/benchmarks/TensorFlow/CMakeLists.txt b/benchmarks/TensorFlow/CMakeLists.txt
deleted file mode 100644
index b5ffbb5..0000000
--- a/benchmarks/TensorFlow/CMakeLists.txt
+++ /dev/null
@@ -1,387 +0,0 @@
-# Copyright 2021 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-
-################################################################################
-# #
-# Benchmark models from TensorFlow #
-# #
-# Each module specification should be a list containing alternating keys and #
-# values. The fields are: NAME, TAGS, SOURCE, ENTRY_FUNCTION, and #
-# FUNCTION_INPUTS. See the iree_benchmark_suite definition for details #
-# about these fields. Note that these must be quoted when used as arguments. #
-# #
-################################################################################
-
-set(MOBILEBERT_FP16_MODULE
- NAME
- "MobileBertSquad"
- TAGS
- "fp16"
- # This uses the same input MLIR source as fp32 to save download time.
- # It requires users to have "--iree-flow-demote-f32-to-f16".
- SOURCE
- "https://storage.googleapis.com/iree-model-artifacts/MobileBertSquad-89edfa50d.mlir.gz"
- ENTRY_FUNCTION
- "serving_default"
- # The conversion done by "--iree-flow-demote-f32-to-f16" won't change the
- # original input signature.
- FUNCTION_INPUTS
- "1x384xi32,1x384xi32,1x384xi32"
-)
-
-set(MOBILEBERT_FP32_MODULE
- NAME
- "MobileBertSquad"
- TAGS
- "fp32"
- SOURCE
- "https://storage.googleapis.com/iree-model-artifacts/MobileBertSquad-89edfa50d.mlir.gz"
- ENTRY_FUNCTION
- "serving_default"
- FUNCTION_INPUTS
- "1x384xi32,1x384xi32,1x384xi32"
-)
-
-set(MOBILENET_V2_MODULE
- NAME
- "MobileNetV2"
- TAGS
- "fp32,imagenet"
- SOURCE
- "https://storage.googleapis.com/iree-model-artifacts/MobileNetV2-89edfa50d.mlir.gz"
- ENTRY_FUNCTION
- "call"
- FUNCTION_INPUTS
- "1x224x224x3xf32"
-)
-
-set(MOBILENET_V3SMALL_MODULE
- NAME
- "MobileNetV3Small"
- TAGS
- "fp32,imagenet"
- SOURCE
- "https://storage.googleapis.com/iree-model-artifacts/MobileNetV3Small-89edfa50d.mlir.gz"
- ENTRY_FUNCTION
- "call"
- FUNCTION_INPUTS
- "1x224x224x3xf32"
-)
-
-################################################################################
-# #
-# Common benchmark configurations #
-# #
-# Each suite benchmarks a list of modules with some specific configuration, #
-# typically involving different translation/runtime flags and targeting #
-# different IREE drivers and hardware architectures. #
-# #
-################################################################################
-
-# CPU, VMVX, 3-thread, little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "3-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "vmvx"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-flow-inline-constants-max-byte-length=2048"
- DRIVER
- "vmvx"
- RUNTIME_FLAGS
- "--task_topology_group_count=3"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "big-core,full-inference,experimental-flags"
- "little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-llvm-target-triple=aarch64-none-linux-android29"
- "--iree-flow-inline-constants-max-byte-length=2048"
- "--iree-llvm-loop-unrolling=true"
- DRIVER
- "dylib-sync"
-)
-
-# CPU, Dylib, 1-thread, big/little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "1-thread,big-core,full-inference,experimental-flags"
- "1-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-llvm-target-triple=aarch64-none-linux-android29"
- "--iree-flow-inline-constants-max-byte-length=2048"
- "--iree-llvm-loop-unrolling=true"
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=1"
-)
-
-# CPU, Dylib, 3-thread, big/little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "3-thread,big-core,full-inference,experimental-flags"
- "3-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-llvm-target-triple=aarch64-none-linux-android29"
- "--iree-flow-inline-constants-max-byte-length=2048"
- "--iree-llvm-loop-unrolling=true"
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=3"
-)
-
-# GPU, Vulkan, Adreno, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Adreno"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-vulkan-target-triple=adreno-unknown-android11"
- "--iree-flow-inline-constants-max-byte-length=2048"
- "--iree-enable-fusion-with-reduction-ops"
- DRIVER
- "vulkan"
-)
-
-# GPU, Vulkan, Adreno, kernel-execution
-iree_benchmark_suite(
- MODULES
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "kernel-execution,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Adreno"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-vulkan-target-triple=adreno-unknown-android11"
- "--iree-flow-inline-constants-max-byte-length=2048"
- "--iree-enable-fusion-with-reduction-ops"
- "--iree-hal-benchmark-dispatch-repeat-count=16"
- DRIVER
- "vulkan"
- RUNTIME_FLAGS
- "--batch_size=16"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP32_MODULE}"
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
- "--iree-flow-inline-constants-max-byte-length=16"
- "--iree-enable-fusion-with-reduction-ops"
- DRIVER
- "vulkan"
-)
-
-# GPU, Vulkan, Mali, kernel-execution
-iree_benchmark_suite(
- MODULES
- "${MOBILENET_V2_MODULE}"
- "${MOBILENET_V3SMALL_MODULE}"
-
- BENCHMARK_MODES
- "kernel-execution,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
- "--iree-flow-inline-constants-max-byte-length=16"
- "--iree-enable-fusion-with-reduction-ops"
- "--iree-hal-benchmark-dispatch-repeat-count=32"
- DRIVER
- "vulkan"
- RUNTIME_FLAGS
- "--batch_size=32"
-)
-
-# GPU, Vulkan, Mali, kernel-execution
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP16_MODULE}"
-
- BENCHMARK_MODES
- "kernel-execution,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-flow-demote-f32-to-f16"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
- "--iree-flow-inline-constants-max-byte-length=16"
- "--iree-enable-fusion-with-reduction-ops"
- "--iree-hal-benchmark-dispatch-repeat-count=32"
- DRIVER
- "vulkan"
- RUNTIME_FLAGS
- "--batch_size=32"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP16_MODULE}"
-
- BENCHMARK_MODES
- "full-inference,experimental-flags"
- TARGET_BACKEND
- "vulkan-spirv"
- TARGET_ARCHITECTURE
- "GPU-Mali-Valhall"
- TRANSLATION_FLAGS
- "--iree-input-type=mhlo"
- "--iree-flow-demote-f32-to-f16"
- "--iree-vulkan-target-triple=valhall-unknown-android11"
- "--iree-flow-inline-constants-max-byte-length=16"
- "--iree-enable-fusion-with-reduction-ops"
- DRIVER
- "vulkan"
-)
-
-################################################################################
-# #
-# Speical benchmark configurations #
-# #
-# These are configurations that can only be enabled for some specific model. #
-# However, THIS SHOULD REALLY BE TEMPORARY; we should strike for uniformity. #
-# #
-################################################################################
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP32_MODULE}"
-
- BENCHMARK_MODES
- "big-core,full-inference,experimental-flags"
- "little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- # TODO: Merge this rule once we can use the same flags as the common one.
- "--iree-input-type=mhlo"
- "--iree-llvm-target-triple=aarch64-none-linux-android29"
- "--iree-flow-inline-constants-max-byte-length=2048"
- DRIVER
- "dylib-sync"
-)
-
-# CPU, Dylib, 1-thread, big/little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP32_MODULE}"
-
- BENCHMARK_MODES
- "1-thread,big-core,full-inference,experimental-flags"
- "1-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- # TODO: Merge this rule once we can use the same flags as the common one.
- "--iree-input-type=mhlo"
- "--iree-llvm-target-triple=aarch64-none-linux-android29"
- "--iree-flow-inline-constants-max-byte-length=2048"
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=1"
-)
-
-# CPU, Dylib, 3-thread, big/little-core, full-inference
-iree_benchmark_suite(
- MODULES
- "${MOBILEBERT_FP32_MODULE}"
-
- BENCHMARK_MODES
- "3-thread,big-core,full-inference,experimental-flags"
- "3-thread,little-core,full-inference,experimental-flags"
- TARGET_BACKEND
- "dylib-llvm-aot"
- TARGET_ARCHITECTURE
- "CPU-ARM64-v8A"
- TRANSLATION_FLAGS
- # TODO: Merge this rule once we can use the same flags as the common one.
- "--iree-input-type=mhlo"
- "--iree-llvm-target-triple=aarch64-none-linux-android29"
- "--iree-flow-inline-constants-max-byte-length=2048"
- DRIVER
- "dylib"
- RUNTIME_FLAGS
- "--task_topology_group_count=3"
-)
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
index a6e8e35..1d89b92 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
@@ -41,7 +41,7 @@
artifact_paths:
- "benchmark-results-pixel-4-${BUILDKITE_BUILD_NUMBER}.json"
- "trace-captures-pixel-4-${BUILDKITE_BUILD_NUMBER}.tgz"
- timeout_in_minutes: "40"
+ timeout_in_minutes: "60"
- label: "Benchmark on Galaxy S20 (exynos-990, mali-g77)"
commands:
@@ -61,7 +61,7 @@
artifact_paths:
- "benchmark-results-galaxy-s20-${BUILDKITE_BUILD_NUMBER}.json"
- "trace-captures-galaxy-s20-${BUILDKITE_BUILD_NUMBER}.tgz"
- timeout_in_minutes: "40"
+ timeout_in_minutes: "60"
- wait