Add TFLite benchmarks for formerly TF models (#7645)

These benchmarks align very well with the existing TF source benchmarks
but don't involve going through our less-well-supported TF source
integration or storing unstable MLIR artifacts.

I did a comparison of benchmarks before and after on my Pixel 4 dev
phone, manually rewriting the model source in the former so that a
direct comparison was possible with existing tooling. No benchmarks
have significant changes:
https://gist.github.com/GMNGeoffrey/bce029bf4697f9b3deda3bb217b0c6b3
diff --git a/benchmarks/TFLite/CMakeLists.txt b/benchmarks/TFLite/CMakeLists.txt
index 3af45c6..dca4b35 100644
--- a/benchmarks/TFLite/CMakeLists.txt
+++ b/benchmarks/TFLite/CMakeLists.txt
@@ -22,7 +22,7 @@
   TAGS
     "fp32"
   SOURCE
-    # No significant compression of tflite files by gzip
+    # Mirror of https://tfhub.dev/tensorflow/lite-model/deeplabv3/1/default/1
     "https://storage.googleapis.com/iree-model-artifacts/deeplabv3.tflite"
   ENTRY_FUNCTION
     "main"
@@ -36,7 +36,7 @@
   TAGS
     "fp32"
   SOURCE
-    # No significant compression of tflite files by gzip
+    # Mirror of https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/mobile_ssd_v2_float_coco.tflite
     "https://storage.googleapis.com/iree-model-artifacts/mobile_ssd_v2_float_coco.tflite"
   ENTRY_FUNCTION
     "main"
@@ -50,7 +50,7 @@
   TAGS
     "fp32"
   SOURCE
-    # No significant compression of tflite files by gzip
+    # Mirror of https://tfhub.dev/tensorflow/lite-model/posenet/mobilenet/float/075/1/default/1
     "https://storage.googleapis.com/iree-model-artifacts/posenet.tflite"
   ENTRY_FUNCTION
     "main"
@@ -58,6 +58,68 @@
     "1x353x257x3xf32"
 )
 
+set(MOBILEBERT_FP32_MODULE
+  NAME
+    "MobileBertSquad"
+  TAGS
+    "fp32"
+  SOURCE
+    # Mirror of https://tfhub.dev/tensorflow/lite-model/mobilebert/1/default/1
+    "https://storage.googleapis.com/iree-model-artifacts/mobilebertsquad.tflite"
+  ENTRY_FUNCTION
+    "main"
+  FUNCTION_INPUTS
+    "1x384xi32,1x384xi32,1x384xi32"
+)
+
+set(MOBILEBERT_FP16_MODULE
+  NAME
+    "MobileBertSquad"
+  TAGS
+    "fp16"
+  # This uses the same input TFLite source as fp32 to save download time.
+  # It requires users to have "--iree-flow-demote-f32-to-f16".
+  SOURCE
+    # Mirror of https://tfhub.dev/tensorflow/lite-model/mobilebert/1/default/1
+    "https://storage.googleapis.com/iree-model-artifacts/mobilebertsquad.tflite"
+  ENTRY_FUNCTION
+    "main"
+  # The conversion done by "--iree-flow-demote-f32-to-f16" won't change the
+  # original input signature.
+  FUNCTION_INPUTS
+    "1x384xi32,1x384xi32,1x384xi32"
+)
+
+set(MOBILENET_V2_MODULE
+  NAME
+    "MobileNetV2"
+  TAGS
+    "fp32,imagenet"
+  SOURCE
+    # Mirror of https://github.com/tensorflow/tflite-support/blob/master/tensorflow_lite_support/metadata/python/tests/testdata/image_classifier/mobilenet_v2_1.0_224.tflite
+    "https://storage.googleapis.com/iree-model-artifacts/mobilenet_v2_1.0_224.tflite"
+  ENTRY_FUNCTION
+    "main"
+  FUNCTION_INPUTS
+    "1x224x224x3xf32"
+)
+
+set(MOBILENET_V3SMALL_MODULE
+  NAME
+    "MobileNetV3Small"
+  TAGS
+    "fp32,imagenet"
+  SOURCE
+    # https://tfhub.dev/google/imagenet/mobilenet_v3_small_100_224/classification/5
+    # Manually exported to tflite with static batch dimension
+    "https://storage.googleapis.com/iree-model-artifacts/MobileNetV3SmallStaticBatch.tflite"
+  ENTRY_FUNCTION
+    "main"
+  FUNCTION_INPUTS
+    "1x224x224x3xf32"
+)
+
+
 ################################################################################
 #                                                                              #
 # Default benchmark configurations                                             #
@@ -71,8 +133,16 @@
 ################################################################################
 
 set(ANDROID_CPU_TRANSLATION_FLAGS
-      "--iree-input-type=tosa"
-      "--iree-llvm-target-triple=aarch64-none-linux-android29")
+  "--iree-input-type=tosa"
+  "--iree-llvm-target-triple=aarch64-none-linux-android29")
+set(ANDROID_ADRENO_GPU_TRANSLATION_FLAGS
+  "--iree-input-type=tosa"
+  "--iree-vulkan-target-triple=adreno-unknown-android11"
+)
+set(ANDROID_MALI_GPU_TRANSLATION_FLAGS
+  "--iree-input-type=tosa"
+  "--iree-vulkan-target-triple=valhall-unknown-android11"
+)
 
 # CPU, Dylib-Sync, big/little-core, full-inference
 iree_benchmark_suite(
@@ -80,6 +150,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "big-core,full-inference,default-flags"
@@ -100,6 +173,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "1-thread,big-core,full-inference,default-flags"
@@ -122,6 +198,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "2-thread,little-core,full-inference,default-flags"
@@ -142,6 +221,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "3-thread,little-core,full-inference,default-flags"
@@ -162,6 +244,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "4-thread,little-core,full-inference,default-flags"
@@ -183,6 +268,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "full-inference,default-flags"
@@ -191,8 +279,7 @@
   TARGET_ARCHITECTURE
     "GPU-Adreno"
   TRANSLATION_FLAGS
-    "--iree-input-type=tosa"
-    "--iree-vulkan-target-triple=adreno-unknown-android11"
+    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
   DRIVER
     "vulkan"
 )
@@ -203,6 +290,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "full-inference,default-flags"
@@ -211,8 +301,27 @@
   TARGET_ARCHITECTURE
     "GPU-Mali-Valhall"
   TRANSLATION_FLAGS
-    "--iree-input-type=tosa"
-    "--iree-vulkan-target-triple=valhall-unknown-android11"
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+  DRIVER
+    "vulkan"
+)
+
+# GPU, Vulkan, Mali, full-inference
+iree_benchmark_suite(
+  MODULES
+    "${MOBILEBERT_FP16_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,default-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+    # This isn't a special optimization flag. It's so we can reuse the same f32
+    # model file. See comments on MOBILEBERT_FP16_MODULE
+    "--iree-flow-demote-f32-to-f16"
   DRIVER
     "vulkan"
 )
@@ -239,6 +348,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "big-core,full-inference,experimental-flags"
@@ -261,6 +373,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "1-thread,big-core,full-inference,experimental-flags"
@@ -285,6 +400,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "2-thread,little-core,full-inference,experimental-flags"
@@ -304,9 +422,12 @@
 
 iree_benchmark_suite(
   MODULES
-    "${DEEPLABV3_FP32_MODULE}"
-    "${MOBILESSD_FP32_MODULE}"
-    "${POSENET_FP32_MODULE}"
+    "${DEEPLABV3_FP32_MODULE}"
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "3-thread,little-core,full-inference,experimental-flags"
@@ -329,6 +450,9 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "4-thread,little-core,full-inference,experimental-flags"
@@ -346,12 +470,42 @@
     "--task_topology_group_count=4"
 )
 
+
+# CPU, VMVX, 4-thread, little-core, full-inference
+# VMVX is slow and we're not optimizing perf yet. Leaving in a single max-thread
+# benchmark because it's useful to keep an eye on and helps disambiguate where a
+# performance change may be coming from (e.g. if it's in vmvx as well, it's
+# probably not a codegen issue).
+iree_benchmark_suite(
+  MODULES
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "4-thread,little-core,full-inference,experimental-flags"
+  TARGET_BACKEND
+    "vmvx"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    "--iree-input-type=tosa"
+    "--iree-flow-inline-constants-max-byte-length=2048"
+  DRIVER
+    "vmvx"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=4"
+)
+
+
 # GPU, Vulkan, Adreno, full-inference
 iree_benchmark_suite(
   MODULES
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
 
   BENCHMARK_MODES
     "full-inference,experimental-flags"
@@ -360,8 +514,7 @@
   TARGET_ARCHITECTURE
     "GPU-Adreno"
   TRANSLATION_FLAGS
-    "--iree-input-type=tosa"
-    "--iree-vulkan-target-triple=adreno-unknown-android11"
+    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
     "--iree-flow-inline-constants-max-byte-length=2048"
     "--iree-enable-fusion-with-reduction-ops"
   DRIVER
@@ -374,6 +527,27 @@
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "full-inference,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+    "--iree-flow-inline-constants-max-byte-length=16"
+    "--iree-enable-fusion-with-reduction-ops"
+  DRIVER
+    "vulkan"
+)
+
+iree_benchmark_suite(
+  MODULES
+    "${MOBILEBERT_FP16_MODULE}"
 
   BENCHMARK_MODES
     "full-inference,experimental-flags"
@@ -383,6 +557,7 @@
     "GPU-Mali-Valhall"
   TRANSLATION_FLAGS
     "--iree-input-type=tosa"
+    "--iree-flow-demote-f32-to-f16"
     "--iree-vulkan-target-triple=valhall-unknown-android11"
     "--iree-flow-inline-constants-max-byte-length=16"
     "--iree-enable-fusion-with-reduction-ops"
@@ -390,12 +565,75 @@
     "vulkan"
 )
 
+# kernel-execution
+
+# Note that for kernel-execution benchmarks batch_size/repeat-count need to be
+# low enough that the whole dispatch completes within an OS-specific timeout.
+# Otherwise you'll get error like:
+# ```
+# INTERNAL; VK_ERROR_DEVICE_LOST; vkQueueSubmit; while invoking native function
+# hal.ex.submit_and_wait; while calling import;
+# ```
+# With current kernel performance and timeouts on Adreno Pixel 4, this means we
+# have no kernel benchmark for the DeepLabV3 and MobileBert models
+# TODO: Add kernel-execution config for DEEPLABV3_FP32_MODULE and
+# MOBILEBERT_FP32_MODULE when they can run with at least 8 repetitions.
+
+# GPU, Vulkan, Adreno, kernel-execution
+iree_benchmark_suite(
+  MODULES
+    "${MOBILESSD_FP32_MODULE}"
+    "${POSENET_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "kernel-execution,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Adreno"
+  TRANSLATION_FLAGS
+    ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+    "--iree-flow-inline-constants-max-byte-length=2048"
+    "--iree-enable-fusion-with-reduction-ops"
+    "--iree-hal-benchmark-dispatch-repeat-count=16"
+  DRIVER
+    "vulkan"
+  RUNTIME_FLAGS
+    "--batch_size=16"
+)
+
 # GPU, Vulkan, Mali, kernel-execution
 iree_benchmark_suite(
   MODULES
     "${DEEPLABV3_FP32_MODULE}"
     "${MOBILESSD_FP32_MODULE}"
     "${POSENET_FP32_MODULE}"
+    "${MOBILEBERT_FP32_MODULE}"
+    "${MOBILENET_V2_MODULE}"
+    "${MOBILENET_V3SMALL_MODULE}"
+
+  BENCHMARK_MODES
+    "kernel-execution,experimental-flags"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+    "--iree-flow-inline-constants-max-byte-length=16"
+    "--iree-enable-fusion-with-reduction-ops"
+    "--iree-hal-benchmark-dispatch-repeat-count=32"
+  DRIVER
+    "vulkan"
+  RUNTIME_FLAGS
+    "--batch_size=32"
+)
+
+iree_benchmark_suite(
+  MODULES
+    "${MOBILEBERT_FP16_MODULE}"
 
   BENCHMARK_MODES
     "kernel-execution,experimental-flags"
@@ -405,6 +643,7 @@
     "GPU-Mali-Valhall"
   TRANSLATION_FLAGS
     "--iree-input-type=tosa"
+    "--iree-flow-demote-f32-to-f16"
     "--iree-vulkan-target-triple=valhall-unknown-android11"
     "--iree-flow-inline-constants-max-byte-length=16"
     "--iree-enable-fusion-with-reduction-ops"
diff --git a/benchmarks/TensorFlow/CMakeLists.txt b/benchmarks/TensorFlow/CMakeLists.txt
deleted file mode 100644
index b5ffbb5..0000000
--- a/benchmarks/TensorFlow/CMakeLists.txt
+++ /dev/null
@@ -1,387 +0,0 @@
-# Copyright 2021 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-
-################################################################################
-#                                                                              #
-# Benchmark models from TensorFlow                                             #
-#                                                                              #
-# Each module specification should be a list containing alternating keys and   #
-# values. The fields are: NAME, TAGS, SOURCE, ENTRY_FUNCTION, and              #
-# FUNCTION_INPUTS. See the iree_benchmark_suite definition for details         #
-# about these fields. Note that these must be quoted when used as arguments.   #
-#                                                                              #
-################################################################################
-
-set(MOBILEBERT_FP16_MODULE
-  NAME
-    "MobileBertSquad"
-  TAGS
-    "fp16"
-  # This uses the same input MLIR source as fp32 to save download time.
-  # It requires users to have "--iree-flow-demote-f32-to-f16".
-  SOURCE
-    "https://storage.googleapis.com/iree-model-artifacts/MobileBertSquad-89edfa50d.mlir.gz"
-  ENTRY_FUNCTION
-    "serving_default"
-  # The conversion done by "--iree-flow-demote-f32-to-f16" won't change the
-  # original input signature.
-  FUNCTION_INPUTS
-    "1x384xi32,1x384xi32,1x384xi32"
-)
-
-set(MOBILEBERT_FP32_MODULE
-  NAME
-    "MobileBertSquad"
-  TAGS
-    "fp32"
-  SOURCE
-    "https://storage.googleapis.com/iree-model-artifacts/MobileBertSquad-89edfa50d.mlir.gz"
-  ENTRY_FUNCTION
-    "serving_default"
-  FUNCTION_INPUTS
-    "1x384xi32,1x384xi32,1x384xi32"
-)
-
-set(MOBILENET_V2_MODULE
-  NAME
-    "MobileNetV2"
-  TAGS
-    "fp32,imagenet"
-  SOURCE
-    "https://storage.googleapis.com/iree-model-artifacts/MobileNetV2-89edfa50d.mlir.gz"
-  ENTRY_FUNCTION
-    "call"
-  FUNCTION_INPUTS
-    "1x224x224x3xf32"
-)
-
-set(MOBILENET_V3SMALL_MODULE
-  NAME
-    "MobileNetV3Small"
-  TAGS
-    "fp32,imagenet"
-  SOURCE
-    "https://storage.googleapis.com/iree-model-artifacts/MobileNetV3Small-89edfa50d.mlir.gz"
-  ENTRY_FUNCTION
-    "call"
-  FUNCTION_INPUTS
-    "1x224x224x3xf32"
-)
-
-################################################################################
-#                                                                              #
-# Common benchmark configurations                                              #
-#                                                                              #
-# Each suite benchmarks a list of modules with some specific configuration,    #
-# typically involving different translation/runtime flags and targeting        #
-# different IREE drivers and hardware architectures.                           #
-#                                                                              #
-################################################################################
-
-# CPU, VMVX, 3-thread, little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "3-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vmvx"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-  DRIVER
-    "vmvx"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=3"
-)
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "big-core,full-inference,experimental-flags"
-    "little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-llvm-target-triple=aarch64-none-linux-android29"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-    "--iree-llvm-loop-unrolling=true"
-  DRIVER
-    "dylib-sync"
-)
-
-# CPU, Dylib, 1-thread, big/little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "1-thread,big-core,full-inference,experimental-flags"
-    "1-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-llvm-target-triple=aarch64-none-linux-android29"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-    "--iree-llvm-loop-unrolling=true"
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=1"
-)
-
-# CPU, Dylib, 3-thread, big/little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "3-thread,big-core,full-inference,experimental-flags"
-    "3-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-llvm-target-triple=aarch64-none-linux-android29"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-    "--iree-llvm-loop-unrolling=true"
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=3"
-)
-
-# GPU, Vulkan, Adreno, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Adreno"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-vulkan-target-triple=adreno-unknown-android11"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-    "--iree-enable-fusion-with-reduction-ops"
-  DRIVER
-    "vulkan"
-)
-
-# GPU, Vulkan, Adreno, kernel-execution
-iree_benchmark_suite(
-  MODULES
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "kernel-execution,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Adreno"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-vulkan-target-triple=adreno-unknown-android11"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-    "--iree-enable-fusion-with-reduction-ops"
-    "--iree-hal-benchmark-dispatch-repeat-count=16"
-  DRIVER
-    "vulkan"
-  RUNTIME_FLAGS
-    "--batch_size=16"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP32_MODULE}"
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-vulkan-target-triple=valhall-unknown-android11"
-    "--iree-flow-inline-constants-max-byte-length=16"
-    "--iree-enable-fusion-with-reduction-ops"
-  DRIVER
-    "vulkan"
-)
-
-# GPU, Vulkan, Mali, kernel-execution
-iree_benchmark_suite(
-  MODULES
-    "${MOBILENET_V2_MODULE}"
-    "${MOBILENET_V3SMALL_MODULE}"
-
-  BENCHMARK_MODES
-    "kernel-execution,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-vulkan-target-triple=valhall-unknown-android11"
-    "--iree-flow-inline-constants-max-byte-length=16"
-    "--iree-enable-fusion-with-reduction-ops"
-    "--iree-hal-benchmark-dispatch-repeat-count=32"
-  DRIVER
-    "vulkan"
-  RUNTIME_FLAGS
-    "--batch_size=32"
-)
-
-# GPU, Vulkan, Mali, kernel-execution
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP16_MODULE}"
-
-  BENCHMARK_MODES
-    "kernel-execution,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-flow-demote-f32-to-f16"
-    "--iree-vulkan-target-triple=valhall-unknown-android11"
-    "--iree-flow-inline-constants-max-byte-length=16"
-    "--iree-enable-fusion-with-reduction-ops"
-    "--iree-hal-benchmark-dispatch-repeat-count=32"
-  DRIVER
-    "vulkan"
-  RUNTIME_FLAGS
-    "--batch_size=32"
-)
-
-# GPU, Vulkan, Mali, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP16_MODULE}"
-
-  BENCHMARK_MODES
-    "full-inference,experimental-flags"
-  TARGET_BACKEND
-    "vulkan-spirv"
-  TARGET_ARCHITECTURE
-    "GPU-Mali-Valhall"
-  TRANSLATION_FLAGS
-    "--iree-input-type=mhlo"
-    "--iree-flow-demote-f32-to-f16"
-    "--iree-vulkan-target-triple=valhall-unknown-android11"
-    "--iree-flow-inline-constants-max-byte-length=16"
-    "--iree-enable-fusion-with-reduction-ops"
-  DRIVER
-    "vulkan"
-)
-
-################################################################################
-#                                                                              #
-# Speical benchmark configurations                                             #
-#                                                                              #
-# These are configurations that can only be enabled for some specific model.   #
-# However, THIS SHOULD REALLY BE TEMPORARY; we should strike for uniformity.   #
-#                                                                              #
-################################################################################
-
-# CPU, Dylib-Sync, big/little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP32_MODULE}"
-
-  BENCHMARK_MODES
-    "big-core,full-inference,experimental-flags"
-    "little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    # TODO: Merge this rule once we can use the same flags as the common one.
-    "--iree-input-type=mhlo"
-    "--iree-llvm-target-triple=aarch64-none-linux-android29"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-  DRIVER
-    "dylib-sync"
-)
-
-# CPU, Dylib, 1-thread, big/little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP32_MODULE}"
-
-  BENCHMARK_MODES
-    "1-thread,big-core,full-inference,experimental-flags"
-    "1-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    # TODO: Merge this rule once we can use the same flags as the common one.
-    "--iree-input-type=mhlo"
-    "--iree-llvm-target-triple=aarch64-none-linux-android29"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=1"
-)
-
-# CPU, Dylib, 3-thread, big/little-core, full-inference
-iree_benchmark_suite(
-  MODULES
-    "${MOBILEBERT_FP32_MODULE}"
-
-  BENCHMARK_MODES
-    "3-thread,big-core,full-inference,experimental-flags"
-    "3-thread,little-core,full-inference,experimental-flags"
-  TARGET_BACKEND
-    "dylib-llvm-aot"
-  TARGET_ARCHITECTURE
-    "CPU-ARM64-v8A"
-  TRANSLATION_FLAGS
-    # TODO: Merge this rule once we can use the same flags as the common one.
-    "--iree-input-type=mhlo"
-    "--iree-llvm-target-triple=aarch64-none-linux-android29"
-    "--iree-flow-inline-constants-max-byte-length=2048"
-  DRIVER
-    "dylib"
-  RUNTIME_FLAGS
-    "--task_topology_group_count=3"
-)
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
index a6e8e35..1d89b92 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
@@ -41,7 +41,7 @@
     artifact_paths:
       - "benchmark-results-pixel-4-${BUILDKITE_BUILD_NUMBER}.json"
       - "trace-captures-pixel-4-${BUILDKITE_BUILD_NUMBER}.tgz"
-    timeout_in_minutes: "40"
+    timeout_in_minutes: "60"
 
   - label: "Benchmark on Galaxy S20 (exynos-990, mali-g77)"
     commands:
@@ -61,7 +61,7 @@
     artifact_paths:
       - "benchmark-results-galaxy-s20-${BUILDKITE_BUILD_NUMBER}.json"
       - "trace-captures-galaxy-s20-${BUILDKITE_BUILD_NUMBER}.tgz"
-    timeout_in_minutes: "40"
+    timeout_in_minutes: "60"
 
   - wait