[cmake] Support defining benchmark suites from MLIR input modules (#6024)

We have seen several benchmark artifact generation issues in
the benchmark pipeline thus far. That is because the benchmark
pipeline is entirely detached from the normal CI testing flow
and is not run pre-submit.

This commit adds a CMake rule, `iree_mlir_benchmark_suite`, to
support defining benchmark suites from MLIR input modules. With it,
we can generate benchmark artifacts by building a normal CMake
target: `cmake --build . --target iree-benchmark-suites`.
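
For example, from an already-configured CMake build directory (a
sketch; exact paths depend on the local setup):

  cmake -DIREE_BUILD_BENCHMARKS=ON .
  cmake --build . --target iree-benchmark-suites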

Concretely, this rule downloads imported model MLIR files, invokes
`iree-translate` to convert them according to the given translation
flags in the rule, and then generates the corresponding `flagfile`
for `iree-benchmark-module` to use.
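
For instance, for a MobileNetV2 module benchmarked with the dylib
driver, the generated flagfile would look roughly like this (values
taken from the suite definitions in this commit):

  --driver=dylib
  --module_file=compiled.vmfb
  --entry_function=call
  --function_input=1x224x224x3xf32

`iree-benchmark-module` then consumes it via its `--flagfile=` option.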

With this commit, we should be able to make benchmark artifact
generation a step in the normal CI testing flow. As an added
benefit, bots and developers now have a unified and simple way
to generate the full benchmark suite.

Ideally, we shouldn't be downloading with CMake; we should directly
compile from the initial source (i.e., TensorFlow Python models).
That would avoid the manual step of uploading the imported model
somewhere for every model we need to benchmark, and it would avoid
out-of-date inputs. But that is tangled with the pending Python
testing infrastructure revamp, so we'd prefer not to do it right now.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e996397..c495613 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,6 +34,7 @@
 
 option(IREE_BUILD_COMPILER "Builds the IREE compiler." ON)
 option(IREE_BUILD_TESTS "Builds IREE unit tests." ON)
+option(IREE_BUILD_BENCHMARKS "Builds IREE benchmark suites." OFF)
 option(IREE_BUILD_DOCS "Builds IREE docs." OFF)
 option(IREE_BUILD_SAMPLES "Builds IREE sample projects." ON)
 option(IREE_BUILD_EMBEDDING_SAMPLES "Builds IREE embedding sample projects. The compiler needs to be available." OFF)
@@ -222,6 +223,7 @@
 include(iree_add_all_subdirs)
 include(iree_check_test)
 include(iree_run_binary_test)
+include(iree_mlir_benchmark_suite)
 
 set(DEFAULT_CMAKE_BUILD_TYPE "Release")
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -417,6 +419,11 @@
 # IREE top-level targets
 #-------------------------------------------------------------------------------
 
+if(${IREE_BUILD_BENCHMARKS})
+  # Add a top-level custom target to drive generating benchmark suites.
+  add_custom_target(iree-benchmark-suites)
+endif()
+
 if(${IREE_BUILD_DOCS})
   # Add a top-level custom target to drive generating all documentation.
   # Register it to the default target given that IREE_BUILD_DOCS is explicitly
diff --git a/build_tools/android/common/benchmark_description.py b/build_tools/android/common/benchmark_description.py
index 2ca76a5..2db0a70 100644
--- a/build_tools/android/common/benchmark_description.py
+++ b/build_tools/android/common/benchmark_description.py
@@ -174,7 +174,8 @@
   - model_tags: a list of tags used to describe additional model information,
       e.g., ['imagenet']
   - model_source: the source of the model, e.g., 'TensorFlow'
-  - bench_mode: the mode of the benchmark, e.g., 'f32-full-inference'
+  - bench_mode: a list of tags for benchmark mode,
+      e.g., ['1-thread', 'big-core', 'full-inference']
   - runner: which runner is used for benchmarking, e.g., 'iree_vulkan', 'tflite'
   - device_info: an AndroidDeviceInfo object describing the phone where
       benchmarks run
@@ -183,7 +184,7 @@
   model_name: str
   model_tags: Sequence[str]
   model_source: str
-  bench_mode: str
+  bench_mode: Sequence[str]
   runner: str
   device_info: AndroidDeviceInfo
 
@@ -207,8 +208,20 @@
     else:
       model_part = f"{self.model_name} ({self.model_source})"
     phone_part = f"{self.device_info.model} ({target_arch})"
+    mode = ",".join(self.bench_mode)
 
-    return f"{model_part} {self.bench_mode} with {driver} @ {phone_part}"
+    return f"{model_part} {mode} with {driver} @ {phone_part}"
+
+  def deduce_taskset(self) -> str:
+    """Deduces the CPU affinity taskset mask according to benchmark modes."""
+    # TODO: we should actually check the number of cores the phone has.
+    if "big-core" in self.bench_mode:
+      return "80" if "1-thread" in self.bench_mode else "f0"
+    if "little-core" in self.bench_mode:
+      return "08" if "1-thread" in self.bench_mode else "0f"
+
+    # Not specified: fall back to a single big core (CPU 7).
+    return "80"
 
   def to_json_object(self) -> Dict[str, Any]:
     return {
diff --git a/build_tools/android/run_benchmarks.py b/build_tools/android/run_benchmarks.py
index f7669ed..d891749 100755
--- a/build_tools/android/run_benchmarks.py
+++ b/build_tools/android/run_benchmarks.py
@@ -40,7 +40,7 @@
 # Relative path against build directory.
 BENCHMARK_SUITE_REL_PATH = "benchmark_suites"
 # Relative path against root benchmark suite directory.
-MLIR_MODEL_SUITE_REL_PATH = "mlir_models"
+TENSORFLOW_MODEL_SUITE_REL_PATH = "TensorFlow"
 
 # The flagfile's filename for compiled Python models.
 MODEL_FLAGFILE_NAME = "flagfile"
@@ -121,7 +121,7 @@
     - A BenchmarkInfo object.
   """
   model_root_dir = os.path.join(root_build_dir, BENCHMARK_SUITE_REL_PATH,
-                                MLIR_MODEL_SUITE_REL_PATH)
+                                TENSORFLOW_MODEL_SUITE_REL_PATH)
 
   # Extract the model name from the directory path. This uses the relative
   # path under the root model directory. If there are multiple segments,
@@ -137,9 +137,8 @@
     model_tags = [re.sub(r"\W+", "-", rest)]
   else:
     # Tags coming from the name itself.
-    rest = re.sub(r"\W+", "-", rest).split("-")
-    model_name = rest[0]
-    model_tags = rest[1:]
+    model_name, rest = rest.split("-", 1)
+    model_tags = rest.split(",")
 
   # Extract benchmark info from the directory path following convention:
   #   <iree-driver>__<target-architecture>__<benchmark_mode>
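+  #   e.g., iree-dylib__CPU-ARM64-v8A__3-thread,big-core,full-inference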
@@ -149,7 +148,7 @@
   return BenchmarkInfo(model_name=model_name,
                        model_tags=model_tags,
                        model_source="TensorFlow",
-                       bench_mode=bench_mode,
+                       bench_mode=bench_mode.split(","),
                        runner=iree_driver,
                        device_info=device_info)
 
@@ -170,7 +169,7 @@
   gpu_target_arch = GPU_NAME_TO_TARGET_ARCH_MAP[device_info.gpu_name.lower()]
 
   model_root_dir = os.path.join(root_build_dir, BENCHMARK_SUITE_REL_PATH,
-                                MLIR_MODEL_SUITE_REL_PATH)
+                                TENSORFLOW_MODEL_SUITE_REL_PATH)
   matched_benchmarks = []
 
   # Go over all benchmarks in the model directory to find those matching the
@@ -225,7 +224,7 @@
                                           verbose=verbose)
 
   model_root_dir = os.path.join(root_build_dir, BENCHMARK_SUITE_REL_PATH,
-                                MLIR_MODEL_SUITE_REL_PATH)
+                                TENSORFLOW_MODEL_SUITE_REL_PATH)
 
   results = []
 
@@ -244,6 +243,8 @@
                                                 verbose=verbose)
 
     cmd = [
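+        # Pin the benchmark process to specific CPU cores with taskset to
+        # reduce run-to-run variance (mask deduced from benchmark mode tags).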
+        "taskset",
+        benchmark_info.deduce_taskset(),
         android_tool_path,
         f"--flagfile={android_flagfile_path}",
         f"--benchmark_repetitions={BENCHMARK_REPETITIONS}",
@@ -294,6 +295,12 @@
                       dest="output",
                       default=None,
                       help="Path to the ouput file")
+  parser.add_argument(
+      "--no-clean",
+      action="store_true",
+      help=
+      "Do not clean up the temporary directory used for benchmarking on the Android device"
+  )
   parser.add_argument("--verbose",
                       action="store_true",
                       help="Print internal information during execution")
@@ -342,8 +349,11 @@
     print(results.commit)
     print(results.benchmarks)
 
-  # Clear the benchmark directory on the Android device.
-  adb_execute_in_dir(["rm", "-rf", "*"], relative_dir="", verbose=args.verbose)
+  if not args.no_clean:
+    # Clear the benchmark directory on the Android device.
+    adb_execute_in_dir(["rm", "-rf", "*"],
+                       relative_dir="",
+                       verbose=args.verbose)
 
 
 if __name__ == "__main__":
diff --git a/build_tools/cmake/iree_mlir_benchmark_suite.cmake b/build_tools/cmake/iree_mlir_benchmark_suite.cmake
new file mode 100644
index 0000000..b5761f5
--- /dev/null
+++ b/build_tools/cmake/iree_mlir_benchmark_suite.cmake
@@ -0,0 +1,224 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# iree_check_lists_have_same_size()
+#
+# Note that the caller should pass in the list variables themselves to
+# LIST1 and LIST2, not the list variables' values.
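+#
+# For example, iree_check_lists_have_same_size(_NAMES _TAGS) checks the
+# lengths of the list variables _NAMES and _TAGS.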
+function(iree_check_lists_have_same_size LIST1 LIST2)
+  list(LENGTH "${LIST1}" _LIST1_COUNT)
+  list(LENGTH "${LIST2}" _LIST2_COUNT)
+  if(NOT _LIST1_COUNT EQUAL _LIST2_COUNT)
+    message(SEND_ERROR "${LIST1} count ${_LIST1_COUNT} does not "
+                       "match ${LIST2} count ${_LIST2_COUNT}"
+    )
+  endif()
+endfunction()
+
+# iree_mlir_benchmark_suite()
+#
+# Generates benchmark suites for MLIR input modules. The generated artifacts
+# will be executed with `iree-benchmark-module`.
+#
+# Parameters:
+#   MODULE_NAMES: A list of input module names.
+#   MODULE_TAGS: A list of tags for each input module.
+#   MODULE_SOURCES: The source framework for each input module, e.g.,
+#       'TensorFlow'.
+#   MLIR_SOURCES: The input file for each input module. It can be a file
+#       checked into the repository, or a URL to download from the web.
+#       When it's a URL, the file should be a direct .mlir file or a
+#       tarball containing a .mlir file; in both cases, the .mlir file
+#       should have a name matching the one in MODULE_NAMES.
+#   ENTRY_FUNCTIONS: The entry function name for each input module.
+#   FUNCTION_INPUTS: A list of entry function inputs for each input module.
+#   BENCHMARK_MODE: A comma-separated list of benchmark mode tags.
+#   TARGET_BACKEND: The compiler target backend.
+#   TARGET_ARCHITECTURE: The detailed target backend's architecture.
+#   TRANSLATION_FLAGS: A list of command-line options and their values to
+#       pass to the IREE translation tool for artifact generation.
+#   DRIVER: The runtime driver.
+#   RUNTIME_FLAGS: A list of command-line options and their values to pass
+#       to the IREE runtime during benchmark execution.
+#
+# The above parameters largely fall into two categories: 1) for specifying
+# the MLIR input module and its metadata, 2) for specifying the translation/
+# runtime configuration.
+#
+# 1)
+#
+# MODULE_NAMES, MODULE_TAGS, MODULE_SOURCES, MLIR_SOURCES, ENTRY_FUNCTIONS,
+# and FUNCTION_INPUTS together provide good flexibility for specifying the MLIR
+# input module and its metadata. For example, we can generate modules with
+# identical names from different sources (TensorFlow, TFLite, PyTorch, etc.),
+# and we can transform the same input module differently for benchmarking
+# different aspects like fp32 vs. fp16.
+#
+# Note that the above parameters are all lists and they should have the same
+# number of elements. This enables us to use the same CMake function call to
+# generate benchmarks for many models and share the specification of
+# translation/runtime configurations.
+#
+# 2)
+#
+# TARGET_BACKEND and TRANSLATION_FLAGS control how the input module will be
+# converted into the final IREE deployable module format. DRIVER and
+# RUNTIME_FLAGS specify how the module will be executed. BENCHMARK_MODE
+# can be used to give descriptions of the translation/runtime configuration
+# (e.g., full-inference vs. kernel-execution) and specify more contextual
+# requirements (e.g., big-core vs. little-core).
+#
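+# As a concrete example, a MobileNetV2 benchmark case would generate its
+# artifacts (a compiled.vmfb module and a flagfile) under a directory like:
+#   <build-dir>/benchmark_suites/TensorFlow/MobileNetV2-fp32,imagenet/
+#     iree-dylib__CPU-ARM64-v8A__3-thread,big-core,full-inference
+#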
+function(iree_mlir_benchmark_suite)
+  if(NOT IREE_BUILD_BENCHMARKS)
+    return()
+  endif()
+
+  cmake_parse_arguments(
+    PARSE_ARGV 0
+    _RULE
+    ""
+    "BENCHMARK_MODE;DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
+    "ENTRY_FUNCTIONS;FUNCTION_INPUTS;MLIR_SOURCES;MODULE_NAMES;MODULE_SOURCES;MODULE_TAGS;TRANSLATION_FLAGS;RUNTIME_FLAGS"
+  )
+
+  iree_check_lists_have_same_size(_RULE_MODULE_NAMES _RULE_MODULE_TAGS)
+  iree_check_lists_have_same_size(_RULE_MODULE_NAMES _RULE_MODULE_SOURCES)
+  iree_check_lists_have_same_size(_RULE_MODULE_NAMES _RULE_MLIR_SOURCES)
+  iree_check_lists_have_same_size(_RULE_MODULE_NAMES _RULE_ENTRY_FUNCTIONS)
+  iree_check_lists_have_same_size(_RULE_MODULE_NAMES _RULE_FUNCTION_INPUTS)
+
+  # Loop over all modules and their sources to create targets.
+  list(LENGTH _RULE_MODULE_NAMES _MODULE_NAMES_COUNT)
+  math(EXPR _MAX_INDEX "${_MODULE_NAMES_COUNT} - 1")
+  foreach(_INDEX RANGE 0 "${_MAX_INDEX}")
+    # Generate all benchmarks into the root build directory. This makes it
+    # easy to discover and execute them on devices.
+    list(GET _RULE_MODULE_SOURCES ${_INDEX} _MODULE_SOURCE)
+    set(_ROOT_ARTIFACTS_DIR "${IREE_BINARY_DIR}/benchmark_suites/${_MODULE_SOURCE}")
+
+    list(GET _RULE_MODULE_NAMES ${_INDEX} _MODULE_NAME)
+    list(GET _RULE_MODULE_TAGS ${_INDEX} _MODULE_TAGS)
+    list(GET _RULE_MLIR_SOURCES ${_INDEX} _MLIR_SOURCE)
+    list(GET _RULE_ENTRY_FUNCTIONS ${_INDEX} _ENTRY_FUNCTION)
+    list(GET _RULE_FUNCTION_INPUTS ${_INDEX} _FUNCTION_INPUTS)
+
+    # The source file used to generate benchmark artifacts.
+    set(_SOURCE_FILE "${_MLIR_SOURCE}")
+    # The CMake target's name if we need to download from the web.
+    set(_DOWNLOAD_TARGET_NAME "")
+
+    # If the source file is from the web, create a custom command to download
+    # it, and wrap that with a custom target so we can depend on it later.
+    #
+    # Note: We actually should not do this; instead, we should directly
+    # compile from the initial source (i.e., TensorFlow Python models). But
+    # that is tangled with the pending Python testing infrastructure revamp,
+    # so we'd prefer not to do it right now.
+    if("${_MLIR_SOURCE}" MATCHES "^https?://")
+      # Update the source file to the downloaded-to place.
+      string(REPLACE "/" ";" _SOURCE_URL_SEGMENTS "${_MLIR_SOURCE}")
+      # TODO: we can do `list(POP_BACK _SOURCE_URL_SEGMENTS _LAST_URL_SEGMENT)`
+      # after migrating to CMake 3.15.
+      list(LENGTH _SOURCE_URL_SEGMENTS _URL_SEGMENT_COUNT)
+      math(EXPR _SEGMENT_LAST_INDEX "${_URL_SEGMENT_COUNT} - 1")
+      list(GET _SOURCE_URL_SEGMENTS ${_SEGMENT_LAST_INDEX} _LAST_URL_SEGMENT)
+      set(_DOWNLOAD_TARGET_NAME "iree-download-benchmark-source-${_LAST_URL_SEGMENT}")
+
+      string(REPLACE "tar.gz" "mlir" _FILE_NAME "${_LAST_URL_SEGMENT}")
+      set(_SOURCE_FILE "${_ROOT_ARTIFACTS_DIR}/${_MODULE_NAME}.mlir")
+
+      if(NOT TARGET "${_DOWNLOAD_TARGET_NAME}")
+        add_custom_command(
+          OUTPUT "${_SOURCE_FILE}"
+          COMMAND
+            "${Python3_EXECUTABLE}" "${IREE_ROOT_DIR}/scripts/download_file.py"
+            "${_MLIR_SOURCE}" -o "${_ROOT_ARTIFACTS_DIR}"
+          DEPENDS
+            "${IREE_ROOT_DIR}/scripts/download_file.py"
+          COMMENT "Downloading ${_MLIR_SOURCE}"
+        )
+        add_custom_target("${_DOWNLOAD_TARGET_NAME}"
+          DEPENDS "${_SOURCE_FILE}"
+        )
+      endif()
+    endif()
+
+    # Next create the command and target for compiling the input module into
+    # IREE deployable format.
+    string(JOIN "-" _MODULE_DIR_NAME "${_MODULE_NAME}" "${_MODULE_TAGS}")
+    set(_BENCHMARK_DIR_NAME
+        "iree-${_RULE_DRIVER}__${_RULE_TARGET_ARCHITECTURE}__${_RULE_BENCHMARK_MODE}")
+    set(_ARTIFACTS_DIR "${_ROOT_ARTIFACTS_DIR}/${_MODULE_DIR_NAME}/${_BENCHMARK_DIR_NAME}")
+
+    set(_TRANSLATION_ARGS "--iree-mlir-to-vm-bytecode-module")
+    list(APPEND _TRANSLATION_ARGS "--iree-hal-target-backends=${_RULE_TARGET_BACKEND}")
+    list(APPEND _TRANSLATION_ARGS ${_RULE_TRANSLATION_FLAGS})
+
+    set(_VMFB_FILE "${_ARTIFACTS_DIR}/compiled.vmfb")
+    add_custom_command(
+      OUTPUT "${_VMFB_FILE}"
+      COMMAND
+        "$<TARGET_FILE:iree_tools_iree-translate>"
+          ${_TRANSLATION_ARGS}
+          "${_SOURCE_FILE}"
+          -o "${_VMFB_FILE}"
+      WORKING_DIRECTORY "${_ARTIFACTS_DIR}"
+      DEPENDS
+        iree_tools_iree-translate
+        "${_DOWNLOAD_TARGET_NAME}"
+      COMMENT "Generating ${_VMFB_FILE}"
+    )
+
+    set(_COMMON_NAME_SEGMENTS "${_MODULE_NAME}")
+    string(REPLACE "," "-" _TAGS "${_MODULE_TAGS}")
+    string(REPLACE "," "-" _MODE "${_RULE_BENCHMARK_MODE}")
+    list(APPEND _COMMON_NAME_SEGMENTS
+         "${_TAGS}" "${_MODE}" "${_RULE_TARGET_BACKEND}"
+         "${_RULE_TARGET_ARCHITECTURE}")
+
+    # Construct the benchmark artifact generation target name, which is the
+    # module name followed by tags, benchmark mode, target backend, and
+    # target architecture.
+    set(_TRANSLATION_TARGET_NAME_LIST "iree-generate-benchmark-artifact")
+    list(APPEND _TRANSLATION_TARGET_NAME_LIST ${_COMMON_NAME_SEGMENTS})
+    list(JOIN _TRANSLATION_TARGET_NAME_LIST "__" _TRANSLATION_TARGET_NAME)
+
+    add_custom_target("${_TRANSLATION_TARGET_NAME}"
+      DEPENDS "${_VMFB_FILE}"
+    )
+
+    # Mark dependency so that we have one target to drive them all.
+    add_dependencies(iree-benchmark-suites "${_TRANSLATION_TARGET_NAME}")
+
+    # Finally create the command and target for the flagfile used to execute the
+    # generated artifacts.
+    set(_FLAG_FILE "${_ARTIFACTS_DIR}/flagfile")
+    set(_ADDITIONAL_ARGS_CL "--additional_args=\"${_RULE_RUNTIME_FLAGS}\"")
+    add_custom_command(
+      OUTPUT "${_FLAG_FILE}"
+      COMMAND
+        "${Python3_EXECUTABLE}" "${IREE_ROOT_DIR}/scripts/generate_flagfile.py"
+          --module_file=compiled.vmfb
+          --driver=${_RULE_DRIVER}
+          --entry_function=${_ENTRY_FUNCTION}
+          --function_inputs=${_FUNCTION_INPUTS}
+          "${_ADDITIONAL_ARGS_CL}"
+          -o "${_FLAG_FILE}"
+      DEPENDS
+        "${IREE_ROOT_DIR}/scripts/generate_flagfile.py"
+      WORKING_DIRECTORY "${_ARTIFACTS_DIR}"
+      COMMENT "Generating ${_FLAG_FILE}"
+    )
+
+    set(_FLAGFILE_GEN_TARGET_NAME_LIST "iree-generate-benchmark-flagfile")
+    list(APPEND _FLAGFILE_GEN_TARGET_NAME_LIST ${_COMMON_NAME_SEGMENTS})
+    list(JOIN _FLAGFILE_GEN_TARGET_NAME_LIST "__" _FLAGFILE_GEN_TARGET_NAME)
+
+    add_custom_target("${_FLAGFILE_GEN_TARGET_NAME}"
+      DEPENDS "${_FLAG_FILE}"
+    )
+
+    # Mark dependency so that we have one target to drive them all.
+    add_dependencies(iree-benchmark-suites "${_FLAGFILE_GEN_TARGET_NAME}")
+  endforeach()
+endfunction()
diff --git a/iree/test/model_benchmarks/CMakeLists.txt b/iree/test/model_benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..218f6f1
--- /dev/null
+++ b/iree/test/model_benchmarks/CMakeLists.txt
@@ -0,0 +1,213 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+################################################################################
+#                                                                              #
+# Benchmark models                                                             #
+#                                                                              #
+# All the lists should have the same number of elements. Each list describes   #
+# one aspect of the model. Elements at the same index are for the same model.  #
+#                                                                              #
+# Normally models to be benchmarked should be placed here because all          #
+# benchmark cases will be enabled for them. There might exist cases where we   #
+# cannot enable all the benchmark cases for one model; then they should be     #
+# placed directly in the cmake function calls in the next section.             #
+#                                                                              #
+################################################################################
+
+set(BENCHMARK_MODULE_NAMES
+  "MobileNetV2"
+  "MobileNetV3Small"
+)
+
+# Each element is a comma-separated list.
+set(BENCHMARK_MODULE_TAGS
+  "fp32,imagenet"
+  "fp32,imagenet"
+)
+
+set(BENCHMARK_MODULE_SOURCES
+  "TensorFlow"
+  "TensorFlow"
+)
+
+set(BENCHMARK_MLIR_SOURCES
+  "https://storage.googleapis.com/iree-model-artifacts/MobileNetV2-b0c5c584.tar.gz"
+  "https://storage.googleapis.com/iree-model-artifacts/MobileNetV3Small-b0c5c584.tar.gz"
+)
+
+set(BENCHMARK_ENTRY_FUNCTIONS
+  "call"
+  "call"
+)
+
+# Each element is a comma-separated list.
+set(BENCHMARK_FUNCTION_INPUTS
+  "1x224x224x3xf32"
+  "1x224x224x3xf32"
+)
+
+################################################################################
+#                                                                              #
+# Benchmark cases                                                              #
+#                                                                              #
+################################################################################
+
+# CPU, Dylib-Sync, 1-thread, big-core, full-inference
+iree_mlir_benchmark_suite(
+  MODULE_NAMES
+    ${BENCHMARK_MODULE_NAMES}
+  MODULE_TAGS
+    ${BENCHMARK_MODULE_TAGS}
+  MODULE_SOURCES
+    ${BENCHMARK_MODULE_SOURCES}
+  MLIR_SOURCES
+    ${BENCHMARK_MLIR_SOURCES}
+  ENTRY_FUNCTIONS
+    ${BENCHMARK_ENTRY_FUNCTIONS}
+  FUNCTION_INPUTS
+    ${BENCHMARK_FUNCTION_INPUTS}
+
+  BENCHMARK_MODE
+    "1-thread,big-core,full-inference"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    "--iree-llvm-target-triple=aarch64-none-linux-android29"
+    "--iree-flow-inline-constants-max-byte-length=2048"
+    "--iree-flow-dispatch-formation-enable-operand-fusion"
+    "-iree-llvm-loop-unrolling=true"
+  DRIVER
+    "dylib-sync"
+)
+
+# CPU, Dylib, 3-thread, big-core, full-inference
+iree_mlir_benchmark_suite(
+  MODULE_NAMES
+    ${BENCHMARK_MODULE_NAMES}
+  MODULE_TAGS
+    ${BENCHMARK_MODULE_TAGS}
+  MODULE_SOURCES
+    ${BENCHMARK_MODULE_SOURCES}
+  MLIR_SOURCES
+    ${BENCHMARK_MLIR_SOURCES}
+  ENTRY_FUNCTIONS
+    ${BENCHMARK_ENTRY_FUNCTIONS}
+  FUNCTION_INPUTS
+    ${BENCHMARK_FUNCTION_INPUTS}
+
+  BENCHMARK_MODE
+    "3-thread,big-core,full-inference"
+  TARGET_BACKEND
+    "dylib-llvm-aot"
+  TARGET_ARCHITECTURE
+    "CPU-ARM64-v8A"
+  TRANSLATION_FLAGS
+    "--iree-llvm-target-triple=aarch64-none-linux-android29"
+    "--iree-flow-inline-constants-max-byte-length=2048"
+    "--iree-flow-dispatch-formation-enable-operand-fusion"
+    "-iree-llvm-loop-unrolling=true"
+  DRIVER
+    "dylib"
+  RUNTIME_FLAGS
+    "--task_topology_group_count=3"
+)
+
+# GPU, Vulkan, Adreno, full-inference
+iree_mlir_benchmark_suite(
+  MODULE_NAMES
+    ${BENCHMARK_MODULE_NAMES}
+  MODULE_TAGS
+    ${BENCHMARK_MODULE_TAGS}
+  MODULE_SOURCES
+    ${BENCHMARK_MODULE_SOURCES}
+  MLIR_SOURCES
+    ${BENCHMARK_MLIR_SOURCES}
+  ENTRY_FUNCTIONS
+    ${BENCHMARK_ENTRY_FUNCTIONS}
+  FUNCTION_INPUTS
+    ${BENCHMARK_FUNCTION_INPUTS}
+
+  BENCHMARK_MODE
+    "full-inference"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Adreno"
+  TRANSLATION_FLAGS
+    "--iree-vulkan-target-triple=adreno-unknown-android11"
+    "--iree-flow-inline-constants-max-byte-length=2048"
+    "--iree-flow-dispatch-formation-enable-operand-fusion"
+    "--iree-enable-fusion-with-reduction-ops"
+  DRIVER
+    "vulkan"
+)
+
+# GPU, Vulkan, Mali, full-inference
+iree_mlir_benchmark_suite(
+  MODULE_NAMES
+    ${BENCHMARK_MODULE_NAMES}
+  MODULE_TAGS
+    ${BENCHMARK_MODULE_TAGS}
+  MODULE_SOURCES
+    ${BENCHMARK_MODULE_SOURCES}
+  MLIR_SOURCES
+    ${BENCHMARK_MLIR_SOURCES}
+  ENTRY_FUNCTIONS
+    ${BENCHMARK_ENTRY_FUNCTIONS}
+  FUNCTION_INPUTS
+    ${BENCHMARK_FUNCTION_INPUTS}
+
+  BENCHMARK_MODE
+    "full-inference"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    "--iree-vulkan-target-triple=valhall-unknown-android11"
+    "--iree-flow-inline-constants-max-byte-length=16"
+    "--iree-flow-dispatch-formation-enable-operand-fusion"
+    "--iree-enable-fusion-with-reduction-ops"
+  DRIVER
+    "vulkan"
+)
+
+# GPU, Vulkan, Mali, kernel-execution
+iree_mlir_benchmark_suite(
+  MODULE_NAMES
+    ${BENCHMARK_MODULE_NAMES}
+  MODULE_TAGS
+    ${BENCHMARK_MODULE_TAGS}
+  MODULE_SOURCES
+    ${BENCHMARK_MODULE_SOURCES}
+  MLIR_SOURCES
+    ${BENCHMARK_MLIR_SOURCES}
+  ENTRY_FUNCTIONS
+    ${BENCHMARK_ENTRY_FUNCTIONS}
+  FUNCTION_INPUTS
+    ${BENCHMARK_FUNCTION_INPUTS}
+
+  BENCHMARK_MODE
+    "kernel-execution"
+  TARGET_BACKEND
+    "vulkan-spirv"
+  TARGET_ARCHITECTURE
+    "GPU-Mali-Valhall"
+  TRANSLATION_FLAGS
+    "--iree-vulkan-target-triple=valhall-unknown-android11"
+    "--iree-flow-inline-constants-max-byte-length=16"
+    "--iree-flow-dispatch-formation-enable-operand-fusion"
+    "--iree-enable-fusion-with-reduction-ops"
+    "--iree-hal-benchmark-dispatch-repeat-count=32"
+  DRIVER
+    "vulkan"
+  RUNTIME_FLAGS
+    "--batch_size=32"
+)
diff --git a/scripts/download_file.py b/scripts/download_file.py
new file mode 100755
index 0000000..0e9a36d
--- /dev/null
+++ b/scripts/download_file.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Downloads a file from the web and untars it if necessary."""
+
+import argparse
+import os
+import requests
+import tarfile
+
+
+def parse_arguments():
+  """Parses command line arguments."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument("source_url",
+                      type=str,
+                      metavar="<source-url>",
+                      help="Source URL to download")
+  parser.add_argument("-o",
+                      "--output",
+                      type=str,
+                      required=True,
+                      metavar="<output-directory>",
+                      help="Output directory to contain the file")
+  return parser.parse_args()
+
+
+def main(args):
+  name = args.source_url.split("/")[-1]
+
+  if not os.path.isdir(args.output):
+    os.makedirs(args.output)
+  output_file = os.path.join(args.output, name)
+
+  response = requests.get(args.source_url)
+  if response.status_code != 200:
+    raise requests.RequestException(
+        f"Failed to download file with status code {response.status_code}")
+
+  with open(output_file, "wb") as f:
+    f.write(response.content)
+
+  if name.endswith("tar.gz") or name.endswith("tgz"):
+    with tarfile.open(output_file, "r") as f:
+      f.extractall(args.output)
+
+
+if __name__ == "__main__":
+  main(parse_arguments())
diff --git a/scripts/generate_flagfile.py b/scripts/generate_flagfile.py
new file mode 100755
index 0000000..87c5ef1
--- /dev/null
+++ b/scripts/generate_flagfile.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Generates a flagfile for iree-benchmark-module."""
+
+import argparse
+import os
+
+
+def parse_arguments():
+  """Parses command line arguments."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--module_file",
+                      type=str,
+                      required=True,
+                      metavar="<module-file>",
+                      help="The name of the module file")
+  parser.add_argument("--driver",
+                      type=str,
+                      required=True,
+                      metavar="<driver>",
+                      help="The name of the IREE driver")
+  parser.add_argument("--entry_function",
+                      type=str,
+                      required=True,
+                      metavar="<entry-function>",
+                      help="The name of the entry function")
+  parser.add_argument("--function_inputs",
+                      type=str,
+                      required=True,
+                      metavar="<function-inputs>",
+                      help="A list of comma-separated function inputs")
+  parser.add_argument("--additional_args",
+                      type=str,
+                      required=True,
+                      metavar="<additional-cl-args>",
+                      help="Additional command-line arguments")
+  parser.add_argument("-o",
+                      "--output",
+                      type=str,
+                      required=True,
+                      metavar="<output-file>",
+                      help="Output file to write to")
+  return parser.parse_args()
+
+
+def main(args):
+  lines = [
+      f"--driver={args.driver}", f"--module_file={args.module_file}",
+      f"--entry_function={args.entry_function}"
+  ]
+  lines.extend([
+      ("--function_input=" + e) for e in args.function_inputs.split(",")
+  ])
+  lines.extend(args.additional_args.split(";"))
+  content = "\n".join(lines) + "\n"
+
+  with open(args.output, "w") as f:
+    f.write(content)
+
+
+if __name__ == "__main__":
+  main(parse_arguments())