Support fetching and streaming artifacts in benchmark tools (#15432)

This change adds support for a URL as the e2e-test-artifacts-dir
source. This allows benchmark tools to pull only the needed files and,
in some cases, stream the artifacts directly to the target device.
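
As a minimal sketch of the new usage (the bucket name and local paths are
placeholders, and other required flags are elided), the benchmark runner can
now point at an HTTP(S) mirror of the GCS artifact directory instead of a
pre-downloaded local copy:

    ./build_tools/benchmarks/run_benchmarks_on_linux.py \
      --e2e_test_artifacts_dir="https://storage.googleapis.com/<bucket>/e2e-test-artifacts" \
      --tmp_dir=/tmp/iree-benchmarks \
      ...

Internally, the new ResourceLocation type carries either a local path or a
URL, and its "/" operator appends sub-paths to both, e.g.
ResourceLocation.build_url("https://example.com/testdata") / "module.vmfb"
resolves to "https://example.com/testdata/module.vmfb". When a URL is given,
each module is fetched on demand (and, on Android, streamed to the device
over the ADB sync protocol) instead of downloading the whole artifact tree
up front.
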
diff --git a/.github/workflows/benchmark_execution.yml b/.github/workflows/benchmark_execution.yml
index 4c84620..d428777 100644
--- a/.github/workflows/benchmark_execution.yml
+++ b/.github/workflows/benchmark_execution.yml
@@ -127,7 +127,6 @@
       SHARD_COUNT: ${{ matrix.benchmark.shard.count }}
       PLATFORM_ARCH: ${{ matrix.benchmark.host_environment.platform }}-${{ matrix.benchmark.host_environment.architecture }}
       E2E_TEST_ARTIFACTS_GCS_ARTIFACT_DIR: ${{ inputs.e2e-test-artifacts-gcs-artifact-dir }}
-      E2E_TEST_ARTIFACTS_DIR: ${{ inputs.e2e-test-artifacts-dir }}
       BENCHMARK_RESULTS_DIR: benchmark-results
     outputs:
       benchmark-results-dir: ${{ env.BENCHMARK_RESULTS_DIR }}
@@ -153,14 +152,6 @@
         id: download-assets
         run: |
           gcloud storage cp "${BENCHMARK_CONFIG_GCS_ARTIFACT}" "${BENCHMARK_CONFIG}"
-          mkdir -p "${E2E_TEST_ARTIFACTS_DIR}"
-          jq -r \
-            --arg DEVICE_NAME "${DEVICE_NAME}" \
-            --arg SHARD_INDEX "${SHARD_INDEX}" \
-            --arg GCS_ARTIFACT_DIR "${E2E_TEST_ARTIFACTS_GCS_ARTIFACT_DIR}" \
-            '.[$DEVICE_NAME].shards[($SHARD_INDEX | tonumber)] | .module_dir_paths[] | "\($GCS_ARTIFACT_DIR)/\(.)"' \
-            "${BENCHMARK_CONFIG}" | \
-            gcloud storage cp -r --read-paths-from-stdin "${E2E_TEST_ARTIFACTS_DIR}"
           echo "benchmark-config=${BENCHMARK_CONFIG}" >> "${GITHUB_OUTPUT}"
       - name: "Unpacking benchmark tools"
         id: unpack-tools
@@ -191,11 +182,11 @@
           IREE_TRACY_CAPTURE_TOOL: ${{ steps.unpack-tools.outputs.tracy-capture-tool }}
           IREE_TARGET_DEVICE_NAME: ${{ env.DEVICE_NAME }}
           IREE_SHARD_INDEX: ${{ matrix.benchmark.shard.index }}
-          IREE_E2E_TEST_ARTIFACTS_DIR: ${{ env.E2E_TEST_ARTIFACTS_DIR }}
           IREE_BENCHMARK_RESULTS: ${{ env.BENCHMARK_RESULTS_DIR }}/benchmark-results-${{ matrix.benchmark.device_name }}${{ steps.sharding.outputs.suffix }}.json
           IREE_BENCHMARK_TRACES: ${{ env.BENCHMARK_RESULTS_DIR }}/benchmark-traces-${{ matrix.benchmark.device_name }}${{ steps.sharding.outputs.suffix }}.tar.gz
         run: |
           mkdir -p ${BENCHMARK_RESULTS_DIR}
+          export IREE_E2E_TEST_ARTIFACTS_DIR="${E2E_TEST_ARTIFACTS_GCS_ARTIFACT_DIR/gs:\/\//https://storage.googleapis.com/}"
           ./build_tools/benchmarks/run_benchmarks.sh
       - name: "Uploading benchmark results"
         run: gcloud storage cp -r "${BENCHMARK_RESULTS_DIR}" "${GCS_DIR}/"
diff --git a/build_tools/benchmarks/common/benchmark_config.py b/build_tools/benchmarks/common/benchmark_config.py
index a86ef0f..fcf44de 100644
--- a/build_tools/benchmarks/common/benchmark_config.py
+++ b/build_tools/benchmarks/common/benchmark_config.py
@@ -9,6 +9,9 @@
 from dataclasses import dataclass
 from typing import Optional
 import pathlib
+import re
+
+from common import benchmark_definition
 
 BENCHMARK_RESULTS_REL_PATH = "benchmark-results"
 CAPTURES_REL_PATH = "captures"
@@ -35,8 +38,9 @@
 class BenchmarkConfig:
     """Represents the settings to run benchmarks.
 
-    root_benchmark_dir: the root directory containing the built benchmark
-      suites.
+    tmp_dir: per-commit temporary directory.
+    root_benchmark_dir: the root directory path/URL containing the built
+      benchmark suites.
     benchmark_results_dir: the directory to store benchmark results files.
     git_commit_hash: the git commit hash.
     normal_benchmark_tool_dir: the path to the non-traced benchmark tool
@@ -59,7 +63,8 @@
     verify: verify the output if model's expected output is available.
     """
 
-    root_benchmark_dir: pathlib.Path
+    tmp_dir: pathlib.Path
+    root_benchmark_dir: benchmark_definition.ResourceLocation
     benchmark_results_dir: pathlib.Path
     git_commit_hash: str
 
@@ -113,8 +118,20 @@
                 capture_tmp_dir=per_commit_tmp_dir / CAPTURES_REL_PATH,
             )
 
+        root_benchmark_dir = args.e2e_test_artifacts_dir
+        # Treat values with a URL scheme as URLs; otherwise as local paths.
+        if re.match("^[^:]+://", str(root_benchmark_dir)):
+            root_benchmark_dir = benchmark_definition.ResourceLocation.build_url(
+                root_benchmark_dir
+            )
+        else:
+            root_benchmark_dir = benchmark_definition.ResourceLocation.build_local_path(
+                root_benchmark_dir
+            )
+
         return BenchmarkConfig(
-            root_benchmark_dir=args.e2e_test_artifacts_dir,
+            tmp_dir=per_commit_tmp_dir,
+            root_benchmark_dir=root_benchmark_dir,
             benchmark_results_dir=per_commit_tmp_dir / BENCHMARK_RESULTS_REL_PATH,
             git_commit_hash=git_commit_hash,
             normal_benchmark_tool_dir=real_path_or_none(args.normal_benchmark_tool_dir),
diff --git a/build_tools/benchmarks/common/benchmark_config_test.py b/build_tools/benchmarks/common/benchmark_config_test.py
index 747982b..e90454d 100644
--- a/build_tools/benchmarks/common/benchmark_config_test.py
+++ b/build_tools/benchmarks/common/benchmark_config_test.py
@@ -6,12 +6,10 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import pathlib
-import stat
 import unittest
 import tempfile
-import os
 
-from common import benchmark_config, common_arguments
+from common import benchmark_config, benchmark_definition, common_arguments
 
 
 class BenchmarkConfigTest(unittest.TestCase):
@@ -69,7 +67,10 @@
             capture_tmp_dir=per_commit_tmp_dir / "captures",
         )
         expected_config = benchmark_config.BenchmarkConfig(
-            root_benchmark_dir=self.e2e_test_artifacts_dir,
+            tmp_dir=per_commit_tmp_dir,
+            root_benchmark_dir=benchmark_definition.ResourceLocation.build_local_path(
+                self.e2e_test_artifacts_dir
+            ),
             benchmark_results_dir=per_commit_tmp_dir / "benchmark-results",
             git_commit_hash="abcd",
             normal_benchmark_tool_dir=self.normal_tool_dir,
@@ -101,6 +102,25 @@
 
         self.assertIsNone(config.trace_capture_config)
 
+    def test_build_from_args_with_test_artifacts_dir_url(self):
+        args = common_arguments.Parser().parse_args(
+            [
+                f"--tmp_dir={self.tmp_dir}",
+                f"--normal_benchmark_tool_dir={self.normal_tool_dir}",
+                f"--e2e_test_artifacts_dir=https://example.com/testdata",
+                f"--execution_benchmark_config={self.execution_config}",
+                "--target_device=test",
+            ]
+        )
+
+        config = benchmark_config.BenchmarkConfig.build_from_args(
+            args=args, git_commit_hash="abcd"
+        )
+
+        self.assertEqual(
+            config.root_benchmark_dir.get_url(), "https://example.com/testdata"
+        )
+
     def test_build_from_args_invalid_capture_args(self):
         args = common_arguments.Parser().parse_args(
             [
diff --git a/build_tools/benchmarks/common/benchmark_definition.py b/build_tools/benchmarks/common/benchmark_definition.py
index 6707818..f6d48c3 100644
--- a/build_tools/benchmarks/common/benchmark_definition.py
+++ b/build_tools/benchmarks/common/benchmark_definition.py
@@ -14,10 +14,12 @@
 import pathlib
 import re
 import subprocess
+import urllib.parse
+import urllib.request
 
 import dataclasses
 from enum import Enum
-from typing import Any, Dict, List, Optional, Sequence, Tuple
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
 
 from e2e_test_framework.definitions import common_definitions
 
@@ -41,6 +43,43 @@
 }
 
 
+@dataclasses.dataclass(frozen=True)
+class ResourceLocation:
+    """Class to represent either local resource path or an URL."""
+
+    local_path: Optional[pathlib.Path]
+    url: Optional[str]
+
+    def get_local_path(self) -> Optional[pathlib.Path]:
+        """Returns the local path or None if it is an URL."""
+        return self.local_path
+
+    def get_url(self) -> Optional[str]:
+        """Returns the URL or None if it is a local path."""
+        return self.url
+
+    def __truediv__(self, sub_path: Union[str, pathlib.PurePath]) -> "ResourceLocation":
+        """Appends the sub path and returns the new location."""
+        local_path = self.get_local_path()
+        if local_path:
+            return self.__class__.build_local_path(local_path / sub_path)
+        url = self.get_url()
+        assert url is not None
+        sub_url_path = urllib.request.pathname2url(str(sub_path))
+        # urljoin requires the directory URL to end with "/".
+        return self.__class__.build_url(urllib.parse.urljoin(url + "/", sub_url_path))
+
+    @classmethod
+    def build_local_path(cls, path: Union[pathlib.Path, str]) -> "ResourceLocation":
+        """Build from a local path."""
+        return cls(local_path=pathlib.Path(path), url=None)
+
+    @classmethod
+    def build_url(cls, url: str) -> "ResourceLocation":
+        """Build from an URL."""
+        return cls(local_path=None, url=url)
+
+
 @dataclasses.dataclass
 class DriverInfo:
     """An object describing a IREE HAL driver.
diff --git a/build_tools/benchmarks/common/benchmark_driver_test.py b/build_tools/benchmarks/common/benchmark_driver_test.py
index 95fc6f7..333b772 100644
--- a/build_tools/benchmarks/common/benchmark_driver_test.py
+++ b/build_tools/benchmarks/common/benchmark_driver_test.py
@@ -14,6 +14,7 @@
 from common import benchmark_config
 from common.benchmark_suite import BenchmarkCase, BenchmarkSuite
 from common.benchmark_driver import BenchmarkDriver
+from common import benchmark_definition
 from common.benchmark_definition import (
     IREE_DRIVERS_INFOS,
     DeviceInfo,
@@ -79,7 +80,10 @@
         self.captures_dir.mkdir()
 
         self.config = benchmark_config.BenchmarkConfig(
-            root_benchmark_dir=pathlib.Path(self._root_dir_obj.name),
+            tmp_dir=self.tmp_dir,
+            root_benchmark_dir=benchmark_definition.ResourceLocation.build_local_path(
+                self._root_dir_obj.name
+            ),
             benchmark_results_dir=self.benchmark_results_dir,
             git_commit_hash="abcd",
             normal_benchmark_tool_dir=self.tmp_dir,
@@ -163,7 +167,7 @@
             bench_mode=["sync"],
             target_arch=common_definitions.DeviceArchitecture.X86_64_CASCADELAKE,
             driver_info=IREE_DRIVERS_INFOS["iree-llvm-cpu-sync"],
-            benchmark_case_dir=pathlib.Path("case1"),
+            module_dir=benchmark_definition.ResourceLocation.build_local_path("case1"),
             benchmark_tool_name="tool",
             run_config=run_config_a,
         )
@@ -173,7 +177,7 @@
             bench_mode=["task"],
             target_arch=common_definitions.DeviceArchitecture.X86_64_CASCADELAKE,
             driver_info=IREE_DRIVERS_INFOS["iree-llvm-cpu"],
-            benchmark_case_dir=pathlib.Path("case2"),
+            module_dir=benchmark_definition.ResourceLocation.build_local_path("case2"),
             benchmark_tool_name="tool",
             run_config=run_config_b,
         )
@@ -210,7 +214,9 @@
             bench_mode=["task"],
             target_arch=common_definitions.DeviceArchitecture.RV64_GENERIC,
             driver_info=IREE_DRIVERS_INFOS["iree-llvm-cpu"],
-            benchmark_case_dir=pathlib.Path("incompatible_case"),
+            module_dir=benchmark_definition.ResourceLocation.build_local_path(
+                "incompatible_case"
+            ),
             benchmark_tool_name="tool",
             run_config=run_config_incompatible,
         )
diff --git a/build_tools/benchmarks/common/benchmark_suite.py b/build_tools/benchmarks/common/benchmark_suite.py
index bc2b21e..fa05805 100644
--- a/build_tools/benchmarks/common/benchmark_suite.py
+++ b/build_tools/benchmarks/common/benchmark_suite.py
@@ -11,18 +11,18 @@
 
 import pathlib
 import re
+import urllib.parse
+import urllib.request
 
 import dataclasses
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Sequence, Tuple
+from common import benchmark_definition
 from common.benchmark_definition import IREE_DRIVERS_INFOS, DriverInfo
 from e2e_test_artifacts import iree_artifacts
 from e2e_test_framework.definitions import common_definitions, iree_definitions
 from e2e_test_framework import serialization
 
-MODEL_FLAGFILE_NAME = "flagfile"
-MODEL_TOOLFILE_NAME = "tool"
-
 
 @dataclass
 class BenchmarkCase:
@@ -34,8 +34,8 @@
     target_arch: the target CPU/GPU architecture.
     driver_info: the IREE driver configuration.
     benchmark_tool_name: the benchmark tool, e.g., 'iree-benchmark-module'.
-    benchmark_case_dir: the path to benchmark case directory.
     run_config: the run config from e2e test framework.
+    module_dir: path/URL of the module directory.
     input_uri: URI to find the input npy.
     expected_output_uri: URI to find the expected output npy.
     """
@@ -46,8 +46,8 @@
     target_arch: common_definitions.DeviceArchitecture
     driver_info: DriverInfo
     benchmark_tool_name: str
-    benchmark_case_dir: pathlib.Path
     run_config: iree_definitions.E2EModelRunConfig
+    module_dir: benchmark_definition.ResourceLocation
     input_uri: Optional[str] = None
     expected_output_uri: Optional[str] = None
     verify_params: List[str] = dataclasses.field(default_factory=list)
@@ -174,12 +174,13 @@
     @staticmethod
     def load_from_run_configs(
         run_configs: Sequence[iree_definitions.E2EModelRunConfig],
-        root_benchmark_dir: pathlib.Path,
+        root_benchmark_dir: benchmark_definition.ResourceLocation,
     ):
         """Loads the benchmarks from the run configs.
 
         Args:
           run_configs: list of benchmark run configs.
+          root_benchmark_dir: path/URL of the root benchmark directory.
         Returns:
           A benchmark suite.
         """
@@ -202,10 +203,8 @@
             target_arch = target_device_spec.architecture
             model = module_gen_config.imported_model.model
 
-            module_dir_path = iree_artifacts.get_module_dir_path(
-                module_generation_config=module_gen_config, root_path=root_benchmark_dir
-            )
-            module_dir_path = pathlib.Path(module_dir_path)
+            module_rel_dir = iree_artifacts.get_module_dir_path(module_gen_config)
+            module_dir = root_benchmark_dir / module_rel_dir
 
             benchmark_case = BenchmarkCase(
                 model_name=model.name,
@@ -214,7 +213,7 @@
                 target_arch=target_arch,
                 driver_info=driver_info,
                 benchmark_tool_name=run_config.tool.value,
-                benchmark_case_dir=module_dir_path,
+                module_dir=module_dir,
                 input_uri=model.input_url,
                 expected_output_uri=model.expected_output_url,
                 verify_params=model.verify_params,
diff --git a/build_tools/benchmarks/common/benchmark_suite_test.py b/build_tools/benchmarks/common/benchmark_suite_test.py
index 82e2aec..b9035e2 100644
--- a/build_tools/benchmarks/common/benchmark_suite_test.py
+++ b/build_tools/benchmarks/common/benchmark_suite_test.py
@@ -7,6 +7,7 @@
 
 import pathlib
 import unittest
+from common import benchmark_definition
 from common.benchmark_definition import IREE_DRIVERS_INFOS
 from common.benchmark_suite import BenchmarkCase, BenchmarkSuite
 from e2e_test_framework.definitions import common_definitions, iree_definitions
@@ -62,7 +63,7 @@
             bench_mode=["1-thread", "full-inference"],
             target_arch=common_definitions.DeviceArchitecture.ARMV8_2_A_GENERIC,
             driver_info=IREE_DRIVERS_INFOS["iree-llvm-cpu"],
-            benchmark_case_dir=pathlib.Path("case1"),
+            module_dir=benchmark_definition.ResourceLocation.build_local_path("case1"),
             benchmark_tool_name="tool",
             run_config=dummy_run_config,
         )
@@ -72,7 +73,7 @@
             bench_mode=["full-inference"],
             target_arch=common_definitions.DeviceArchitecture.ARM_VALHALL,
             driver_info=IREE_DRIVERS_INFOS["iree-vulkan"],
-            benchmark_case_dir=pathlib.Path("case2"),
+            module_dir=benchmark_definition.ResourceLocation.build_local_path("case2"),
             benchmark_tool_name="tool",
             run_config=dummy_run_config,
         )
@@ -82,7 +83,7 @@
             bench_mode=["full-inference"],
             target_arch=common_definitions.DeviceArchitecture.X86_64_CASCADELAKE,
             driver_info=IREE_DRIVERS_INFOS["iree-llvm-cpu-sync"],
-            benchmark_case_dir=pathlib.Path("case3"),
+            module_dir=benchmark_definition.ResourceLocation.build_local_path("case3"),
             benchmark_tool_name="tool",
             run_config=dummy_run_config,
         )
@@ -215,7 +216,10 @@
         root_dir = pathlib.Path("root")
 
         suite = BenchmarkSuite.load_from_run_configs(
-            run_configs=run_configs, root_benchmark_dir=root_dir
+            run_configs=run_configs,
+            root_benchmark_dir=benchmark_definition.ResourceLocation.build_local_path(
+                root_dir
+            ),
         )
 
         loaded_run_configs = [case.run_config for case in suite.filter_benchmarks()]
@@ -248,7 +252,9 @@
                     target_arch=common_definitions.DeviceArchitecture.RV32_GENERIC,
                     driver_info=IREE_DRIVERS_INFOS["iree-llvm-cpu-sync"],
                     benchmark_tool_name="iree-benchmark-module",
-                    benchmark_case_dir=run_config_c_case_dir,
+                    module_dir=benchmark_definition.ResourceLocation.build_local_path(
+                        run_config_c_case_dir
+                    ),
                     input_uri=model_tf.input_url,
                     expected_output_uri=model_tf.expected_output_url,
                     run_config=run_config_c,
@@ -256,6 +262,73 @@
             ],
         )
 
+    def test_load_from_run_configs_with_root_url(self):
+        model_tflite = common_definitions.Model(
+            id="tflite",
+            name="model",
+            tags=[],
+            source_type=common_definitions.ModelSourceType.EXPORTED_TFLITE,
+            source_url="",
+            entry_function="predict",
+            input_types=["1xf32"],
+        )
+        exec_config_a = iree_definitions.ModuleExecutionConfig.build(
+            id="exec_a",
+            tags=["defaults"],
+            loader=iree_definitions.RuntimeLoader.EMBEDDED_ELF,
+            driver=iree_definitions.RuntimeDriver.LOCAL_SYNC,
+        )
+        device_spec_a = common_definitions.DeviceSpec.build(
+            id="dev_a",
+            device_name="a",
+            architecture=common_definitions.DeviceArchitecture.RV64_GENERIC,
+            host_environment=common_definitions.HostEnvironment.LINUX_X86_64,
+            device_parameters=[],
+            tags=[],
+        )
+        compile_target = iree_definitions.CompileTarget(
+            target_backend=iree_definitions.TargetBackend.LLVM_CPU,
+            target_architecture=common_definitions.DeviceArchitecture.RV64_GENERIC,
+            target_abi=iree_definitions.TargetABI.LINUX_GNU,
+        )
+        run_config_a = iree_definitions.E2EModelRunConfig.build(
+            module_generation_config=iree_definitions.ModuleGenerationConfig.build(
+                imported_model=iree_definitions.ImportedModel.from_model(model_tflite),
+                compile_config=iree_definitions.CompileConfig.build(
+                    id="1", tags=[], compile_targets=[compile_target]
+                ),
+            ),
+            module_execution_config=exec_config_a,
+            target_device_spec=device_spec_a,
+            input_data=common_definitions.DEFAULT_INPUT_DATA,
+            tool=iree_definitions.E2EModelRunTool.IREE_BENCHMARK_MODULE,
+        )
+
+        suite = BenchmarkSuite.load_from_run_configs(
+            run_configs=[run_config_a],
+            root_benchmark_dir=benchmark_definition.ResourceLocation.build_url(
+                "https://example.com/testdata"
+            ),
+        )
+
+        self.assertEqual(
+            suite.filter_benchmarks(),
+            [
+                BenchmarkCase(
+                    model_name=model_tflite.name,
+                    model_tags=model_tflite.tags,
+                    bench_mode=exec_config_a.tags,
+                    target_arch=common_definitions.DeviceArchitecture.RV64_GENERIC,
+                    driver_info=IREE_DRIVERS_INFOS["iree-llvm-cpu-sync"],
+                    benchmark_tool_name="iree-benchmark-module",
+                    module_dir=benchmark_definition.ResourceLocation.build_url(
+                        "https://example.com/testdata/iree_module_model_tflite___riscv_64-generic-linux_gnu-llvm_cpu___"
+                    ),
+                    run_config=run_config_a,
+                )
+            ],
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/build_tools/benchmarks/common/common_arguments.py b/build_tools/benchmarks/common/common_arguments.py
index 8b7001b..d083945 100644
--- a/build_tools/benchmarks/common/common_arguments.py
+++ b/build_tools/benchmarks/common/common_arguments.py
@@ -45,9 +45,9 @@
         self.add_argument(
             "--e2e_test_artifacts_dir",
             metavar="<e2e-test-artifacts-dir>",
-            type=_check_dir_path,
+            type=str,
             required=True,
-            help="Path to the IREE e2e test artifacts directory.",
+            help="Path/URL to the IREE e2e test artifacts directory.",
         )
 
         self.add_argument(
diff --git a/build_tools/benchmarks/run_benchmarks_on_android.py b/build_tools/benchmarks/run_benchmarks_on_android.py
index bc939d5..2cb3854 100755
--- a/build_tools/benchmarks/run_benchmarks_on_android.py
+++ b/build_tools/benchmarks/run_benchmarks_on_android.py
@@ -37,16 +37,20 @@
 
 import atexit
 import json
+import requests
 import shutil
+import socket
+import struct
 import subprocess
 import tarfile
+import time
 from typing import Any, Optional, Sequence, Tuple
 
 from common import benchmark_suite as benchmark_suite_module
 from common.benchmark_config import BenchmarkConfig
 from common.benchmark_driver import BenchmarkDriver
+from common import benchmark_definition
 from common.benchmark_definition import (
-    DriverInfo,
     execute_cmd,
     execute_cmd_and_get_stdout,
     execute_cmd_and_get_output,
@@ -55,7 +59,7 @@
     wait_for_iree_benchmark_module_start,
     parse_iree_benchmark_metrics,
 )
-from common.benchmark_suite import MODEL_FLAGFILE_NAME, BenchmarkCase, BenchmarkSuite
+from common.benchmark_suite import BenchmarkCase, BenchmarkSuite
 from common.android_device_utils import (
     get_android_device_model,
     get_android_device_info,
@@ -68,36 +72,35 @@
 
 # Root directory to perform benchmarks in on the Android device.
 ANDROID_TMPDIR = pathlib.PurePosixPath("/data/local/tmp/iree-benchmarks")
+ADB_SERVER_ADDR = ("localhost", 5037)
 
 NORMAL_TOOL_REL_DIR = pathlib.PurePosixPath("normal-tools")
 TRACED_TOOL_REL_DIR = pathlib.PurePosixPath("traced-tools")
 
 
-def adb_push_to_tmp_dir(
-    content: pathlib.Path,
-    relative_dir: pathlib.PurePosixPath = pathlib.PurePosixPath(),
+def adb_push_file(
+    source: pathlib.Path,
+    dest: pathlib.PurePosixPath,
     verbose: bool = False,
 ) -> pathlib.PurePosixPath:
     """Pushes content onto the Android device.
 
     Args:
-      content: the full path to the source file.
-      relative_dir: the directory to push to; relative to ANDROID_TMPDIR.
+      source: the path to the source file.
+      dest: the full dest path on the device.
 
     Returns:
       The full path to the content on the Android device.
     """
-    filename = content.name
-    android_path = ANDROID_TMPDIR / relative_dir / filename
     # When the output is a TTY, keep the default progress info output.
     # In other cases, redirect progress info to null to avoid bloating log files.
     stdout_redirect = None if sys.stdout.isatty() else subprocess.DEVNULL
     execute_cmd(
-        ["adb", "push", content.resolve(), android_path],
+        ["adb", "push", source.resolve(), dest],
         verbose=verbose,
         stdout=stdout_redirect,
     )
-    return android_path
+    return dest
 
 
 def adb_execute_and_get_output(
@@ -183,17 +186,90 @@
     return subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True)
 
 
-def get_vmfb_full_path_for_benchmark_case(
-    benchmark_case_dir: pathlib.Path,
-) -> pathlib.Path:
-    flagfile = benchmark_case_dir / MODEL_FLAGFILE_NAME
-    for line in flagfile.read_text().splitlines():
-        flag_name, flag_value = line.strip().split("=")
-        if flag_name == "--module":
-            # Realpath canonicalization matters. The caller may rely on that to track
-            # which files it already pushed.
-            return (benchmark_case_dir / flag_value).resolve()
-    raise ValueError(f"{flagfile} does not contain a --module flag")
+def adb_path_exists(android_path: pathlib.PurePosixPath, verbose: bool = False):
+    """Run stat to check if the path exists."""
+    proc = adb_start_cmd(["stat", str(android_path)], verbose=verbose)
+    return proc.wait() == 0
+
+
+def adb_fetch_and_push_file(
+    source: benchmark_definition.ResourceLocation,
+    dest: pathlib.PurePosixPath,
+    verbose: bool = False,
+) -> pathlib.PurePosixPath:
+    """Fetch file from the path/URL and stream to the device.
+
+    In the case of fetching, this method avoids the temporary file on the host
+    and reduces the overhead when the file is large.
+
+    Args:
+      source: path/URL to fetch the file.
+      dest: the full dest path on the device.
+      verbose: output verbose message.
+
+    Returns:
+      File path on the device.
+    """
+
+    if adb_path_exists(dest, verbose):
+        return dest
+
+    # If the source is a local file, push directly.
+    local_path = source.get_local_path()
+    if local_path:
+        return adb_push_file(local_path, dest, verbose=verbose)
+
+    if verbose:
+        print(f"Streaming file {source} to {dest}.")
+
+    url = source.get_url()
+    assert url is not None
+    req = requests.get(url, stream=True, timeout=60)
+    if not req.ok:
+        raise RuntimeError(f"Failed to fetch {source}: {req.status_code} - {req.text}")
+
+    # Implement the ADB sync protocol to stream file chunk to the device, since
+    # the adb client tool doesn't support it.
+    #
+    # Alternatively we could use a third-party library such as
+    # https://github.com/JeffLIrion/adb_shell. But the protocol we need is
+    # simple and fairly stable. This part can be replaced with other solutions
+    # if needed.
+    #
+    # To understand the details of the protocol, see
+    # https://cs.android.com/android/_/android/platform/packages/modules/adb/+/93c8e3c26e4de3a2b767a2394200bc0721bb1e24:OVERVIEW.TXT
+
+    def wait_ack_ok(sock: socket.socket):
+        buf = bytearray()
+        while len(buf) < 4:
+            data = sock.recv(4 - len(buf))
+            if not data:
+                break
+            buf += data
+
+        if buf.decode("utf-8") != "OKAY":
+            raise RuntimeError(f"ADB communication error: {buf}")
+
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.connect(ADB_SERVER_ADDR)
+        # Connect to any device (the first 4 hex digits encode the length of
+        # the following text command).
+        sock.sendall(b"0012host:transport-any")
+        wait_ack_ok(sock)
+        # Switch to sync mode.
+        sock.sendall(b"0005sync:")
+        wait_ack_ok(sock)
+        # Send the dest file path and file permissions 0644 (rw-r--r--).
+        file_attr = f"{dest},{0o644}".encode("utf-8")
+        sock.sendall(b"SEND" + struct.pack("<I", len(file_attr)) + file_attr)
+        # Stream the file chunks. 64KB is the max chunk size for adb.
+        for data in req.iter_content(chunk_size=64 * 1024):
+            sock.sendall(b"DATA" + struct.pack("<I", len(data)) + data)
+        # End the file stream and set the file's modification time.
+        sock.sendall(b"DONE" + struct.pack("<I", int(time.time())))
+        wait_ack_ok(sock)
+
+    return dest
 
 
 class AndroidBenchmarkDriver(BenchmarkDriver):
@@ -209,18 +285,22 @@
         benchmark_results_filename: Optional[pathlib.Path],
         capture_filename: Optional[pathlib.Path],
     ) -> None:
-        benchmark_case_dir = benchmark_case.benchmark_case_dir
-        android_case_dir = pathlib.PurePosixPath(
-            benchmark_case_dir.relative_to(self.config.root_benchmark_dir)
+        module_rel_dir = iree_artifacts.get_module_dir_path(
+            benchmark_case.run_config.module_generation_config
+        )
+        android_case_dir = ANDROID_TMPDIR / module_rel_dir
+
+        module_path = benchmark_case.module_dir / iree_artifacts.MODULE_FILENAME
+        module_device_path = adb_fetch_and_push_file(
+            source=module_path,
+            dest=android_case_dir / iree_artifacts.MODULE_FILENAME,
+            verbose=self.verbose,
         )
 
-        self.__check_and_push_file(
-            benchmark_case_dir / iree_artifacts.MODULE_FILENAME, android_case_dir
-        )
         run_config = benchmark_case.run_config
         taskset = self.__deduce_taskset_from_run_config(run_config)
         run_args = run_config.materialize_run_flags()
-        run_args.append(f"--module={iree_artifacts.MODULE_FILENAME}")
+        run_args.append(f"--module={module_device_path}")
 
         if benchmark_results_filename is not None:
             self.__run_benchmark(
@@ -352,8 +432,10 @@
         if android_path is not None:
             return android_path
 
-        android_path = adb_push_to_tmp_dir(
-            host_path, relative_dir=relative_dir, verbose=self.verbose
+        android_path = adb_push_file(
+            host_path,
+            ANDROID_TMPDIR / relative_dir / host_path.name,
+            verbose=self.verbose,
         )
         self.already_pushed_files[host_path] = android_path
         return android_path
@@ -367,7 +449,7 @@
         / "benchmarks"
         / "set_android_scaling_governor.sh"
     )
-    android_path = adb_push_to_tmp_dir(cpu_script)
+    android_path = adb_push_file(cpu_script, ANDROID_TMPDIR / cpu_script.name)
     adb_execute_as_root([android_path, governor])
 
 
@@ -384,7 +466,7 @@
         raise RuntimeError(
             f"Unsupported device '{device_model}' for setting GPU scaling policy"
         )
-    android_path = adb_push_to_tmp_dir(gpu_script)
+    android_path = adb_push_file(gpu_script, ANDROID_TMPDIR / gpu_script.name)
     adb_execute_as_root([android_path, policy])
 
 
diff --git a/build_tools/benchmarks/run_benchmarks_on_linux.py b/build_tools/benchmarks/run_benchmarks_on_linux.py
index d61d9f0..be252a3 100755
--- a/build_tools/benchmarks/run_benchmarks_on_linux.py
+++ b/build_tools/benchmarks/run_benchmarks_on_linux.py
@@ -52,17 +52,33 @@
         benchmark_results_filename: Optional[pathlib.Path],
         capture_filename: Optional[pathlib.Path],
     ) -> None:
-        case_dir = benchmark_case.benchmark_case_dir
+        module_dir = benchmark_case.module_dir
+        local_module_dir = module_dir.get_local_path()
+        if local_module_dir:
+            case_tmp_dir = local_module_dir
+            module_path = local_module_dir / iree_artifacts.MODULE_FILENAME
+        else:
+            module_rel_dir = iree_artifacts.get_module_dir_path(
+                benchmark_case.run_config.module_generation_config
+            )
+            case_tmp_dir = self.config.tmp_dir / module_rel_dir
+            case_tmp_dir.mkdir(parents=True, exist_ok=True)
+            module_url = (module_dir / iree_artifacts.MODULE_FILENAME).get_url()
+            assert module_url is not None
+            module_path = self.__fetch_file(
+                uri=module_url, dest=case_tmp_dir / iree_artifacts.MODULE_FILENAME
+            )
+
         inputs_dir = None
         expected_output_dir = None
         if benchmark_case.input_uri:
             inputs_dir = self.__fetch_and_unpack_npy(
-                uri=benchmark_case.input_uri, dest_dir=case_dir / "inputs_npy"
+                uri=benchmark_case.input_uri, dest_dir=case_tmp_dir / "inputs_npy"
             )
         if benchmark_case.expected_output_uri:
             expected_output_dir = self.__fetch_and_unpack_npy(
                 uri=benchmark_case.expected_output_uri,
-                dest_dir=case_dir / "expected_outputs_npy",
+                dest_dir=case_tmp_dir / "expected_outputs_npy",
             )
 
         if benchmark_results_filename:
@@ -75,6 +91,7 @@
                 self.__run_verify(
                     tool_dir=self.config.normal_benchmark_tool_dir,
                     benchmark_case=benchmark_case,
+                    module_path=module_path,
                     inputs_dir=inputs_dir,
                     expected_outputs_dir=expected_output_dir,
                 )
@@ -82,18 +99,22 @@
             self.__run_benchmark(
                 tool_dir=self.config.normal_benchmark_tool_dir,
                 benchmark_case=benchmark_case,
+                module_path=module_path,
                 results_filename=benchmark_results_filename,
             )
 
         if capture_filename:
             self.__run_capture(
-                benchmark_case=benchmark_case, capture_filename=capture_filename
+                benchmark_case=benchmark_case,
+                module_path=module_path,
+                capture_filename=capture_filename,
             )
 
     def __build_tool_cmds(
         self,
         benchmark_case: BenchmarkCase,
         tool_path: pathlib.Path,
+        module_path: pathlib.Path,
         inputs_dir: Optional[pathlib.Path] = None,
     ) -> List[Any]:
         run_config = benchmark_case.run_config
@@ -102,8 +123,7 @@
         )
         cmds.append(tool_path)
 
-        module_dir_path = benchmark_case.benchmark_case_dir
-        cmds += [f"--module={module_dir_path / iree_artifacts.MODULE_FILENAME}"]
+        cmds += [f"--module={module_path}"]
         cmds += run_config.materialize_run_flags(
             gpu_id=self.gpu_id,
             inputs_dir=inputs_dir,
@@ -126,8 +146,10 @@
         if dest.exists():
             return dest
         req = requests.get(uri, stream=True, timeout=60)
+        if not req.ok:
+            raise RuntimeError(f"Failed to fetch {uri}: {req.status_code} - {req.text}")
         with dest.open("wb") as dest_file:
-            for data in req.iter_content():
+            for data in req.iter_content(chunk_size=64 * 1024 * 1024):
                 dest_file.write(data)
         return dest
 
@@ -143,12 +165,14 @@
         self,
         tool_dir: pathlib.Path,
         benchmark_case: BenchmarkCase,
+        module_path: pathlib.Path,
         inputs_dir: pathlib.Path,
         expected_outputs_dir: pathlib.Path,
     ):
         cmd = self.__build_tool_cmds(
             benchmark_case=benchmark_case,
             tool_path=tool_dir / "iree-run-module",
+            module_path=module_path,
             inputs_dir=inputs_dir,
         )
         # Currently only support single output.
@@ -160,11 +184,15 @@
         self,
         tool_dir: pathlib.Path,
         benchmark_case: BenchmarkCase,
+        module_path: pathlib.Path,
         results_filename: pathlib.Path,
     ):
         tool_name = benchmark_case.benchmark_tool_name
-        tool_path = tool_dir / tool_name
-        cmd = self.__build_tool_cmds(benchmark_case=benchmark_case, tool_path=tool_path)
+        cmd = self.__build_tool_cmds(
+            benchmark_case=benchmark_case,
+            tool_path=tool_dir / tool_name,
+            module_path=module_path,
+        )
 
         if tool_name == "iree-benchmark-module":
             cmd.extend(
@@ -186,18 +214,21 @@
         results_filename.write_text(json.dumps(benchmark_metrics.to_json_object()))
 
     def __run_capture(
-        self, benchmark_case: BenchmarkCase, capture_filename: pathlib.Path
+        self,
+        benchmark_case: BenchmarkCase,
+        module_path: pathlib.Path,
+        capture_filename: pathlib.Path,
     ):
         capture_config = self.config.trace_capture_config
         if capture_config is None:
             raise ValueError("capture_config can't be None.")
 
         tool_name = benchmark_case.benchmark_tool_name
-        tool_path = (
-            capture_config.traced_benchmark_tool_dir
-            / benchmark_case.benchmark_tool_name
+        cmd = self.__build_tool_cmds(
+            benchmark_case=benchmark_case,
+            tool_path=capture_config.traced_benchmark_tool_dir / tool_name,
+            module_path=module_path,
         )
-        cmd = self.__build_tool_cmds(benchmark_case=benchmark_case, tool_path=tool_path)
 
         if tool_name == "iree-benchmark-module":
             cmd.extend(