Adding iree-benchmark-executable tool. (#16550)

This allows for executables extracted from vmfb ZIP files, dumped by
`--iree-hal-dump-executable-binaries-to=`, or compiled using
`iree-compile --compile-mode=hal-executable` to be executed standalone
without any compiled host code. Because it has no host code everything
that the compiler was producing to dispatch must be provided as flags.
Other features the runtime normal provides like fat binary/format
selection are also not available and users must provide the correct
executable format and file for their target.

This is not intended to be used by normal humans - the easy path to
benchmarking is to generate the benchmark executables via
`--iree-hal-dump-executable-benchmarks-to=` or to author a benchmark in
the frontend.

`--help` shows some information and the included test shows VMVX. The
`--binding=` flag used to provide input/output bindings matches the
format of `--input=` in other tooling and can be used to provide zeroed
sized buffers or parse or read contents from files.

Simple example using the checked-in x86-64 test ELF:
```
iree-benchmark-executable \
  --device=local-sync \
  --executable_format=embedded-elf-x86_64 \
  --executable_file=runtime/src/iree/hal/local/elf/testdata/elementwise_mul_x86_64.so \
  --entry_point=0 \
  --binding=4xf32=1,2,3,4 \
  --binding=4xf32=100,200,300,400 \
  --binding=4xf32 \
  --workgroup_count=1,1,1
```
(one benchmark will be run for each workgroup_count specified)

This currently makes some assumptions that will not hold in the future,
such as all bindings being in set 0 and densely packed. Future changes
will probably make the binding flag specify set/binding ordinals.
Currently command buffers are recorded within the dispatch inner loop to
enable legacy backends (really just ROCM) to work - once ROCM is dead we
can move the command buffer out of the loop and reuse the command buffer
such that the only thing we do while timing is submit-and-wait. We can
still have a mode for ALLOW_INLINE_EXECUTION for backends that can use
it but the compiler will soon start generating secondary command buffers
without that set so it may be removed from here.
diff --git a/.github/workflows/validate_and_publish_release.yml b/.github/workflows/validate_and_publish_release.yml
index 2f9b8a8..3e060fe 100644
--- a/.github/workflows/validate_and_publish_release.yml
+++ b/.github/workflows/validate_and_publish_release.yml
@@ -59,6 +59,9 @@
           TRACY_NO_INVARIANT_CHECK=1 IREE_PY_RUNTIME=tracy \
             python -m iree.runtime._package_test
       # Binaries from the tarball
+      - name: Run iree-benchmark-executable
+        id: run_iree_benchmark_executable
+        run: ./bin/iree-benchmark-executable --help
       - name: Run iree-benchmark-module
         id: run_iree_benchmark_module
         run: ./bin/iree-benchmark-module --help
@@ -90,6 +93,9 @@
       - name: Py iree-run-module
         id: py_iree-run-module
         run: iree-run-module --help
+      - name: Py iree-benchmark-executable
+        id: py_iree_benchmark_executable
+        run: iree-benchmark-executable --help
       - name: Py iree-benchmark-module
         id: py_iree_benchmark_module
         run: iree-benchmark-module --help
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
index 88c17d1..9d20cf3 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -1939,7 +1939,6 @@
     custom<SymbolVisibility>($sym_visibility)
     $sym_name
     attr-dict-with-keyword
-    ``
     $body
   }];
 
diff --git a/runtime/bindings/python/CMakeLists.txt b/runtime/bindings/python/CMakeLists.txt
index 83fb55b..ddf8200 100644
--- a/runtime/bindings/python/CMakeLists.txt
+++ b/runtime/bindings/python/CMakeLists.txt
@@ -141,6 +141,7 @@
     "iree/runtime/version.py"
     "iree/_runtime/__init__.py"
     "iree/_runtime/libs.py"
+    "iree/_runtime/scripts/iree_benchmark_executable/__main__.py"
     "iree/_runtime/scripts/iree_benchmark_module/__main__.py"
     "iree/_runtime/scripts/iree_cpuinfo/__main__.py"
     "iree/_runtime/scripts/iree_convert_parameters/__main__.py"
@@ -155,6 +156,12 @@
 
 iree_symlink_tool(
   TARGET runtime
+  FROM_TOOL_TARGET iree-benchmark-executable
+  TO_EXE_NAME iree/_runtime_libs/iree-benchmark-executable
+)
+
+iree_symlink_tool(
+  TARGET runtime
   FROM_TOOL_TARGET iree-benchmark-module
   TO_EXE_NAME iree/_runtime_libs/iree-benchmark-module
 )
@@ -317,6 +324,7 @@
   CALL install
   TARGETS
     iree-cpuinfo
+    iree-benchmark-executable
     iree-benchmark-module
     iree-convert-parameters
     iree-create-parameters
diff --git a/runtime/bindings/python/iree/_runtime/scripts/iree_benchmark_executable/__main__.py b/runtime/bindings/python/iree/_runtime/scripts/iree_benchmark_executable/__main__.py
new file mode 100644
index 0000000..f472e44
--- /dev/null
+++ b/runtime/bindings/python/iree/_runtime/scripts/iree_benchmark_executable/__main__.py
@@ -0,0 +1,21 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+import subprocess
+import sys
+from ... import libs
+
+
+def main(args=None):
+    if args is None:
+        args = sys.argv[1:]
+    exe = os.path.join(libs.library_path, "iree-benchmark-executable")
+    return subprocess.call(args=[exe] + args)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/runtime/setup.py b/runtime/setup.py
index 3f6aec8..cf77f43 100644
--- a/runtime/setup.py
+++ b/runtime/setup.py
@@ -565,6 +565,7 @@
             "iree._runtime_libs": [
                 f"*{sysconfig.get_config_var('EXT_SUFFIX')}",
                 "iree-run-module*",
+                "iree-benchmark-executable*",
                 "iree-benchmark-module*",
                 # These utilities are invariant wrt tracing and are only built for the default runtime.
                 "iree-create-parameters*",
@@ -578,6 +579,7 @@
             "iree._runtime_libs_tracy": [
                 f"*{sysconfig.get_config_var('EXT_SUFFIX')}",
                 "iree-run-module*",
+                "iree-benchmark-executable*",
                 "iree-benchmark-module*",
             ]
             + (["iree-tracy-capture"] if ENABLE_TRACY_TOOLS else [])
@@ -588,6 +590,7 @@
     entry_points={
         "console_scripts": [
             "iree-run-module = iree._runtime.scripts.iree_run_module.__main__:main",
+            "iree-benchmark-executable = iree._runtime.scripts.iree_benchmark_executable.__main__:main",
             "iree-benchmark-module = iree._runtime.scripts.iree_benchmark_module.__main__:main",
             "iree-create-parameters = iree._runtime.scripts.iree_create_parameters.__main__:main",
             "iree-convert-parameters = iree._runtime.scripts.iree_convert_parameters.__main__:main",
diff --git a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul.mlir b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul.mlir
index b4b4a9a..1ae713c 100644
--- a/runtime/src/iree/hal/local/elf/testdata/elementwise_mul.mlir
+++ b/runtime/src/iree/hal/local/elf/testdata/elementwise_mul.mlir
@@ -43,9 +43,9 @@
   // exported.
   builtin.module {
     func.func @elementwise_mul() {
-      %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:4xf32>
-      %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:4xf32>
-      %dst = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:4xf32>
+      %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<4xf32>>
+      %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<4xf32>>
+      %dst = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<4xf32>>
       %workgroup_size_x = hal.interface.workgroup.size[0] : index
       %workgroup_id_x = hal.interface.workgroup.id[0] : index
       %workgroup_count_x = hal.interface.workgroup.count[0] : index
@@ -54,8 +54,8 @@
       %end_i = arith.constant 4 : index
       scf.for %i = %base_i to %end_i step %step_i {
         %remaining = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%i)[%workgroup_size_x]
-        %lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
-        %rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:4xf32> -> tensor<?xf32>
+        %lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<?xf32>
+        %rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<?xf32>
         %dst_init = tensor.empty(%remaining) : tensor<?xf32>
         %dst_tile = linalg.generic {
           indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
@@ -66,7 +66,7 @@
             %dst_value = arith.mulf %lhs_value, %rhs_value : f32
             linalg.yield %dst_value : f32
           } -> tensor<?xf32>
-        flow.dispatch.tensor.store %dst_tile, %dst, offsets = [%i], sizes = [%remaining], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:4xf32>
+        flow.dispatch.tensor.store %dst_tile, %dst, offsets = [%i], sizes = [%remaining], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
       }
       return
     }
diff --git a/runtime/src/iree/testing/benchmark.h b/runtime/src/iree/testing/benchmark.h
index cc258d5..55e8ee5 100644
--- a/runtime/src/iree/testing/benchmark.h
+++ b/runtime/src/iree/testing/benchmark.h
@@ -34,7 +34,7 @@
   iree_allocator_t host_allocator;
 } iree_benchmark_state_t;
 
-// Returns a range argument with the given ordial.
+// Returns a range argument with the given ordinal.
 int64_t iree_benchmark_get_range(iree_benchmark_state_t* state,
                                  iree_host_size_t ordinal);
 
diff --git a/tools/BUILD.bazel b/tools/BUILD.bazel
index 18d620f..bca7a0f 100644
--- a/tools/BUILD.bazel
+++ b/tools/BUILD.bazel
@@ -25,6 +25,22 @@
 exports_files(["lit.cfg.py"])
 
 iree_runtime_cc_binary(
+    name = "iree-benchmark-executable",
+    srcs = ["iree-benchmark-executable-main.c"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base/internal:file_io",
+        "//runtime/src/iree/base/internal:flags",
+        "//runtime/src/iree/hal",
+        "//runtime/src/iree/modules/hal:types",
+        "//runtime/src/iree/testing:benchmark",
+        "//runtime/src/iree/tooling:device_util",
+        "//runtime/src/iree/tooling:function_io",
+        "//runtime/src/iree/vm",
+    ],
+)
+
+iree_runtime_cc_binary(
     name = "iree-benchmark-module",
     srcs = ["iree-benchmark-module-main.cc"],
     deps = [
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 08e3e51..7716f3e 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -65,6 +65,23 @@
 
 iree_cc_binary(
   NAME
+    iree-benchmark-executable
+  SRCS
+    "iree-benchmark-executable-main.c"
+  DEPS
+    iree::base
+    iree::base::internal::flags
+    iree::hal
+    iree::modules::hal::types
+    iree::testing::benchmark
+    iree::tooling::device_util
+    iree::tooling::function_io
+    iree::vm
+  INSTALL_COMPONENT IREETools-Runtime
+)
+
+iree_cc_binary(
+  NAME
     iree-benchmark-module
   SRCS
     "iree-benchmark-module-main.cc"
diff --git a/tools/iree-benchmark-executable-main.c b/tools/iree-benchmark-executable-main.c
new file mode 100644
index 0000000..f3bdb46
--- /dev/null
+++ b/tools/iree-benchmark-executable-main.c
@@ -0,0 +1,577 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/file_io.h"
+#include "iree/base/internal/flags.h"
+#include "iree/hal/api.h"
+#include "iree/modules/hal/types.h"
+#include "iree/testing/benchmark.h"
+#include "iree/tooling/device_util.h"
+#include "iree/tooling/function_io.h"
+#include "iree/vm/api.h"
+
+IREE_FLAG(
+    int32_t, batch_size, 64,
+    "Number of dispatches to perform per command buffer submission.\n"
+    "Higher numbers will reduce the effect of submission overheads on the\n"
+    "final timings but too high a value may result in hangs.");
+
+IREE_FLAG(string, executable_format, "",
+          "Format of the executable file being loaded.");
+IREE_FLAG(string, executable_file, "", "Path to the executable file to load.");
+
+IREE_FLAG(int32_t, entry_point, 0, "Entry point ordinal to run.");
+
+IREE_FLAG_LIST(
+    string, workgroup_count,
+    "`x,y,z` dimensions of the workgroup count defining the number of\n"
+    "workgroup invocations that will be run per benchmark iteration.\n"
+    "Each occurrence of the flag will run a benchmark with that set of\n"
+    "workgroup count values.");
+
+// Total number of executable-level constants we (currently) allow; this is only
+// a limitation of how much memory we allocate and we could make this
+// dynamically growable.
+#define IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT 512
+// Total number of push constants we (currently) allow any executable to have.
+#define IREE_HAL_MAX_PUSH_CONSTANT_COUNT 64
+// Maximum number of descriptor sets in an pipeline layout.
+#define IREE_HAL_MAX_DESCRIPTOR_SET_COUNT 2
+// Total number of bindings we (currently) allow any executable to have.
+#define IREE_HAL_MAX_TOTAL_BINDING_COUNT \
+  (IREE_HAL_MAX_DESCRIPTOR_SET_COUNT * 32)
+
+// Parsed dispatch parameters from flags.
+// Used to construct the dispatch parameters for the benchmark invocation.
+struct {
+  int32_t set_count;
+  struct {
+    // For now we only track the binding counts and assume they are all storage
+    // buffers. When we support more types we'll need an encoding.
+    int32_t binding_count;
+  } sets[IREE_HAL_MAX_DESCRIPTOR_SET_COUNT];
+
+  int32_t executable_constant_count;
+  union {
+    uint32_t ui32;
+  } executable_constants[IREE_HAL_MAX_EXECUTABLE_CONSTANT_COUNT];
+
+  int32_t push_constant_count;
+  union {
+    uint32_t ui32;
+  } push_constants[IREE_HAL_MAX_PUSH_CONSTANT_COUNT];
+
+  int32_t binding_count;
+  iree_string_view_t binding_specs[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
+  char binding_cconv[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
+  iree_hal_descriptor_set_layout_binding_t
+      binding_layouts[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
+} parsed_params = {
+    .executable_constant_count = 0,
+    .push_constant_count = 0,
+    .binding_count = 0,
+};
+
+static iree_status_t parse_executable_constant(iree_string_view_t flag_name,
+                                               void* storage,
+                                               iree_string_view_t value) {
+  IREE_ASSERT_LE(parsed_params.executable_constant_count + 1,
+                 IREE_ARRAYSIZE(parsed_params.executable_constants),
+                 "too many executable constants");
+  uint32_t value_ui32 = 0;
+  if (!iree_string_view_atoi_uint32(value, &value_ui32)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "invalid executable constant value `%.*s`; expects uint32_t",
+        (int)value.size, value.data);
+  }
+  parsed_params.executable_constants[parsed_params.executable_constant_count++]
+      .ui32 = value_ui32;
+  return iree_ok_status();
+}
+static void print_executable_constant(iree_string_view_t flag_name,
+                                      void* storage, FILE* file) {
+  if (parsed_params.executable_constant_count == 0) {
+    fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
+            flag_name.data);
+    return;
+  }
+  for (int32_t i = 0; i < parsed_params.executable_constant_count; ++i) {
+    fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
+            parsed_params.executable_constants[i].ui32);
+    if (i < parsed_params.executable_constant_count - 1) {
+      fprintf(file, "\n");
+    }
+  }
+}
+IREE_FLAG_CALLBACK(parse_executable_constant, print_executable_constant,
+                   &parsed_params, executable_constant,
+                   "Appends a uint32_t executable constant value.\n");
+
+static iree_status_t parse_push_constant(iree_string_view_t flag_name,
+                                         void* storage,
+                                         iree_string_view_t value) {
+  IREE_ASSERT_LE(parsed_params.push_constant_count + 1,
+                 IREE_ARRAYSIZE(parsed_params.push_constants),
+                 "too many push constants");
+  uint32_t value_ui32 = 0;
+  if (!iree_string_view_atoi_uint32(value, &value_ui32)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "invalid push constant value `%.*s`; expects uint32_t", (int)value.size,
+        value.data);
+  }
+  parsed_params.push_constants[parsed_params.push_constant_count++].ui32 =
+      value_ui32;
+  return iree_ok_status();
+}
+static void print_push_constant(iree_string_view_t flag_name, void* storage,
+                                FILE* file) {
+  if (parsed_params.push_constant_count == 0) {
+    fprintf(file, "# --%.*s=[integer value]\n", (int)flag_name.size,
+            flag_name.data);
+    return;
+  }
+  for (int32_t i = 0; i < parsed_params.push_constant_count; ++i) {
+    fprintf(file, "--%.*s=%u", (int)flag_name.size, flag_name.data,
+            parsed_params.push_constants[i].ui32);
+    if (i < parsed_params.push_constant_count - 1) {
+      fprintf(file, "\n");
+    }
+  }
+}
+IREE_FLAG_CALLBACK(parse_push_constant, print_push_constant, &parsed_params,
+                   push_constant, "Appends a uint32_t push constant value.\n");
+
+static iree_status_t parse_binding(iree_string_view_t flag_name, void* storage,
+                                   iree_string_view_t value) {
+  IREE_ASSERT_LE(parsed_params.binding_count + 1,
+                 IREE_ARRAYSIZE(parsed_params.binding_specs),
+                 "too many bindings");
+  int32_t i = parsed_params.binding_count++;
+  parsed_params.binding_specs[i] = value;
+  parsed_params.binding_cconv[i] = 'r';
+  // TODO(benvanik): allow for a specification of type/immutability.
+  parsed_params.binding_layouts[i] = (iree_hal_descriptor_set_layout_binding_t){
+      .binding = (uint32_t)i,
+      .type = IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+      .flags = IREE_HAL_DESCRIPTOR_FLAG_NONE,
+  };
+  return iree_ok_status();
+}
+static void print_binding(iree_string_view_t flag_name, void* storage,
+                          FILE* file) {
+  if (parsed_params.binding_count == 0) {
+    fprintf(file, "# --%.*s=\"shapextype[=values]\"\n", (int)flag_name.size,
+            flag_name.data);
+    return;
+  }
+  for (int32_t i = 0; i < parsed_params.binding_count; ++i) {
+    const iree_string_view_t binding_spec = parsed_params.binding_specs[i];
+    fprintf(file, "--%.*s=\"%.*s\"\n", (int)flag_name.size, flag_name.data,
+            (int)binding_spec.size, binding_spec.data);
+  }
+}
+IREE_FLAG_CALLBACK(
+    parse_binding, print_binding, &parsed_params, binding,
+    "Appends a binding to the dispatch parameters.\n"
+    "Bindings are defined by their shape, element type, and their data.\n"
+    "There must be one binding for every declared layout binding.\n"
+    "Examples:\n"
+    "  # 16 4-byte elements zero-initialized:\n"
+    "  --binding=2x8xi32\n"
+    "  # 10000 bytes all initialized to 123:\n"
+    "  --binding=10000xi8=123\n"
+    "  # 2 4-byte floating-point values with contents [[1.4], [2.1]]:\n"
+    "  --binding=2x1xf32=1.4,2.1\n"
+    "  # First array from a numpy file followed by the second:\n"
+    "  --binding=@file.npy\n"
+    "  --binding=+file.npy\n"
+    "  # All arrays from a numpy file\n"
+    "  --binding=*file.npy\n"
+    "  # Binary tensor<2x2xf32> and tensor<4xf32> read from a single file\n"
+    "  --binding=2x2xf32=@file.ext\n"
+    "  --binding=4xf32=+file.ext");
+
+typedef struct iree_benchmark_executable_args_t {
+  iree_hal_device_t* device;
+  iree_hal_executable_t* executable;
+  iree_hal_pipeline_layout_t* pipeline_layout;
+  const iree_hal_descriptor_set_binding_t* bindings;
+  uint32_t workgroup_count[3];
+} iree_benchmark_executable_args_t;
+
+// NOTE: error handling is here just for better diagnostics: it is not tracking
+// allocations correctly and will leak. Don't use this as an example for how to
+// write robust code.
+static iree_status_t iree_benchmark_executable_run(
+    const iree_benchmark_def_t* benchmark_def,
+    iree_benchmark_state_t* benchmark_state) {
+  iree_benchmark_executable_args_t* args =
+      (iree_benchmark_executable_args_t*)benchmark_def->user_data;
+
+  iree_hal_semaphore_t* fence_semaphore = NULL;
+  uint64_t fence_value = 0ull;
+  IREE_RETURN_IF_ERROR(
+      iree_hal_semaphore_create(args->device, fence_value, &fence_semaphore));
+  iree_hal_semaphore_list_t wait_semaphore_list =
+      iree_hal_semaphore_list_empty();
+  iree_hal_semaphore_list_t signal_semaphore_list = {
+      .count = 1,
+      .semaphores = &fence_semaphore,
+      .payload_values = &fence_value,
+  };
+
+  // Start profiling now - all subsequent device operations will be what the
+  // user wants to measure.
+  IREE_RETURN_IF_ERROR(iree_hal_begin_profiling_from_flags(args->device));
+
+  // Submit the command buffer and wait for it to complete.
+  // Note that each iteration runs through the whole grid as it's important that
+  // we are testing the memory access patterns: if we just ran the same single
+  // workgroup processing the same exact region of memory over and over we are
+  // not testing cache effects. This means we need to account for the total
+  // number of workgroups executed.
+  int64_t dispatch_count = 0;
+  while (iree_benchmark_keep_running(benchmark_state, FLAG_batch_size)) {
+    // TODO(benvanik): record a secondary command buffer and just replay it
+    // here. This should fix the overhead at just primary command buffer
+    // creation. Most backends don't support reusable command buffers, yet, and
+    // some only support inline execution so we are conservatively doing that.
+    // In the future we should have an option (possibly based on device query)
+    // as to which path to use.
+
+    // Record a command buffer with the dispatches.
+    // Note that today we are doing this inside of the benchmark loop so that
+    // we can use inline execution. This is a boost to devices that support it
+    // like CUDA streams and synchronous CPU executors but a pessimization to
+    // devices that benefit from reusable command buffers like CUDA graphs.
+    // In the future we can add a flag that switches the mode between
+    // reusable and one-shot.
+    iree_hal_command_buffer_t* command_buffer = NULL;
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_create(
+        args->device,
+        IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT |
+            IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION,
+        IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+        /*binding_capacity=*/0, &command_buffer));
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_begin(command_buffer));
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_constants(
+        command_buffer, args->pipeline_layout, /*offset=*/0,
+        &parsed_params.push_constants[0].ui32,
+        parsed_params.push_constant_count *
+            sizeof(parsed_params.push_constants[0])));
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_push_descriptor_set(
+        command_buffer, args->pipeline_layout, /*set=*/0,
+        parsed_params.binding_count, args->bindings));
+    for (int32_t i = 0; i < FLAG_batch_size; ++i) {
+      IREE_RETURN_IF_ERROR(iree_hal_command_buffer_dispatch(
+          command_buffer, args->executable, FLAG_entry_point,
+          args->workgroup_count[0], args->workgroup_count[1],
+          args->workgroup_count[2]));
+      IREE_RETURN_IF_ERROR(iree_hal_command_buffer_execution_barrier(
+          command_buffer, IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
+          IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE,
+          IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, 0, NULL, 0, NULL));
+    }
+    IREE_RETURN_IF_ERROR(iree_hal_command_buffer_end(command_buffer));
+
+    // Submit the command buffer; if the device could not start executing while
+    // we were recording then this will kick off the execution.
+    ++fence_value;
+    IREE_RETURN_IF_ERROR(iree_hal_device_queue_execute(
+        args->device, IREE_HAL_QUEUE_AFFINITY_ANY, wait_semaphore_list,
+        signal_semaphore_list, 1, &command_buffer));
+
+    // Block and wait for the submission to complete.
+    // Note that this will include round-trip overhead and if the dispatch or
+    // batch size is small then the final time may end up being mostly overhead.
+    IREE_RETURN_IF_ERROR(iree_hal_semaphore_wait(fence_semaphore, fence_value,
+                                                 iree_infinite_timeout()));
+
+    iree_benchmark_pause_timing(benchmark_state);
+
+    // Don't count cleanup time in the benchmark.
+    iree_hal_command_buffer_release(command_buffer);
+
+    // Accumulate the total number of dispatches executed.
+    dispatch_count += FLAG_batch_size;
+
+    // Flush profiling if recording. Note that we don't want to include the
+    // profiling time in the benchmark result.
+    IREE_RETURN_IF_ERROR(iree_hal_device_profiling_flush(args->device));
+
+    iree_benchmark_resume_timing(benchmark_state);
+  }
+
+  // End profiling before cleaning up so tooling doesn't capture it.
+  IREE_RETURN_IF_ERROR(iree_hal_end_profiling_from_flags(args->device));
+
+  // To get a total time per invocation we set the item count to the total
+  // invocations dispatched. That gives us both total dispatch and single
+  // invocation times in the reporter output.
+  int64_t total_invocations = dispatch_count * args->workgroup_count[0] *
+                              args->workgroup_count[1] *
+                              args->workgroup_count[2];
+  iree_benchmark_set_items_processed(benchmark_state, total_invocations);
+
+  iree_hal_semaphore_release(fence_semaphore);
+
+  return iree_ok_status();
+}
+
+// Parses an `x,y,z` workgroup count.
+static iree_status_t iree_parse_workgroup_count(
+    iree_string_view_t workgroup_count_str, uint32_t* out_workgroup_count) {
+  iree_string_view_t str = workgroup_count_str;
+  iree_string_view_t str_x;
+  iree_string_view_split(str, ',', &str_x, &str);
+  iree_string_view_t str_y;
+  iree_string_view_split(str, ',', &str_y, &str);
+  iree_string_view_t str_z = str;
+  if (!iree_string_view_atoi_uint32(str_x, &out_workgroup_count[0]) ||
+      !iree_string_view_atoi_uint32(str_y, &out_workgroup_count[1]) ||
+      !iree_string_view_atoi_uint32(str_z, &out_workgroup_count[2])) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "invalid workgroup count string `%.*s`; expects `X,Y,Z`",
+        (int)workgroup_count_str.size, workgroup_count_str.data);
+  }
+  return iree_ok_status();
+}
+
+// Runs one benchmark per workgroup count specified using the same device
+// and input/output buffers.
+static iree_status_t iree_benchmark_executable_from_flags(
+    iree_allocator_t host_allocator) {
+  iree_vm_instance_t* instance = NULL;
+  IREE_RETURN_IF_ERROR(iree_vm_instance_create(IREE_VM_TYPE_CAPACITY_DEFAULT,
+                                               host_allocator, &instance));
+  IREE_RETURN_IF_ERROR(iree_hal_module_register_inline_types(instance));
+
+  // Create the HAL device we'll be using during execution.
+  // Devices can be very expensive to create and we want to avoid doing it
+  // multiple times throughout the benchmark execution.
+  iree_hal_device_t* device = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_create_device_from_flags(
+      iree_hal_available_driver_registry(), iree_hal_default_device_uri(),
+      host_allocator, &device));
+
+  // We'll reuse the same executable cache so that once we load the executable
+  // we'll be able to reuse any driver-side optimizations.
+  iree_hal_executable_cache_t* executable_cache = NULL;
+  iree_status_t loop_status = iree_ok_status();
+  IREE_RETURN_IF_ERROR(iree_hal_executable_cache_create(
+      device, iree_make_cstring_view("cache"), iree_loop_inline(&loop_status),
+      &executable_cache));
+  IREE_RETURN_IF_ERROR(loop_status);
+
+  // Allocate storage for buffers and populate them.
+  // They only need to remain valid for the duration of the invocation and all
+  // memory accessed by the invocation will come from here.
+  // Note that we do this parsing first so that we can reflect on the I/O to
+  // infer the pipeline layout.
+  iree_hal_allocator_t* device_allocator = iree_hal_device_allocator(device);
+  iree_vm_list_t* binding_list = NULL;
+  IREE_RETURN_IF_ERROR(iree_tooling_parse_variants(
+      iree_make_string_view(parsed_params.binding_cconv,
+                            parsed_params.binding_count),
+      (iree_string_view_list_t){parsed_params.binding_count,
+                                parsed_params.binding_specs},
+      device, device_allocator, host_allocator, &binding_list));
+  iree_hal_descriptor_set_binding_t bindings[IREE_HAL_MAX_TOTAL_BINDING_COUNT];
+  for (iree_host_size_t i = 0; i < parsed_params.binding_count; ++i) {
+    iree_vm_ref_t value = iree_vm_ref_null();
+    IREE_RETURN_IF_ERROR(iree_vm_list_get_ref_assign(binding_list, i, &value));
+    iree_hal_buffer_t* buffer = NULL;
+    if (iree_hal_buffer_isa(value)) {
+      buffer = iree_hal_buffer_deref(value);
+    } else if (iree_hal_buffer_view_isa(value)) {
+      buffer = iree_hal_buffer_view_buffer(iree_hal_buffer_view_deref(value));
+    } else {
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "bindings must be shaped types (4xf32, etc), binding %" PRIhsz
+          " is not",
+          i);
+    }
+    bindings[i] = (iree_hal_descriptor_set_binding_t){
+        .binding = i,
+        .buffer_slot = 0,
+        .buffer = buffer,
+        .offset = 0,
+        .length = IREE_WHOLE_BUFFER,
+    };
+  }
+
+  // Setup the specification used to perform the executable load.
+  // This information is normally used to select the appropriate loader but in
+  // this benchmark we only have a single one.
+  // TODO(benvanik): expose the flags once they are implemented anywhere.
+  iree_hal_executable_params_t executable_params;
+  iree_hal_executable_params_initialize(&executable_params);
+  executable_params.caching_mode =
+      IREE_HAL_EXECUTABLE_CACHING_MODE_ALLOW_OPTIMIZATION |
+      IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA;
+
+  // Load the executable data into memory.
+  // In normal usage this would be mapped from the containing module file (which
+  // itself may be mapped from disk).
+  iree_file_contents_t* file_contents = NULL;
+  if (strcmp(FLAG_executable_file, "-") == 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_stdin_read_contents(host_allocator, &file_contents));
+  } else {
+    IREE_RETURN_IF_ERROR(iree_file_read_contents(
+        FLAG_executable_file, IREE_FILE_READ_FLAG_DEFAULT, host_allocator,
+        &file_contents));
+  }
+  executable_params.executable_format =
+      iree_make_cstring_view(FLAG_executable_format);
+  executable_params.executable_data = file_contents->const_buffer;
+
+  // Setup the layouts defining how each entry point is interpreted.
+  iree_hal_pipeline_layout_t* pipeline_layout = NULL;
+  iree_hal_descriptor_set_layout_t* descriptor_set_layout = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_descriptor_set_layout_create(
+      device, IREE_HAL_DESCRIPTOR_SET_LAYOUT_FLAG_NONE,
+      parsed_params.binding_count, parsed_params.binding_layouts,
+      &descriptor_set_layout));
+  IREE_RETURN_IF_ERROR(iree_hal_pipeline_layout_create(
+      device, parsed_params.push_constant_count,
+      /*set_layout_count=*/1, &descriptor_set_layout, &pipeline_layout));
+  executable_params.pipeline_layout_count = 1;
+  executable_params.pipeline_layouts = &pipeline_layout;
+
+  // Executable-level constants allow us to perform some basic load-time value
+  // propagation - usually dependent on device features or tuning parameters.
+  executable_params.constant_count = parsed_params.executable_constant_count;
+  executable_params.constants = &parsed_params.executable_constants[0].ui32;
+
+  // Perform the load, which will fail if the executable cannot be loaded or
+  // there was an issue with the layouts.
+  iree_hal_executable_t* executable = NULL;
+  IREE_RETURN_IF_ERROR(iree_hal_executable_cache_prepare_executable(
+      executable_cache, &executable_params, &executable));
+
+  // Register one benchmark per workgroup count specified.
+  iree_benchmark_executable_args_t* args = NULL;
+  IREE_RETURN_IF_ERROR(iree_allocator_malloc(
+      host_allocator, sizeof(*args) * FLAG_workgroup_count_list().count,
+      (void**)&args));
+  for (iree_host_size_t i = 0; i < FLAG_workgroup_count_list().count; ++i) {
+    args[i] = (iree_benchmark_executable_args_t){
+        .device = device,
+        .executable = executable,
+        .pipeline_layout = pipeline_layout,
+        .bindings = bindings,
+        .workgroup_count = {1, 1, 1},
+    };
+    IREE_RETURN_IF_ERROR(iree_parse_workgroup_count(
+        FLAG_workgroup_count_list().values[i], args[i].workgroup_count));
+    iree_benchmark_def_t benchmark_def = {
+        .flags = IREE_BENCHMARK_FLAG_MEASURE_PROCESS_CPU_TIME |
+                 IREE_BENCHMARK_FLAG_USE_REAL_TIME,
+        .time_unit = IREE_BENCHMARK_UNIT_NANOSECOND,
+        .minimum_duration_ns = 0,
+        .iteration_count = 0,
+        .run = iree_benchmark_executable_run,
+        .user_data = &args[i],
+    };
+    char benchmark_name[512];
+    snprintf(benchmark_name, sizeof(benchmark_name) - 1, "dispatch_%ux%ux%u",
+             args[i].workgroup_count[0], args[i].workgroup_count[1],
+             args[i].workgroup_count[2]);
+    iree_benchmark_register(iree_make_cstring_view(benchmark_name),
+                            &benchmark_def);
+  }
+  iree_benchmark_run_specified();
+  iree_allocator_free(host_allocator, args);
+
+  iree_vm_list_release(binding_list);
+  iree_hal_executable_release(executable);
+  iree_hal_descriptor_set_layout_release(descriptor_set_layout);
+  iree_hal_pipeline_layout_release(pipeline_layout);
+  iree_file_contents_free(file_contents);
+  iree_hal_executable_cache_release(executable_cache);
+  iree_hal_device_release(device);
+  iree_vm_instance_release(instance);
+
+  return iree_ok_status();
+}
+
+int main(int argc, char** argv) {
+  IREE_TRACE_APP_ENTER();
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_t host_allocator = iree_allocator_system();
+  int exit_code = EXIT_SUCCESS;
+
+  iree_flags_set_usage(
+      "iree-benchmark-executable",
+      "Benchmarks a single entry point within an executable library.\n"
+      "The parameters used can be inferred from the entry point "
+      "`hal.interface` and dispatches to it in the source program.\n"
+      "\n"
+      "Executables can be extracted from VMFB files using `unzip` or dumped\n"
+      "during compilation using --iree-hal-dump-executable-binaries-to=path/.\n"
+      "\n"
+      "The compiler can directly compile `hal.executable.source` and\n"
+      "`hal.executable` ops to the appropriate binaries by using the\n"
+      "`iree-compile --compile-mode=hal-executable` mode.\n"
+      "\n"
+      "Example flags for various compilation backends:\n"
+      "  --iree-hal-target-backends=vmvx\n"
+      "    --device=local-sync or --device=local-task\n"
+      "    --executable_format=vmvx-bytecode-fb\n"
+      "  --iree-hal-target-backends=llvm-cpu\n"
+      "    --device=local-sync or --device=local-task\n"
+      "    --executable_format=embedded-elf-x86_64\n"
+      "    --executable_format=system-dll-x86_64\n"
+      "  --iree-hal-target-backends=vulkan-spirv\n"
+      "    --device=vulkan\n"
+      "    --executable_format=vulkan-spirv-fb\n"
+      "\n"
+      "Note that this tool is intentionally low level: you must specify all\n"
+      "of the push constant/binding parameters precisely as they are expected\n"
+      "by the executable. `iree-benchmark-module` is the user-friendly\n"
+      "benchmarking tool while this one favors direct access to the\n"
+      "executables (bypassing all of the IREE VM, HAL APIs, task system,\n"
+      "etc).\n"
+      "\n"
+      "Example --flagfile:\n"
+      "  --device=local-sync\n"
+      "  --executable_format=embedded-elf-x86_64\n"
+      "  --executable_file=runtime/src/iree/hal/local/elf/testdata/"
+      "elementwise_mul_x86_64.so\n"
+      "  --entry_point=0\n"
+      "  --binding=4xf32=1,2,3,4\n"
+      "  --binding=4xf32=100,200,300,400\n"
+      "  --binding=4xf32=0,0,0,0\n"
+      "  --workgroup_count=1,1,1\n"
+      "\n");
+
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_UNDEFINED_OK, &argc, &argv);
+  iree_benchmark_initialize(&argc, argv);
+
+  iree_status_t status = iree_benchmark_executable_from_flags(host_allocator);
+  if (!iree_status_is_ok(status)) {
+    iree_status_fprint(stderr, status);
+    iree_status_free(status);
+    exit_code = EXIT_FAILURE;
+  }
+  fflush(stderr);
+
+  IREE_TRACE_ZONE_END(z0);
+  IREE_TRACE_APP_EXIT(exit_code);
+  return exit_code;
+}
diff --git a/tools/iree-convert-parameters-main.c b/tools/iree-convert-parameters-main.c
index 72b5a2d..460a61c 100644
--- a/tools/iree-convert-parameters-main.c
+++ b/tools/iree-convert-parameters-main.c
@@ -197,6 +197,7 @@
 }
 
 int main(int argc, char** argv) {
+  IREE_TRACE_APP_ENTER();
   IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_allocator_t host_allocator = iree_allocator_system();
@@ -288,5 +289,6 @@
   fflush(stderr);
 
   IREE_TRACE_ZONE_END(z0);
+  IREE_TRACE_APP_EXIT(exit_code);
   return exit_code;
 }
diff --git a/tools/iree-create-parameters-main.c b/tools/iree-create-parameters-main.c
index 81be97d..de628e2 100644
--- a/tools/iree-create-parameters-main.c
+++ b/tools/iree-create-parameters-main.c
@@ -219,6 +219,7 @@
 }
 
 int main(int argc, char** argv) {
+  IREE_TRACE_APP_ENTER();
   IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_allocator_t host_allocator = iree_allocator_system();
@@ -322,5 +323,6 @@
   fflush(stderr);
 
   IREE_TRACE_ZONE_END(z0);
+  IREE_TRACE_APP_EXIT(exit_code);
   return exit_code;
 }
diff --git a/tools/iree-dump-module-main.c b/tools/iree-dump-module-main.c
index 8d284bb..77f1881 100644
--- a/tools/iree-dump-module-main.c
+++ b/tools/iree-dump-module-main.c
@@ -549,6 +549,8 @@
           "  'flatbuffer-json': module flatbuffer in JSON format.\n");
 
 int main(int argc, char** argv) {
+  IREE_TRACE_APP_ENTER();
+
   iree_allocator_t host_allocator = iree_allocator_system();
   int exit_code = EXIT_SUCCESS;
 
@@ -560,6 +562,7 @@
 
   if (argc < 2) {
     fprintf(stderr, "Syntax: iree-dump-module [--output=...] module.vmfb\n");
+    IREE_TRACE_APP_EXIT(EXIT_FAILURE);
     return EXIT_FAILURE;
   }
 
@@ -610,5 +613,6 @@
     exit_code = EXIT_FAILURE;
   }
   fflush(stderr);
+  IREE_TRACE_APP_EXIT(exit_code);
   return exit_code;
 }
diff --git a/tools/iree-dump-parameters-main.c b/tools/iree-dump-parameters-main.c
index 545ce66..0092e0c 100644
--- a/tools/iree-dump-parameters-main.c
+++ b/tools/iree-dump-parameters-main.c
@@ -110,6 +110,7 @@
 //===----------------------------------------------------------------------===//
 
 int main(int argc, char** argv) {
+  IREE_TRACE_APP_ENTER();
   IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_allocator_t host_allocator = iree_allocator_system();
@@ -126,6 +127,7 @@
             "Use one or more --parameters=file.ext flags to specify parameter "
             "files.\n");
     IREE_TRACE_ZONE_END(z0);
+    IREE_TRACE_APP_EXIT(exit_code);
     return EXIT_FAILURE;
   }
 
@@ -164,5 +166,6 @@
   fflush(stderr);
 
   IREE_TRACE_ZONE_END(z0);
+  IREE_TRACE_APP_EXIT(exit_code);
   return exit_code;
 }
diff --git a/tools/test/BUILD.bazel b/tools/test/BUILD.bazel
index 14e39d8..f13a42f 100644
--- a/tools/test/BUILD.bazel
+++ b/tools/test/BUILD.bazel
@@ -25,6 +25,7 @@
             "executable_benchmarks.mlir",
             "executable_configurations.mlir",
             "executable_sources.mlir",
+            "iree-benchmark-executable.mlir",
             "iree-benchmark-module.mlir",
             "iree-dump-parameters.txt",
             "iree-run-mlir.mlir",
@@ -56,6 +57,7 @@
         "hostonly",
     ],
     tools = [
+        "//tools:iree-benchmark-executable",
         "//tools:iree-benchmark-module",
         "//tools:iree-compile",
         "//tools:iree-dump-parameters",
diff --git a/tools/test/CMakeLists.txt b/tools/test/CMakeLists.txt
index dcacb08..75dde66 100644
--- a/tools/test/CMakeLists.txt
+++ b/tools/test/CMakeLists.txt
@@ -21,6 +21,7 @@
     "executable_benchmarks.mlir"
     "executable_configurations.mlir"
     "executable_sources.mlir"
+    "iree-benchmark-executable.mlir"
     "iree-benchmark-module.mlir"
     "iree-dump-parameters.txt"
     "iree-run-mlir.mlir"
@@ -38,6 +39,7 @@
   TOOLS
     ${IREE_LLD_TARGET}
     FileCheck
+    iree-benchmark-executable
     iree-benchmark-module
     iree-compile
     iree-dump-parameters
diff --git a/tools/test/iree-benchmark-executable.mlir b/tools/test/iree-benchmark-executable.mlir
new file mode 100644
index 0000000..b54f61e
--- /dev/null
+++ b/tools/test/iree-benchmark-executable.mlir
@@ -0,0 +1,85 @@
+// Tests the iree-benchmark-executable tool against the portable VMVX target.
+// Other backends can be tested by using the appropriate compiler flags and
+// matching device and executable format flags.
+//
+// Examples:
+//   --iree-hal-target-backends=vmvx
+//     --device=local-sync or --device=local-task
+//     --executable_format=vmvx-bytecode-fb
+//   --iree-hal-target-backends=llvm-cpu
+//     --device=local-sync or --device=local-task
+//     --executable_format=embedded-elf-x86_64
+//     --executable_format=system-dll-x86_64
+//   --iree-hal-target-backends=vulkan-spirv
+//     --device=vulkan
+//     --executable_format=vulkan-spirv-fb
+
+// RUN: iree-compile \
+// RUN:     --compile-mode=hal-executable \
+// RUN:     --iree-hal-target-backends=vmvx \
+// RUN:     %s | \
+// RUN: iree-benchmark-executable \
+// RUN:     --device=local-sync \
+// RUN:     --executable_format=vmvx-bytecode-fb \
+// RUN:     --executable_file=- \
+// RUN:     --entry_point=0 \
+// RUN:     --binding=512xf32 \
+// RUN:     --binding=512xf32 \
+// RUN:     --binding=512xf32 \
+// RUN:     --workgroup_count=1,1,1 \
+// RUN:     --workgroup_count=512,1,1 | \
+// RUN: FileCheck %s
+
+// CHECK: BM_dispatch_1x1x1
+// CHECK: BM_dispatch_512x1x1
+
+// lhs * rhs => dst / s0b0 * s0b1 => s0b2
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable.source public @executable {
+  hal.executable.export public @elementwise_mul ordinal(0) layout(#pipeline_layout) attributes {
+    workgroup_size = [1 : index, 1 : index, 1 : index]
+  } {
+  ^bb0(%device: !hal.device):
+    // Unused - the workgroup count is provided to the tool.
+    %c1 = arith.constant 1 : index
+    hal.return %c1, %c1, %c1 : index, index, index
+  }
+  builtin.module {
+    func.func @elementwise_mul() {
+      %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<4xf32>>
+      %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<4xf32>>
+      %dst = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<4xf32>>
+      // TODO(#16554): GPU/SPIR-V lowering doesn't handle workgroup size queries.
+      // %workgroup_size_x = hal.interface.workgroup.size[0] : index
+      %workgroup_size_x = arith.constant 1 : index
+      %workgroup_id_x = hal.interface.workgroup.id[0] : index
+      %workgroup_count_x = hal.interface.workgroup.count[0] : index
+      %base_i = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
+      %step_i = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
+      %end_i = arith.constant 4 : index
+      scf.for %i = %base_i to %end_i step %step_i {
+        %remaining = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%i)[%workgroup_size_x]
+        %lhs_tile = flow.dispatch.tensor.load %lhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<?xf32>
+        %rhs_tile = flow.dispatch.tensor.load %rhs, offsets = [%i], sizes = [%remaining], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<?xf32>
+        %dst_init = tensor.empty(%remaining) : tensor<?xf32>
+        %dst_tile = linalg.generic {
+          indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
+          iterator_types = ["parallel"]
+        } ins(%lhs_tile, %rhs_tile : tensor<?xf32>, tensor<?xf32>)
+          outs(%dst_init : tensor<?xf32>) {
+          ^bb0(%lhs_value: f32, %rhs_value: f32, %init_value: f32):
+            %dst_value = arith.mulf %lhs_value, %rhs_value : f32
+            linalg.yield %dst_value : f32
+          } -> tensor<?xf32>
+        flow.dispatch.tensor.store %dst_tile, %dst, offsets = [%i], sizes = [%remaining], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
+      }
+      return
+    }
+  }
+}