Adding state functionality to iree-run-trace and improving ergonomics. (#12534)

In order to support executing pipelines where outputs of one call are
passed into another, the trace replay functionality has grown slightly
closer to Turing complete (and loops are definitely coming :) by
obtaining input/output control, numpy npy file access, and a blackboard
for temporary values. A test demonstrating the file format and some
`--help` info has been added to `iree-run-trace` to at least have a
reference not generated by Python and ensure it mostly works.

---

The new `!input.get`/`!input.take`/`!output.set`/`!output.push` macros
can be used in any source sequence such as function call arguments.
These will either get (assign semantics) or take (move semantics) a
value from the input list and set or push a value to the output list.
`iree-run-trace` now supports the same `--input=`/`--output=` flags as
`iree-run-module` and they define the input/output handling for the
whole trace pipeline as if calling a single function.

```yaml
type: call
function: module.fn
# pass the first two `--input=` flag values and a constant
args:
- !input.take 0
- !input.take 1
- !hal.buffer_view 4xf32=0,1,2,3
# store the two results into `--output=` 0 and 1 (pushing)
results:
- !output.set 0
- !output.push
```

---

In addition to the input/output lists there's also a user-defined
blackboard that provides storage for the duration of the trace. Slots
can be set by using `!blackboard.set`/`!blackboard.push` on any target
sequence such as function call results and later retrieved in any source
sequence with `!blackboard.get`/`!blackboard.take`.

```yaml
# save call results to the blackboard
type: call
function: module.return_two_things
results:
- !blackboard.push
- !blackboard.push
---
# load prior results from the blackboard
type: call
function: module.consume_three_things
args:
- !input.take 0
- !blackboard.take 0
- !blackboard.take 1
```

---

The `--input=`/`--output=` flag style works for pipeline-style traces
while larger traces may need programmatic control over I/O and the
blackboard. The `numpy_load` and `numpy_save` events have been added
which allow for loading or saving one or more `arrays` to a .npy file
`path`. This can be used to stream outputs during processing, either by
using the `append: true` node when saving or by sharding to different files.

```yaml
# load blackboard slot 3 and 4 from a .npy file
type: numpy_load
path: input.npy
arrays:
- !blackboard.set 3
- !blackboard.set 4
---
# save a few arrays to a .npy file
type: numpy_save
path: output.npy
append: false
arrays:
- !blackboard.get 3
- !input.get 0
- !hal.buffer_view 4xf32=0,1,2,3
```

---

There are some helpers that'd be useful to add (enqueue/dequeue, pop, etc.)
that could make it easier to write more complex pipelines. The
blackboard could also be changed to use a hash table so that string
keys could be used instead of just ordinals.

Fixes #12525.
Fixes #12526.
diff --git a/.github/workflows/benchmark_execution.yml b/.github/workflows/benchmark_execution.yml
index 2b7bcaf..d317f5e 100644
--- a/.github/workflows/benchmark_execution.yml
+++ b/.github/workflows/benchmark_execution.yml
@@ -165,19 +165,15 @@
       - name: "Running benchmarks"
         id: run
         env:
-          BENCHMARK_CONFIG: ${{ steps.download-assets.outputs.benchmark-config }}
+          IREE_EXECUTION_BENCHMARK_CONFIG: ${{ steps.download-assets.outputs.benchmark-config }}
           IREE_DOCKER_WRAPPER: ./build_tools/github_actions/docker_run.sh
           IREE_NORMAL_BENCHMARK_TOOLS_DIR: ${{ steps.unpack-tools.outputs.normal-benchmark-tools-dir }}
           IREE_TRACED_BENCHMARK_TOOLS_DIR: ${{ steps.unpack-tools.outputs.traced-benchmark-tools-dir }}
-          IREE_DEVICE_NAME: ${{ env.DEVICE_NAME }}
+          IREE_TARGET_DEVICE_NAME: ${{ env.DEVICE_NAME }}
           IREE_E2E_TEST_ARTIFACTS_DIR: ${{ env.E2E_TEST_ARTIFACTS_DIR }}
-          IREE_RUN_CONFIG: run-config.json
           IREE_BENCHMARK_RESULTS: ${{ env.BENCHMARK_RESULTS_DIR }}/benchmark-results-${{ matrix.benchmark.device_name }}.json
         run: |
           mkdir -p ${BENCHMARK_RESULTS_DIR}
-          jq --arg DEVICE_NAME "${IREE_DEVICE_NAME}" \
-            '.[$DEVICE_NAME] | .run_configs' \
-            "${BENCHMARK_CONFIG}" > "${IREE_RUN_CONFIG}"
           ./build_tools/benchmarks/run_benchmarks.sh
           echo "benchmark-results=${IREE_BENCHMARK_RESULTS}" >> "${GITHUB_OUTPUT}"
       - name: "Uploading benchmark results"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c34cc8c..902ff37 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -65,7 +65,7 @@
       BASE_REF: HEAD^
     outputs:
       should-run: ${{ steps.configure.outputs.should-run }}
-      ci-stage: ${{ steps.configure.outputs.ci-stage }}
+      is-pr: ${{ steps.configure.outputs.is-pr }}
       runner-env: ${{ steps.configure.outputs.runner-env }}
       runner-group: ${{ steps.configure.outputs.runner-group }}
       write-caches: ${{ steps.configure.outputs.write-caches }}
@@ -126,7 +126,7 @@
   ##############################################################################
   build_all:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -191,7 +191,7 @@
 
   build_test_all_windows:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true' && needs.setup.outputs.ci-stage == 'postsubmit'
+    if: fromJson(needs.setup.outputs.should-run) && ! fromJson(needs.setup.outputs.is-pr)
     runs-on: managed-windows-cpu
     defaults:
       run:
@@ -258,7 +258,7 @@
 
   build_test_all_macos:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true' && needs.setup.outputs.ci-stage == 'postsubmit'
+    if: fromJson(needs.setup.outputs.should-run) && ! fromJson(needs.setup.outputs.is-pr)
     runs-on:
       - ${{ github.repository == 'openxla/iree' && 'self-hosted' || 'macos-11' }}  # must come first
       - runner-group=postsubmit
@@ -291,7 +291,7 @@
 
   build_test_all_bazel:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -314,7 +314,7 @@
 
   test_all:
     needs: [setup, build_all]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -344,7 +344,7 @@
 
   test_gpu:
     needs: [setup, build_all]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -387,7 +387,7 @@
   ##############################################################################
   build_test_runtime:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on: ubuntu-20.04-64core
     env:
       BUILD_DIR: build-runtime
@@ -415,7 +415,7 @@
 
   build_test_runtime_windows:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on: managed-windows-cpu
     defaults:
       run:
@@ -444,7 +444,7 @@
   ##############################################################################
   build_tf_integrations:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -491,7 +491,7 @@
 
   test_tf_integrations:
     needs: [setup, build_all, build_tf_integrations]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -530,7 +530,7 @@
 
   test_tf_integrations_gpu:
     needs: [setup, build_all, build_tf_integrations]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -578,7 +578,7 @@
   # TODO(#11263): Drop this job once the IREE_BUILD_BENCHMARKS is removed.
   test_build_benchmark_suites:
     needs: [setup, build_all, build_tf_integrations]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -621,7 +621,7 @@
   ##############################################################################
   python_release_packages:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -662,7 +662,7 @@
 
   asan:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -690,7 +690,7 @@
 
   tsan:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -715,7 +715,7 @@
 
   small_runtime:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on: ubuntu-20.04-64core
     env:
       BUILD_DIR: build-runtime
@@ -742,7 +742,7 @@
 
   gcc:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -774,7 +774,7 @@
 
   tracing:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -803,7 +803,7 @@
 
   debug:
     needs: setup
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -836,7 +836,7 @@
 
   build_benchmark_tools:
     needs: [setup, build_all]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -919,7 +919,7 @@
 
   build_e2e_test_artifacts:
     needs: [setup, build_all, build_tf_integrations]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -987,7 +987,7 @@
 
   compilation_benchmarks:
     needs: [setup, build_e2e_test_artifacts]
-    if: needs.setup.outputs.should-run == 'true' && needs.setup.outputs.benchmark-presets != ''
+    if: fromJson(needs.setup.outputs.should-run) && needs.setup.outputs.benchmark-presets != ''
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -1036,12 +1036,11 @@
           GENERATION_CONFIG: generation-config.json
           COMPILE_STATS_RESULTS: benchmark-results/compile-stats-results.json
         run: |
-          jq '.generation_configs' "${COMPILATION_CONFIG}" > "${GENERATION_CONFIG}"
           mkdir -p benchmark-results
           ./build_tools/benchmarks/collect_compilation_statistics.py alpha \
             --e2e_test_artifacts_dir="${E2E_TEST_ARTIFACTS_DIR}" \
             --build_log="${E2E_TEST_ARTIFACTS_BUILD_LOG}" \
-            --generation_config="${GENERATION_CONFIG}" \
+            --compilation_benchmark_config="${COMPILATION_CONFIG}" \
             --output="${COMPILE_STATS_RESULTS}"
           echo "compile-stats-results=${COMPILE_STATS_RESULTS}" >> "${GITHUB_OUTPUT}"
       - name: "Uploading benchmark results"
@@ -1061,7 +1060,7 @@
 
   execution_benchmarks:
     needs: [setup, build_benchmark_tools, build_e2e_test_artifacts]
-    if: needs.setup.outputs.should-run == 'true' && needs.setup.outputs.benchmark-presets != ''
+    if: fromJson(needs.setup.outputs.should-run) && needs.setup.outputs.benchmark-presets != ''
     uses: ./.github/workflows/benchmark_execution.yml
     with:
       # env.GCS_DIR is also duplicated in this workflow. See the note there on
@@ -1075,7 +1074,7 @@
 
   process_benchmark_results:
     needs: [setup, compilation_benchmarks, execution_benchmarks]
-    if: needs.setup.outputs.should-run == 'true' && needs.setup.outputs.benchmark-presets != ''
+    if: fromJson(needs.setup.outputs.should-run) && needs.setup.outputs.benchmark-presets != ''
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -1112,7 +1111,7 @@
             "${EXECUTION_BENCHMARK_RESULTS_DIR}"
           echo "execution-benchmark-results-pattern=${EXECUTION_BENCHMARK_RESULTS_DIR}/benchmark-results-*.json" >> "${GITHUB_OUTPUT}"
       - name: Generating comment
-        if: needs.setup.outputs.ci-stage == 'presubmit'
+        if: fromJson(needs.setup.outputs.is-pr)
         id: generate-comment
         env:
           # Wildcard pattern to match all execution benchmark results. Empty if
@@ -1139,7 +1138,7 @@
         # Due to security reasons, instead of posting the comment to PR, we only
         # upload the comment data in presubmit workflow and trigger the posting
         # workflow on the main branch. See post_benchmark_comment.yaml
-        if: needs.setup.outputs.ci-stage == 'presubmit'
+        if: fromJson(needs.setup.outputs.is-pr)
         env:
           BENCHMARK_COMMENT_ARTIFACT: ${{ steps.generate-comment.outputs.benchmark-comment-artifact }}
           BENCHMARK_COMMENT_GCS_ARTIFACT: ${{ env.GCS_DIR }}/${{ steps.generate-comment.outputs.benchmark-comment-artifact }}
@@ -1148,7 +1147,7 @@
             "${BENCHMARK_COMMENT_ARTIFACT}" \
             "${BENCHMARK_COMMENT_GCS_ARTIFACT}"
       - name: Uploading results to dashboard
-        if: needs.setup.outputs.ci-stage == 'postsubmit'
+        if: github.ref_name == 'main'
         env:
           EXECUTION_BENCHMARK_RESULTS_PATTERN: ${{ steps.download-execution-results.outputs.execution-benchmark-results-pattern }}
           IREE_DASHBOARD_API_TOKEN: ${{ secrets.IREE_DASHBOARD_API_TOKEN }}
@@ -1167,7 +1166,7 @@
 
   cross_compile_and_test:
     needs: [setup, build_all]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -1256,7 +1255,7 @@
 
   test_benchmark_suites:
     needs: [setup, build_all, build_e2e_test_artifacts]
-    if: needs.setup.outputs.should-run == 'true'
+    if: fromJson(needs.setup.outputs.should-run)
     runs-on:
       - self-hosted  # must come first
       - runner-group=${{ needs.setup.outputs.runner-group }}
@@ -1393,7 +1392,7 @@
           fi
       - name: Posting to Discord
         uses: sarisia/actions-status-discord@c193626e5ce172002b8161e116aa897de7ab5383 # v1.10.2
-        if: failure() && needs.setup.outputs.ci-stage == 'postsubmit'
+        if: failure() && github.ref_name == 'main'
         with:
           webhook: ${{ secrets.DISCORD_WEBHOOK }}
           description: "The following jobs failed: ${{ steps.failed_jobs.outputs.failed-jobs }}"
diff --git a/.github/workflows/run_shark_tank.yml b/.github/workflows/run_shark_tank.yml
index d1a4dc6..6c8f768 100644
--- a/.github/workflows/run_shark_tank.yml
+++ b/.github/workflows/run_shark_tank.yml
@@ -20,7 +20,6 @@
       PR_BODY: ${{ github.event.pull_request.body }}
     outputs:
       should-run: ${{ steps.configure.outputs.should-run }}
-      ci-stage: ${{ steps.configure.outputs.ci-stage }}
       runner-env: ${{ steps.configure.outputs.runner-env }}
       runner-group: ${{ steps.configure.outputs.runner-group }}
       write-caches: ${{ steps.configure.outputs.write-caches }}
diff --git a/build_tools/benchmarks/collect_compilation_statistics.py b/build_tools/benchmarks/collect_compilation_statistics.py
index bc91dd2..3550065 100755
--- a/build_tools/benchmarks/collect_compilation_statistics.py
+++ b/build_tools/benchmarks/collect_compilation_statistics.py
@@ -133,11 +133,13 @@
   return module_path
 
 
-def get_module_map_from_generation_config(
-    serialized_gen_config: TextIO, e2e_test_artifacts_dir: pathlib.PurePath
+def get_module_map_from_compilation_benchmark_config(
+    compilation_benchmark_config_data: TextIO,
+    e2e_test_artifacts_dir: pathlib.PurePath
 ) -> Dict[CompilationInfo, pathlib.Path]:
+  benchmark_config = json.load(compilation_benchmark_config_data)
   gen_configs = serialization.unpack_and_deserialize(
-      data=json.load(serialized_gen_config),
+      data=benchmark_config["generation_configs"],
       root_type=List[iree_definitions.ModuleGenerationConfig])
   module_map = {}
   for gen_config in gen_configs:
@@ -203,8 +205,9 @@
 
 
 def _alpha_get_module_map_and_build_log(args: argparse.Namespace):
-  module_map = get_module_map_from_generation_config(
-      serialized_gen_config=args.generation_config.open("r"),
+  config_data = args.compilation_benchmark_config.open("r")
+  module_map = get_module_map_from_compilation_benchmark_config(
+      compilation_benchmark_config_data=config_data,
       e2e_test_artifacts_dir=args.e2e_test_artifacts_dir)
   return module_map, args.build_log
 
@@ -257,10 +260,10 @@
   alpha_parser.set_defaults(
       get_module_map_and_build_log=_alpha_get_module_map_and_build_log)
   alpha_parser.add_argument(
-      "--generation_config",
+      "--compilation_benchmark_config",
       type=_check_file_path,
       required=True,
-      help="Exported module generation config of e2e test artifacts.")
+      help="Exported compilation benchmark config of e2e test artifacts.")
   alpha_parser.add_argument("--build_log",
                             type=_check_file_path,
                             required=True,
diff --git a/build_tools/benchmarks/collect_compilation_statistics_test.py b/build_tools/benchmarks/collect_compilation_statistics_test.py
index e03e4f2..ac96bdc 100644
--- a/build_tools/benchmarks/collect_compilation_statistics_test.py
+++ b/build_tools/benchmarks/collect_compilation_statistics_test.py
@@ -92,7 +92,7 @@
 
     self.assertEqual(moduel_path, "/abcd-compile-stats.vmfb")
 
-  def test_get_module_map_from_generation_config(self):
+  def test_get_module_map_from_compilation_benchmark_config(self):
     model_a = common_definitions.Model(
         id="1234",
         name="tflite_m",
@@ -126,12 +126,14 @@
         imported_model=imported_model_a, compile_config=compile_config_a)
     gen_config_b = iree_definitions.ModuleGenerationConfig.with_flag_generation(
         imported_model=imported_model_a, compile_config=compile_config_b)
-    serialized_gen_config = json.dumps(
-        serialization.serialize_and_pack([gen_config_a, gen_config_b]))
+    benchmark_config = dict(generation_configs=serialization.serialize_and_pack(
+        [gen_config_a, gen_config_b]),
+                            module_dir_paths=["a", "b"])
     root_dir = pathlib.PurePath("artifacts_dir")
 
-    module_map = collect_compilation_statistics.get_module_map_from_generation_config(
-        serialized_gen_config=StringIO(serialized_gen_config),
+    module_map = collect_compilation_statistics.get_module_map_from_compilation_benchmark_config(
+        compilation_benchmark_config_data=StringIO(
+            json.dumps(benchmark_config)),
         e2e_test_artifacts_dir=root_dir)
 
     compile_info_a = common.benchmark_definition.CompilationInfo(
diff --git a/build_tools/benchmarks/common/benchmark_config.py b/build_tools/benchmarks/common/benchmark_config.py
index 2677857..2d2bc59 100644
--- a/build_tools/benchmarks/common/benchmark_config.py
+++ b/build_tools/benchmarks/common/benchmark_config.py
@@ -113,7 +113,7 @@
     else:
       # TODO(#11076): Remove legacy path.
       build_dir = args.build_dir.resolve()
-      if args.run_config is not None:
+      if args.execution_benchmark_config is not None:
         root_benchmark_dir = build_dir / E2E_TEST_ARTIFACTS_REL_PATH
       else:
         root_benchmark_dir = build_dir / BENCHMARK_SUITE_REL_PATH
diff --git a/build_tools/benchmarks/common/benchmark_config_test.py b/build_tools/benchmarks/common/benchmark_config_test.py
index 7bef77a..e432cf5 100644
--- a/build_tools/benchmarks/common/benchmark_config_test.py
+++ b/build_tools/benchmarks/common/benchmark_config_test.py
@@ -93,13 +93,15 @@
 
   def test_build_from_args_with_e2e_test_artifacts_dir(self):
     with tempfile.TemporaryDirectory() as e2e_test_artifacts_dir:
-      run_config = pathlib.Path(e2e_test_artifacts_dir) / "run_config.json"
-      run_config.touch()
+      exec_bench_config = pathlib.Path(
+          e2e_test_artifacts_dir) / "exec_bench_config.json"
+      exec_bench_config.touch()
       args = common_arguments.Parser().parse_args([
           f"--tmp_dir={self.tmp_dir}",
           f"--normal_benchmark_tool_dir={self.normal_tool_dir}",
           f"--e2e_test_artifacts_dir={e2e_test_artifacts_dir}",
-          f"--run_config={run_config}"
+          f"--execution_benchmark_config={exec_bench_config}",
+          f"--target_device_name=device_a",
       ])
 
       config = benchmark_config.BenchmarkConfig.build_from_args(
@@ -108,14 +110,16 @@
       self.assertEqual(config.root_benchmark_dir,
                        pathlib.Path(e2e_test_artifacts_dir))
 
-  def test_build_from_args_with_run_config_and_build_dir(self):
+  def test_build_from_args_with_execution_benchmark_config_and_build_dir(self):
     with tempfile.TemporaryDirectory() as e2e_test_artifacts_dir:
-      run_config = pathlib.Path(e2e_test_artifacts_dir) / "run_config.json"
-      run_config.touch()
+      exec_bench_config = pathlib.Path(
+          e2e_test_artifacts_dir) / "exec_bench_config.json"
+      exec_bench_config.touch()
       args = common_arguments.Parser().parse_args([
           f"--tmp_dir={self.tmp_dir}",
           f"--normal_benchmark_tool_dir={self.normal_tool_dir}",
-          f"--run_config={run_config}",
+          f"--execution_benchmark_config={exec_bench_config}",
+          f"--target_device_name=device_a",
           str(self.build_dir)
       ])
 
diff --git a/build_tools/benchmarks/common/common_arguments.py b/build_tools/benchmarks/common/common_arguments.py
index ea14265..b6fc8f0 100644
--- a/build_tools/benchmarks/common/common_arguments.py
+++ b/build_tools/benchmarks/common/common_arguments.py
@@ -59,7 +59,7 @@
         help=(
             "Path to the IREE e2e test artifacts directory. This will override "
             "<build-dir> and eventually replace it. For now must use with "
-            "--run_config"))
+            "--execution_benchmark_config"))
 
     self.add_argument(
         "--normal_benchmark_tool_dir",
@@ -155,17 +155,33 @@
         "for). In that case, no --benchmark_repetitions flag will be passed."
         " If not specified, a --benchmark_repetitions will be passed "
         "instead.")
-    self.add_argument("--run_config",
+    self.add_argument("--execution_benchmark_config",
                       type=_check_file_path,
                       default=None,
-                      help="JSON file of the run config")
+                      help="JSON config for the execution benchmarks")
+    self.add_argument("--target_device_name",
+                      type=str,
+                      default=None,
+                      help="Target device in benchmark config to run")
 
   def parse_args(
       self, arg_strs: Optional[Sequence[str]] = None) -> argparse.Namespace:
     args = super().parse_args(arg_strs)
 
-    if args.e2e_test_artifacts_dir is not None and args.run_config is None:
-      raise self.error("--e2e_test_artifacts_dir requires --run_config.")
+    # TODO(#11076): Remove these checks and make --execution_benchmark_config
+    # and --target_device_name required args.
+    use_new_benchmark_suite = (args.execution_benchmark_config is not None or
+                               args.target_device_name is not None)
+    if use_new_benchmark_suite:
+      if (args.execution_benchmark_config is None or
+          args.target_device_name is None):
+        self.error(
+            "--execution_benchmark_config and --target_device_name must be set together."
+        )
+    elif args.e2e_test_artifacts_dir is not None:
+      self.error(
+          "--e2e_test_artifacts_dir requires --execution_benchmark_config and --target_device_name."
+      )
 
     return args
 
diff --git a/build_tools/benchmarks/common/common_arguments_test.py b/build_tools/benchmarks/common/common_arguments_test.py
index 143a850..1d9a3a6 100644
--- a/build_tools/benchmarks/common/common_arguments_test.py
+++ b/build_tools/benchmarks/common/common_arguments_test.py
@@ -42,12 +42,19 @@
     with self.assertRaises(SystemExit):
       arg_parser.parse_args(["--trace_capture_tool=nonexistent", "."])
 
-  def test_parser_e2e_test_artifacts_dir_requires_run_config(self):
+  def test_parser_e2e_test_artifacts_dir_needs_execution_benchmark_config(self):
     arg_parser = common.common_arguments.Parser()
     with tempfile.TemporaryDirectory() as tempdir:
       with self.assertRaises(SystemExit):
         arg_parser.parse_args([f"--e2e_test_artifacts_dir={tempdir}"])
 
+  def test_parser_only_execution_benchmark_config_or_target_device_name(self):
+    arg_parser = common.common_arguments.Parser()
+    with self.assertRaises(SystemExit):
+      arg_parser.parse_args([f"--execution_benchmark_config"])
+    with self.assertRaises(SystemExit):
+      arg_parser.parse_args([f"--target_device_name"])
+
 
 if __name__ == "__main__":
   unittest.main()
diff --git a/build_tools/benchmarks/run_benchmarks.sh b/build_tools/benchmarks/run_benchmarks.sh
index 991973f..7b8694b 100755
--- a/build_tools/benchmarks/run_benchmarks.sh
+++ b/build_tools/benchmarks/run_benchmarks.sh
@@ -20,11 +20,11 @@
 set -euo pipefail
 
 DOCKER_WRAPPER="${IREE_DOCKER_WRAPPER:-./build_tools/docker/docker_run.sh}"
-DEVICE_NAME="${IREE_DEVICE_NAME}"
 NORMAL_BENCHMARK_TOOLS_DIR="${IREE_NORMAL_BENCHMARK_TOOLS_DIR}"
 E2E_TEST_ARTIFACTS_DIR="${1:-${IREE_E2E_TEST_ARTIFACTS_DIR}}"
-RUN_CONFIG="${2:-${IREE_RUN_CONFIG}}"
-BENCHMARK_RESULTS="${3:-${IREE_BENCHMARK_RESULTS}}"
+EXECUTION_BENCHMARK_CONFIG="${2:-${IREE_EXECUTION_BENCHMARK_CONFIG}}"
+TARGET_DEVICE_NAME="${3:-${IREE_TARGET_DEVICE_NAME}}"
+BENCHMARK_RESULTS="${4:-${IREE_BENCHMARK_RESULTS}}"
 
 if [[ "${DEVICE_NAME}" == "a2-highgpu-1g" ]]; then
   ${DOCKER_WRAPPER} \
@@ -34,7 +34,8 @@
       ./build_tools/benchmarks/run_benchmarks_on_linux.py \
         --normal_benchmark_tool_dir="${NORMAL_BENCHMARK_TOOLS_DIR}" \
         --e2e_test_artifacts_dir="${E2E_TEST_ARTIFACTS_DIR}" \
-        --run_config="${RUN_CONFIG}" \
+        --execution_benchmark_config="${EXECUTION_BENCHMARK_CONFIG}" \
+        --target_device_name="${TARGET_DEVICE_NAME}" \
         --output="${BENCHMARK_RESULTS}" \
         --verbose
 elif [[ "${DEVICE_NAME}" == "c2-standard-16" ]]; then
@@ -43,7 +44,8 @@
       ./build_tools/benchmarks/run_benchmarks_on_linux.py \
         --normal_benchmark_tool_dir="${NORMAL_BENCHMARK_TOOLS_DIR}" \
         --e2e_test_artifacts_dir="${E2E_TEST_ARTIFACTS_DIR}" \
-        --run_config="${RUN_CONFIG}" \
+        --execution_benchmark_config="${EXECUTION_BENCHMARK_CONFIG}" \
+        --target_device_name="${TARGET_DEVICE_NAME}" \
         --output="${BENCHMARK_RESULTS}" \
         --device_model=GCP-c2-standard-16 \
         --cpu_uarch=CascadeLake \
diff --git a/build_tools/benchmarks/run_benchmarks_on_android.py b/build_tools/benchmarks/run_benchmarks_on_android.py
index 0ff954d..3891da4 100755
--- a/build_tools/benchmarks/run_benchmarks_on_android.py
+++ b/build_tools/benchmarks/run_benchmarks_on_android.py
@@ -339,7 +339,7 @@
   if args.verbose:
     print(device_info)
 
-  if args.run_config is not None:
+  if args.execution_benchmark_config is not None:
     raise ValueError("Run config option isn't supported yet.")
 
   commit = get_git_commit_hash("HEAD")
diff --git a/build_tools/benchmarks/run_benchmarks_on_linux.py b/build_tools/benchmarks/run_benchmarks_on_linux.py
index e8a022c..d8b43ce 100755
--- a/build_tools/benchmarks/run_benchmarks_on_linux.py
+++ b/build_tools/benchmarks/run_benchmarks_on_linux.py
@@ -147,14 +147,17 @@
   commit = get_git_commit_hash("HEAD")
   benchmark_config = BenchmarkConfig.build_from_args(args, commit)
 
-  if args.run_config is None:
+  if args.execution_benchmark_config is None:
     # TODO(#11076): Remove legacy path.
     benchmark_suite = BenchmarkSuite.load_from_benchmark_suite_dir(
         benchmark_config.root_benchmark_dir)
   else:
-    run_config_data = json.loads(args.run_config.read_text())
+    benchmark_groups = json.loads(args.execution_benchmark_config.read_text())
+    benchmark_group = benchmark_groups.get(args.target_device_name)
+    if benchmark_group is None:
+      raise ValueError("Target device not found in the benchmark config.")
     run_configs = serialization.unpack_and_deserialize(
-        data=run_config_data,
+        data=benchmark_group["run_configs"],
         root_type=typing.List[iree_definitions.E2EModelRunConfig])
     benchmark_suite = BenchmarkSuite.load_from_run_configs(
         run_configs=run_configs)
diff --git a/build_tools/cmake/iree_python.cmake b/build_tools/cmake/iree_python.cmake
index 5aaab3f..b5dcdd3 100644
--- a/build_tools/cmake/iree_python.cmake
+++ b/build_tools/cmake/iree_python.cmake
@@ -279,6 +279,9 @@
     "ARGS;LABELS;TIMEOUT"
     ${ARGN}
   )
+  if(NOT IREE_BUILD_PYTHON_BINDINGS)
+    return()
+  endif()
 
   iree_local_py_test(
     NAME
diff --git a/build_tools/github_actions/configure_ci.py b/build_tools/github_actions/configure_ci.py
index 49ac80d..04309a2 100755
--- a/build_tools/github_actions/configure_ci.py
+++ b/build_tools/github_actions/configure_ci.py
@@ -29,15 +29,12 @@
 
 import difflib
 import fnmatch
+import json
 import os
 import subprocess
 import textwrap
 from typing import Iterable, Mapping, MutableMapping
 
-PULL_REQUEST_EVENT_NAME = "pull_request"
-PUSH_EVENT_NAME = "push"
-SCHEDULE_EVENT_NAME = "schedule"
-WORKFLOW_DISPATCH_EVENT_NAME = "workflow_dispatch"
 SKIP_CI_KEY = "skip-ci"
 RUNNER_ENV_KEY = "runner-env"
 BENCHMARK_PRESET_KEY = "benchmarks"
@@ -86,7 +83,7 @@
   print(f"Setting outputs: {d}")
   step_output_file = os.environ["GITHUB_OUTPUT"]
   with open(step_output_file, "a") as f:
-    f.writelines(f"{k}={v}" "\n" for k, v in d.items())
+    f.writelines(f"{k}={v}" + "\n" for k, v in d.items())
 
 
 def write_job_summary(summary: str):
@@ -121,7 +118,9 @@
   </details>""").format("".join(diffs)))
 
 
-def get_trailers() -> Mapping[str, str]:
+def get_trailers(is_pr: bool) -> Mapping[str, str]:
+  if not is_pr:
+    return {}
   title = os.environ["PR_TITLE"]
   body = os.environ.get("PR_BODY", "")
   original_title = os.environ.get("ORIGINAL_PR_TITLE")
@@ -167,10 +166,10 @@
   return any(not skip_path(p) for p in get_modified_paths(base_ref))
 
 
-def should_run_ci(event_name, trailers) -> bool:
-  if event_name != PULL_REQUEST_EVENT_NAME:
-    print(f"Running CI independent of diff because run was not triggered by a"
-          f" pull request event (event name is '{event_name}')")
+def should_run_ci(is_pr: bool, trailers: Mapping[str, str]) -> bool:
+  if not is_pr:
+    print("Running CI independent of diff because run was not triggered by a"
+          " pull request event.")
     return True
 
   if SKIP_CI_KEY in trailers:
@@ -204,19 +203,7 @@
   return runner_env
 
 
-def get_ci_stage(event_name):
-  if event_name == PULL_REQUEST_EVENT_NAME:
-    return "presubmit"
-  elif event_name == PUSH_EVENT_NAME:
-    return "postsubmit"
-  elif event_name == SCHEDULE_EVENT_NAME:
-    return "postsubmit"
-  elif event_name == WORKFLOW_DISPATCH_EVENT_NAME:
-    return "unknown"
-  raise ValueError(f"Unrecognized event name '{event_name}'")
-
-
-def get_benchmark_presets(ci_stage: str, trailers: Mapping[str, str]) -> str:
+def get_benchmark_presets(is_pr: bool, trailers: Mapping[str, str]) -> str:
   """Parses and validates the benchmark presets from trailers.
 
   Args:
@@ -226,7 +213,7 @@
     A comma separated preset string, which later will be parsed by
     build_tools/benchmarks/export_benchmark_config.py.
   """
-  if ci_stage == "postsubmit":
+  if not is_pr:
     preset_options = ["all"]
   else:
     trailer = trailers.get(BENCHMARK_PRESET_KEY)
@@ -248,22 +235,16 @@
 
 
 def main():
-  output: MutableMapping[str, str] = {}
-  event_name = os.environ["GITHUB_EVENT_NAME"]
-  trailers = get_trailers() if event_name == PULL_REQUEST_EVENT_NAME else {}
-  if should_run_ci(event_name, trailers):
-    output["should-run"] = "true"
-  else:
-    output["should-run"] = "false"
-  output[RUNNER_ENV_KEY] = get_runner_env(trailers)
-  ci_stage = get_ci_stage(event_name)
-  output["ci-stage"] = ci_stage
-  output["runner-group"] = ci_stage
-  write_caches = "0"
-  if ci_stage == "postsubmit":
-    write_caches = "1"
-  output["write-caches"] = write_caches
-  output["benchmark-presets"] = get_benchmark_presets(ci_stage, trailers)
+  is_pr = os.environ["GITHUB_EVENT_NAME"] == "pull_request"
+  trailers = get_trailers(is_pr)
+  output = {
+      "should-run": json.dumps(should_run_ci(is_pr, trailers)),
+      "is-pr": json.dumps(is_pr),
+      "runner-env": get_runner_env(trailers),
+      "runner-group": "presubmit" if is_pr else "postsubmit",
+      "write-caches": "0" if is_pr else "1",
+      "benchmark-presets": get_benchmark_presets(is_pr, trailers),
+  }
 
   set_output(output)
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 44ec121..30186c6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -215,6 +215,9 @@
   nestedModulePM.addNestedPass<func::FuncOp>(
       createOptimizeVectorTransferPass());
 
+  // Hoist loop invariant code to avoid pipelining it.
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createLoopInvariantCodeMotionPass());
   // Pipeline memory operations.
   nestedModulePM.addNestedPass<func::FuncOp>(createGPUPipeliningPass());
 }
@@ -270,6 +273,9 @@
   nestedModulePM.addPass(createCanonicalizerPass());
   nestedModulePM.addPass(createCSEPass());
 
+  // Hoist loop invariant code to avoid pipelining it.
+  nestedModulePM.addNestedPass<func::FuncOp>(
+      createLoopInvariantCodeMotionPass());
   PipeliningSchedulingStrategy schedule =
       llvmgpuUseMMASync ? PipeliningSchedulingStrategy::nvidiaTensorCore
                         : PipeliningSchedulingStrategy::loadGlobalStage0;
diff --git a/runtime/src/iree/base/internal/cpu.c b/runtime/src/iree/base/internal/cpu.c
index 3bc25f1..66f784c 100644
--- a/runtime/src/iree/base/internal/cpu.c
+++ b/runtime/src/iree/base/internal/cpu.c
@@ -221,39 +221,6 @@
 }
 
 //===----------------------------------------------------------------------===//
-// Architecture-specific string lookup
-//===----------------------------------------------------------------------===//
-
-#define IREE_TEST_FIELD_BIT(field_key, field_value, bit_value)          \
-  if (iree_string_view_equal(key, IREE_SV(field_key))) {                \
-    *out_value = iree_all_bits_set((field_value), (bit_value)) ? 1 : 0; \
-    return true;                                                        \
-  }
-
-#if defined(IREE_ARCH_ARM_64)
-
-static bool iree_cpu_lookup_data_by_key_for_arch(
-    const uint64_t* fields, iree_string_view_t key,
-    int64_t* IREE_RESTRICT out_value) {
-  IREE_TEST_FIELD_BIT("dotprod", fields[0], IREE_CPU_DATA0_ARM_64_DOTPROD);
-  IREE_TEST_FIELD_BIT("i8mm", fields[0], IREE_CPU_DATA0_ARM_64_I8MM);
-  return false;
-}
-
-#else
-
-static bool iree_cpu_lookup_data_by_key_for_arch(
-    const uint64_t* fields, iree_string_view_t key,
-    int64_t* IREE_RESTRICT out_value) {
-  // Not yet implemented for this architecture.
-  return false;
-}
-
-#endif  // IREE_ARCH_*
-
-#undef IREE_TEST_FIELD_BIT
-
-//===----------------------------------------------------------------------===//
 // Processor data query
 //===----------------------------------------------------------------------===//
 
@@ -289,15 +256,30 @@
              sizeof(*out_fields));
 }
 
+//===----------------------------------------------------------------------===//
+// Processor data lookup by key
+//===----------------------------------------------------------------------===//
+
 iree_status_t iree_cpu_lookup_data_by_key(iree_string_view_t key,
                                           int64_t* IREE_RESTRICT out_value) {
-  if (!iree_cpu_lookup_data_by_key_for_arch(iree_cpu_data_cache_, key,
-                                            out_value)) {
-    return iree_make_status(IREE_STATUS_NOT_FOUND,
-                            "CPU data key '%.*s' not found", (int)key.size,
-                            key.data);
+#define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name)  \
+  if (IREE_ARCH_ENUM == IREE_ARCH_ENUM_##arch) {                               \
+    if (iree_string_view_equal(key, IREE_SV(llvm_name))) {                     \
+      *out_value = iree_all_bits_set(                                          \
+                       (iree_cpu_data_cache_[field_index]),                    \
+                       IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name)) \
+                       ? 1                                                     \
+                       : 0;                                                    \
+      return iree_ok_status();                                                 \
+    }                                                                          \
   }
-  return iree_ok_status();
+#include "iree/schemas/cpu_feature_bits.inl"
+#undef IREE_CPU_FEATURE_BIT
+
+  return iree_make_status(
+      IREE_STATUS_NOT_FOUND,
+      "CPU feature '%.*s' unknown on this architecture (%s)", (int)key.size,
+      key.data, IREE_ARCH);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/runtime/src/iree/base/target_platform.h b/runtime/src/iree/base/target_platform.h
index 1abd987..376fb27 100644
--- a/runtime/src/iree/base/target_platform.h
+++ b/runtime/src/iree/base/target_platform.h
@@ -11,6 +11,7 @@
 // one platform+architecture pair for that platform.
 //
 // IREE_ARCH ("arm_32", "arm_64", etc)
+// IREE_ARCH_ENUM (IREE_ARCH_ENUM_ARM_32, etc)
 // IREE_ARCH_ARM_32
 // IREE_ARCH_ARM_64
 // IREE_ARCH_RISCV_32
@@ -53,41 +54,60 @@
 // IREE_ARCH_*
 //==============================================================================
 
+enum iree_arch_enum_e {
+  IREE_ARCH_ENUM_ARM_32,
+  IREE_ARCH_ENUM_ARM_64,
+  IREE_ARCH_ENUM_RISCV_32,
+  IREE_ARCH_ENUM_RISCV_64,
+  IREE_ARCH_ENUM_WASM_32,
+  IREE_ARCH_ENUM_WASM_64,
+  IREE_ARCH_ENUM_X86_32,
+  IREE_ARCH_ENUM_X86_64,
+};
+
 #if defined(__arm__) || defined(__arm64) || defined(__aarch64__) || \
     defined(__thumb__) || defined(__TARGET_ARCH_ARM) ||             \
     defined(__TARGET_ARCH_THUMB) || defined(_M_ARM)
 #if defined(__arm64) || defined(__aarch64__)
 #define IREE_ARCH "arm_64"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_ARM_64
 #define IREE_ARCH_ARM_64 1
 #else
 #define IREE_ARCH "arm_32"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_ARM_32
 #define IREE_ARCH_ARM_32 1
 #endif  // __arm64
 #endif  // ARM
 
 #if defined(__riscv) && (__riscv_xlen == 32)
 #define IREE_ARCH "riscv_32"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_RISCV_32
 #define IREE_ARCH_RISCV_32 1
 #elif defined(__riscv) && (__riscv_xlen == 64)
 #define IREE_ARCH "riscv_64"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_RISCV_64
 #define IREE_ARCH_RISCV_64 1
 #endif  // RISCV
 
 #if defined(__wasm32__)
 #define IREE_ARCH "wasm_32"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_WASM_32
 #define IREE_ARCH_WASM_32 1
 #elif defined(__wasm64__)
 #define IREE_ARCH "wasm_64"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_WASM_64
 #define IREE_ARCH_WASM_64 1
 #endif  // WASM
 
 #if defined(__i386__) || defined(__i486__) || defined(__i586__) || \
     defined(__i686__) || defined(__i386) || defined(_M_IX86) || defined(_X86_)
 #define IREE_ARCH "x86_32"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_X86_32
 #define IREE_ARCH_X86_32 1
 #elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64__) || \
     defined(__amd64) || defined(_M_X64)
 #define IREE_ARCH "x86_64"
+#define IREE_ARCH_ENUM IREE_ARCH_ENUM_X86_64
 #define IREE_ARCH_X86_64 1
 #endif  // X86
 
diff --git a/runtime/src/iree/schemas/BUILD b/runtime/src/iree/schemas/BUILD
index 6f4464a..33af0e2 100644
--- a/runtime/src/iree/schemas/BUILD
+++ b/runtime/src/iree/schemas/BUILD
@@ -70,5 +70,6 @@
     name = "cpu_data",
     hdrs = [
         "cpu_data.h",
+        "cpu_feature_bits.inl",
     ],
 )
diff --git a/runtime/src/iree/schemas/CMakeLists.txt b/runtime/src/iree/schemas/CMakeLists.txt
index 78a8e9a..5e96962 100644
--- a/runtime/src/iree/schemas/CMakeLists.txt
+++ b/runtime/src/iree/schemas/CMakeLists.txt
@@ -93,6 +93,7 @@
     cpu_data
   HDRS
     "cpu_data.h"
+    "cpu_feature_bits.inl"
   DEPS
 
   PUBLIC
diff --git a/runtime/src/iree/schemas/cpu_data.h b/runtime/src/iree/schemas/cpu_data.h
index 50a4ab3..46df305 100644
--- a/runtime/src/iree/schemas/cpu_data.h
+++ b/runtime/src/iree/schemas/cpu_data.h
@@ -60,58 +60,16 @@
 // in the future.
 #define IREE_CPU_DATA_FIELD_COUNT 8
 
+#define IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name) \
+  IREE_CPU_DATA##field_index##_##arch##_##bit_name
+
 // Bitmasks and values for processor data field 0.
 enum iree_cpu_data_field_0_e {
 
-  //===--------------------------------------------------------------------===//
-  // IREE_ARCH_ARM_64 / aarch64
-  //===--------------------------------------------------------------------===//
-
-  // TODO: add several common ARM ISA extensions and allocate some ranges of
-  // bits for some families/eras. If we just start out with bits 0 and 1
-  // allocated for dotprod and i8mm, we are quickly going to have a hard-to-read
-  // enumeration here.
-  IREE_CPU_DATA0_ARM_64_DOTPROD = 1ull << 0,
-  IREE_CPU_DATA0_ARM_64_I8MM = 1ull << 1,
-
-  //===--------------------------------------------------------------------===//
-  // IREE_ARCH_X86_64 / x86-64
-  //===--------------------------------------------------------------------===//
-
-  // SSE features. Note: SSE and SSE2 are mandatory parts of X86-64.
-  IREE_CPU_DATA0_X86_64_SSE3 = 1ull << 0,
-  IREE_CPU_DATA0_X86_64_SSSE3 = 1ull << 1,
-  IREE_CPU_DATA0_X86_64_SSE41 = 1ull << 2,
-  IREE_CPU_DATA0_X86_64_SSE42 = 1ull << 3,
-  IREE_CPU_DATA0_X86_64_SSE4A = 1ull << 4,
-
-  // AVX features.
-  IREE_CPU_DATA0_X86_64_AVX = 1ull << 10,
-  IREE_CPU_DATA0_X86_64_FMA3 = 1ull << 11,
-  IREE_CPU_DATA0_X86_64_FMA4 = 1ull << 12,
-  IREE_CPU_DATA0_X86_64_XOP = 1ull << 13,
-  IREE_CPU_DATA0_X86_64_F16C = 1ull << 14,
-  IREE_CPU_DATA0_X86_64_AVX2 = 1ull << 15,
-
-  // AVX-512 features.
-  IREE_CPU_DATA0_X86_64_AVX512F = 1ull << 20,
-  IREE_CPU_DATA0_X86_64_AVX512CD = 1ull << 21,
-  IREE_CPU_DATA0_X86_64_AVX512VL = 1ull << 22,
-  IREE_CPU_DATA0_X86_64_AVX512DQ = 1ull << 23,
-  IREE_CPU_DATA0_X86_64_AVX512BW = 1ull << 24,
-  IREE_CPU_DATA0_X86_64_AVX512IFMA = 1ull << 25,
-  IREE_CPU_DATA0_X86_64_AVX512VBMI = 1ull << 26,
-  IREE_CPU_DATA0_X86_64_AVX512VPOPCNTDQ = 1ull << 27,
-  IREE_CPU_DATA0_X86_64_AVX512VNNI = 1ull << 28,
-  IREE_CPU_DATA0_X86_64_AVX512VBMI2 = 1ull << 29,
-  IREE_CPU_DATA0_X86_64_AVX512BITALG = 1ull << 30,
-  IREE_CPU_DATA0_X86_64_AVX512BF16 = 1ull << 31,
-  IREE_CPU_DATA0_X86_64_AVX512FP16 = 1ull << 32,
-
-  // AMX features.
-  IREE_CPU_DATA0_X86_64_AMXTILE = 1ull << 50,
-  IREE_CPU_DATA0_X86_64_AMXINT8 = 1ull << 51,
-  IREE_CPU_DATA0_X86_64_AMXBF16 = 1ull << 52,
+#define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name) \
+  IREE_CPU_FEATURE_BIT_NAME(arch, field_index, bit_name) = 1ull << (bit_pos),
+#include "iree/schemas/cpu_feature_bits.inl"
+#undef IREE_CPU_FEATURE_BIT
 
 };
 
diff --git a/runtime/src/iree/schemas/cpu_feature_bits.inl b/runtime/src/iree/schemas/cpu_feature_bits.inl
new file mode 100644
index 0000000..93ef0a9
--- /dev/null
+++ b/runtime/src/iree/schemas/cpu_feature_bits.inl
@@ -0,0 +1,87 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+//===----------------------------------------------------------------------===//
+// CPU features: IREE cpu_data bits and mapping to LLVM target attribute keys.
+//===----------------------------------------------------------------------===//
+//
+// Refer to the file comment in cpu_data.h. Summary:
+// - This is included in both compiler and runtime.
+// - Unconditionally define CPU features for all target architectures, not just
+//   host, because this is needed by the compiler when targeting non-host.
+// - The bit values will soon be set in stone, because they will be encoded in
+//   generated modules.
+// - Try to pack related features in the same cpu_data field and in nearby bits
+//   if possible, on a best-effort basis.
+
+#ifndef IREE_CPU_FEATURE_BIT
+#error Define IREE_CPU_FEATURE_BIT before including this file.
+#endif
+
+// Format:
+//   IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, "llvm_name")
+//
+// Where:
+//   - `arch` is the CPU architecture that this CPU feature applies to, in
+//     IREE's uppercase convention (e.g. ARM_64, X86_64; see IREE_ARCH_*).
+//   - `field_index` is the index into the array returned by `iree_cpu_data_fields()`.
+//     Allowed values range from 0 to (IREE_CPU_DATA_FIELD_COUNT-1).
+//   - `bit_pos` is the position of the feature bit within that cpu data field.
+//     As these fields are uint64_t, the range of `bit_pos` is 0..63.
+//   - `bit_name` is the suffix to use to form the IREE C identifier for this
+//     feature's bit value.
+//   - `llvm_name` is the string name of the corresponding LLVM target attribute
+//     (without a leading +).
+
+//===----------------------------------------------------------------------===//
+// IREE_ARCH_ARM_64 / aarch64
+//===----------------------------------------------------------------------===//
+
+// TODO: add several common ARM ISA extensions and allocate some ranges of
+// bits for some families/eras. If we just start out with bits 0 and 1
+// allocated for dotprod and i8mm, we are quickly going to have a hard-to-read
+// enumeration here.
+IREE_CPU_FEATURE_BIT(ARM_64, 0, 0, DOTPROD, "dotprod")
+IREE_CPU_FEATURE_BIT(ARM_64, 0, 1, I8MM, "i8mm")
+
+//===----------------------------------------------------------------------===//
+// IREE_ARCH_X86_64 / x86-64
+//===----------------------------------------------------------------------===//
+
+// SSE features. Note: SSE and SSE2 are mandatory parts of X86-64.
+IREE_CPU_FEATURE_BIT(X86_64, 0, 0, SSE3, "sse3")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 1, SSSE3, "ssse3")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 2, SSE41, "sse4.1")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 3, SSE42, "sse4.2")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 4, SSE4A, "sse4a")
+
+// AVX features.
+IREE_CPU_FEATURE_BIT(X86_64, 0, 10, AVX, "avx")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 11, FMA3, "fma")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 12, FMA4, "fma4")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 13, XOP, "xop")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 14, F16C, "f16c")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 15, AVX2, "avx2")
+
+// AVX-512 features.
+IREE_CPU_FEATURE_BIT(X86_64, 0, 20, AVX512F, "avx512f")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 21, AVX512CD, "avx512cd")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 22, AVX512VL, "avx512vl")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 23, AVX512DQ, "avx512dq")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 24, AVX512BW, "avx512bw")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 25, AVX512IFMA, "avx512ifma")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 26, AVX512VBMI, "avx512vbmi")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 27, AVX512VPOPCNTDQ, "avx512vpopcntdq")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 28, AVX512VNNI, "avx512vnni")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 29, AVX512VBMI2, "avx512vbmi2")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 30, AVX512BITALG, "avx512bitalg")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 31, AVX512BF16, "avx512bf16")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 32, AVX512FP16, "avx512fp16")
+
+// AMX features.
+IREE_CPU_FEATURE_BIT(X86_64, 0, 50, AMXTILE, "amx-tile")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 51, AMXINT8, "amx-int8")
+IREE_CPU_FEATURE_BIT(X86_64, 0, 52, AMXBF16, "amx-bf16")
diff --git a/tools/BUILD b/tools/BUILD
index fff940c..3a37662 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -212,6 +212,16 @@
 )
 
 cc_binary(
+    name = "iree-cpuinfo",
+    srcs = ["iree-cpuinfo.c"],
+    deps = [
+        "//runtime/src/iree/base",
+        "//runtime/src/iree/base/internal:cpu",
+        "//runtime/src/iree/schemas:cpu_data",
+    ],
+)
+
+cc_binary(
     name = "iree-tblgen",
     srcs = [
         "//compiler/src/iree/compiler/Dialect/VM/Tools:GenSrcs",
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index ea8d47c..50e56d1 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -187,6 +187,17 @@
     yaml
 )
 
+iree_cc_binary(
+  NAME
+    iree-cpuinfo
+  SRCS
+    "iree-cpuinfo.c"
+  DEPS
+    iree::base
+    iree::base::internal::cpu
+    iree::schemas::cpu_data
+)
+
 if(IREE_BUILD_COMPILER)
   # If a target backend that requires LLD to link codegen executables is
   # enabled, install the target.
diff --git a/tools/iree-cpuinfo.c b/tools/iree-cpuinfo.c
new file mode 100644
index 0000000..d740465
--- /dev/null
+++ b/tools/iree-cpuinfo.c
@@ -0,0 +1,32 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <stdio.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/cpu.h"
+
+// Prints one line per CPU feature listed for the architecture this tool was
+// compiled for: the LLVM target attribute name followed by the value returned
+// by iree_cpu_lookup_data_by_key() for that name.
+int main(int argc, char *argv[]) {
+  iree_cpu_initialize(iree_allocator_system());
+
+  // Expand the shared feature table; entries belonging to other architectures
+  // are filtered out by the IREE_ARCH_ENUM comparison. The int64_t result is
+  // cast to long long because the printf length modifier matching int64_t
+  // differs between platforms.
+#define IREE_CPU_FEATURE_BIT(arch, field_index, bit_pos, bit_name, llvm_name) \
+  if (IREE_ARCH_ENUM == IREE_ARCH_ENUM_##arch) {                              \
+    int64_t result = 0;                                                       \
+    IREE_CHECK_OK(iree_cpu_lookup_data_by_key(IREE_SV(llvm_name), &result));  \
+    printf("%-20s %lld\n", llvm_name, (long long)result);                     \
+  }
+#include "iree/schemas/cpu_feature_bits.inl"
+#undef IREE_CPU_FEATURE_BIT
+
+  return 0;
+}