tree: ecb5f0f674b81879da5fed40b33d835ae3de0c80
  1. abi/
  2. cts/
  3. device/
  4. registration/
  5. util/
  6. access_policy.c
  7. access_policy.h
  8. access_policy_test.cc
  9. allocator.c
  10. allocator.h
  11. allocator_test.cc
  12. api.h
  13. aql_block_processor.c
  14. aql_block_processor.h
  15. aql_block_processor_profile.c
  16. aql_block_processor_profile.h
  17. aql_block_processor_test.cc
  18. aql_block_processor_timestamp.c
  19. aql_block_processor_timestamp.h
  20. aql_block_processor_timestamp_test.cc
  21. aql_command_buffer.c
  22. aql_command_buffer.h
  23. aql_command_buffer_profile.c
  24. aql_command_buffer_profile.h
  25. aql_command_buffer_test.cc
  26. aql_prepublished_kernarg_storage.h
  27. aql_program_builder.c
  28. aql_program_builder.h
  29. aql_program_builder_test.cc
  30. aql_program_validation.c
  31. aql_program_validation.h
  32. buffer.c
  33. buffer.h
  34. BUILD.bazel
  35. CMakeLists.txt
  36. driver.c
  37. driver.h
  38. driver_options_test.cc
  39. executable.c
  40. executable.h
  41. executable_cache.c
  42. executable_cache.h
  43. executable_test.cc
  44. host_queue.c
  45. host_queue.h
  46. host_queue_blit.c
  47. host_queue_blit.h
  48. host_queue_command_buffer.c
  49. host_queue_command_buffer.h
  50. host_queue_command_buffer_block.c
  51. host_queue_command_buffer_block.h
  52. host_queue_command_buffer_packet.c
  53. host_queue_command_buffer_packet.h
  54. host_queue_command_buffer_profile.c
  55. host_queue_command_buffer_profile.h
  56. host_queue_command_buffer_replay.c
  57. host_queue_command_buffer_replay.h
  58. host_queue_command_buffer_scratch.h
  59. host_queue_command_buffer_test.cc
  60. host_queue_dispatch.c
  61. host_queue_dispatch.h
  62. host_queue_file.c
  63. host_queue_file.h
  64. host_queue_host_call.c
  65. host_queue_host_call.h
  66. host_queue_memory.c
  67. host_queue_memory.h
  68. host_queue_pending.c
  69. host_queue_pending.h
  70. host_queue_pending_operation.h
  71. host_queue_pending_payload.c
  72. host_queue_pending_test.cc
  73. host_queue_policy.c
  74. host_queue_policy.h
  75. host_queue_profile.c
  76. host_queue_profile.h
  77. host_queue_profile_events.c
  78. host_queue_profile_events.h
  79. host_queue_staging.c
  80. host_queue_staging.h
  81. host_queue_staging_test.cc
  82. host_queue_submission.c
  83. host_queue_submission.h
  84. host_queue_submission_test.cc
  85. host_queue_timestamp.c
  86. host_queue_timestamp.h
  87. host_queue_waits.c
  88. host_queue_waits.h
  89. logical_device.c
  90. logical_device.h
  91. physical_device.c
  92. physical_device.h
  93. physical_device_capabilities.c
  94. physical_device_capabilities.h
  95. physical_device_capabilities_test.cc
  96. profile_aqlprofile.c
  97. profile_aqlprofile.h
  98. profile_counters.c
  99. profile_counters.h
  100. profile_device_metrics.c
  101. profile_device_metrics.h
  102. profile_device_metrics_linux.c
  103. profile_device_metrics_source.h
  104. profile_events.c
  105. profile_events.h
  106. profile_events_test.cc
  107. profile_metadata.c
  108. profile_metadata.h
  109. profile_metadata_test.cc
  110. profile_traces.c
  111. profile_traces.h
  112. queue_affinity.c
  113. queue_affinity.h
  114. queue_affinity_test.cc
  115. README.md
  116. semaphore.c
  117. semaphore.h
  118. semaphore_test.cc
  119. slab_provider.c
  120. slab_provider.h
  121. slab_provider_test.cc
  122. system.c
  123. system.h
  124. system_test.cc
  125. transient_buffer.c
  126. transient_buffer.h
  127. virtual_queue.h
runtime/src/iree/hal/drivers/amdgpu/README.md

AMD GPU HAL Driver (amdgpu)

NOTE: the code is the authoritative documentation source. This document is an overview of the implementation and should be treated as informational only. See the linked files for details.

Quick Start

CMake

Configure CMake with the following options:

-DIREE_BUILD_COMPILER=ON
-DIREE_TARGET_BACKEND_ROCM=ON
-DIREE_HAL_DRIVER_AMDGPU=ON
-DIREE_HAL_AMDGPU_DEVICE_LIBRARY_TARGETS=all
-DIREE_ROCM_TEST_TARGET_CHIP=gfx1100

Bazel

Build tools with the AMDGPU runtime driver registered and with device artifacts compiled for your local GPU architecture:

iree-bazel-build //tools:iree-compile //tools:iree-run-module \
  --iree_drivers=amdgpu,cuda,hip,local-sync,local-task,vulkan \
  --//build_tools/bazel:rocm_test_target=gfx1100

The ROCM chip target defaults to gfx1100. Override for your hardware:

iree-bazel-test --//build_tools/bazel:rocm_test_target=gfx942 //runtime/src/iree/hal/drivers/amdgpu/cts/...

Replace the architecture with your own. See therock_amdgpu_targets.cmake for the target and generic family vocabulary mirrored by the embedded device library build.

Use amdgpu to specify devices at runtime:

# Single logical device with all available physical devices:
iree-run-module --device=amdgpu
# Device ordinal 0 (danger, this may change across reboots):
iree-run-module --device=amdgpu:0
# Device selected by its stable UUID:
iree-run-module --device=amdgpu://GPU-0e12865a3bf5b7ab
# Single logical device with the two devices given by their UUIDs:
iree-run-module --device=amdgpu://GPU-0e12865a3bf5b7ab,GPU-89e8bdf59a10cf6d
# Single logical device with physical devices with ordinals 2 and 3:
ROCR_VISIBLE_DEVICES=2,3 iree-run-module --device=amdgpu
# Two logical devices with two physical devices each:
iree-run-module --device=amdgpu://0,1 --device=amdgpu://2,3

Use amdgpu to specify the AMDGPU target when compiling programs:

iree-compile --iree-hal-target-device=amdgpu ...

For a direct Bazel-built smoke test, compile with the AMDGPU target device and run the resulting VMFB with the AMDGPU HAL driver:

bazel-bin/tools/iree-compile \
  --iree-input-type=stablehlo \
  --iree-hal-target-device=amdgpu \
  --iree-rocm-target=gfx1100 \
  --iree-rocm-bc-dir=bazel-bin/external/_main~iree_extension~amdgpu_device_libs/bitcode \
  tests/e2e/stablehlo_models/mnist_fake_weights.mlir \
  -o=/tmp/mnist_fake_amdgpu.vmfb

bazel-bin/tools/iree-run-module \
  --device=amdgpu \
  --module=/tmp/mnist_fake_amdgpu.vmfb \
  --function=predict \
  --input=1x28x28x1xf32 \
  --expected_output='1x10xf32=0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1'

Prefer this explicit two-step flow when debugging AMDGPU target-device behavior. iree-run-mlir --device=amdgpu currently relies on generic device-to-compiler flag inference and may select the legacy ROCm/HIP target path instead of the AMDGPU HAL target.

To capture a Tracy trace of the same runtime path, use the Bazel trace wrapper. The wrapper requires a tracy-capture binary in PATH, or one supplied through IREE_TRACY_CAPTURE:

IREE_TRACY_CAPTURE=/path/to/tracy-capture \
  build_tools/bin/iree-bazel-run \
  --trace \
  --trace_name=fake_mnist \
  //tools:iree-run-module \
  --iree_drivers=amdgpu,cuda,hip,local-sync,local-task,vulkan \
  --//build_tools/bazel:rocm_test_target=gfx1100 \
  -- \
  --device=amdgpu \
  --module=/tmp/mnist_fake_amdgpu.vmfb \
  --function=predict \
  --input=1x28x28x1xf32 \
  --expected_output='1x10xf32=0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1'

Driver Shape

The AMDGPU HAL driver is a native HSA/ROCR backend. It does not route through HIP or the legacy ROCm HAL driver. The major runtime objects are:

  • a driver that discovers HSA agents and creates logical devices;
  • one logical device spanning one or more physical GPU devices;
  • one physical-device object per HSA GPU agent, including queues, memory pools, executable caches, profiling state, and device metrics;
  • host queues that translate HAL queue operations into AQL packet streams;
  • replayable command buffers that store backend command records and emit AQL packets at submission time; and
  • device libraries embedded in the runtime and selected for the target GPU ISA at device creation.

Normal execution only depends on the HSA runtime and the embedded device library. Optional profiling modes dynamically load ROCm profiling libraries only when the selected mode needs them.

Profiling and Replay

The AMDGPU driver is one of the primary producers for IREE's HAL-native profiling and replay tools:

  • --device_profiling_mode=queue-events,device-queue-events,dispatch-events captures queue submissions, device-side queue timing, and per-dispatch timing into a .ireeprof bundle.
  • --device_profiling_mode=executable-metadata,counters adds executable export metadata and selected hardware/software counters.
  • --device_profiling_mode=executable-traces captures heavy ATT/SQTT artifacts for filtered dispatches.
  • --device_replay_output=/tmp/model.ireereplay records a HAL-level replay stream that can be run, benchmarked, and profiled independently of the original application.

Useful inspection commands:

iree-profile summary /tmp/model.ireeprof
iree-profile dispatch --format=jsonl /tmp/model.ireeprof
iree-profile export --format=ireeperf-jsonl \
  --output=/tmp/model.ireeperf.jsonl /tmp/model.ireeprof
uvx --with perfetto --with protobuf python "$(command -v iree-profile-render)" \
  --format=perfetto \
  /tmp/model.ireeperf.jsonl -o /tmp/model.pftrace
iree-profile att --rocm_library_path=/opt/rocm/lib /tmp/model-att.ireeprof

See the website documentation for the full profiling and replay workflows.

Build Notes

HSA/ROCR Dependency

We maintain a fork of the HSA headers required for compilation as third_party/hsa-runtime-headers/. This fork may also contain tweaks, not yet upstreamed, that are required to use the headers in our build.

We require that at runtime a dynamic library named libhsa-runtime64.so exists on the search path. This can be overridden programmatically when constructing the driver, via the --amdgpu_libhsa_search_path= flag when using the command line tools, via the IREE_HAL_AMDGPU_LIBHSA_PATH environment variable, or by adding a directory containing the file to PATH.

It's recommended that developers check out a copy of the ROCR-Runtime and build it locally in whatever configuration they are using (debug/release/ASAN/etc). This allows for easier debugging and profiling as symbols are present and may be required to get recent features not available in platform installs. Eventually IREE will ship its own copy of the library (directly or indirectly) as part of the install packages such that only a relatively recent AMDGPU driver is required.

See HSA/ROCR Library for more information on our usage.

ROCm Profiling Dependencies

Counter and ATT/SQTT capture use ROCm's aqlprofile library through a small dynamic-loader shim. Normal execution, queue timing, dispatch timing, replay, statistics, and Perfetto export do not require this library.

When counters or executable-traces profiling is requested, the driver looks for an aqlprofile-compatible library in this order:

  • IREE_HAL_AMDGPU_LIBAQLPROFILE_PATH;
  • a library adjacent to the loaded HSA runtime; and
  • the platform dynamic-library search path.

The iree-profile att decoder also needs ROCm decode libraries. Pass --rocm_library_path=/opt/rocm/lib, set IREE_HAL_AMDGPU_LIBAQLPROFILE_PATH, or rely on the platform search path.

Device Library Compilation

Required CMake Options: -DIREE_BUILD_COMPILER=ON -DIREE_TARGET_BACKEND_ROCM=ON

Top-level Build Target: iree_hal_drivers_amdgpu_device_binaries

Currently IREE's CMake configuration must have the compiler enabled in order to build the runtime including the AMDGPU HAL implementation. This will be made better in the future (allowing for just building what we need instead of the full MLIR stack, using an existing ROCM install, etc). See Device Library for more information.

The device library should be compiled automatically when building the AMDGPU HAL driver and gets embedded inside the runtime binary so that no additional files are required at runtime.

The IREE_HAL_AMDGPU_DEVICE_LIBRARY_TARGETS CMake variable defaults to all, which embeds LLVM generic ISA code objects covering every currently known AMDGPU device library target. Packagers can set it to a smaller list of exact target architectures, LLVM generic ISA targets, TheRock-style generic target families, or TheRock-style product bundles. Exact targets use the HSA ISA spelling, such as gfx1100. LLVM generic ISA targets use spellings such as gfx11-generic. Generic families use the TheRock family spelling, such as gfx110X-all, and product bundles use spellings such as dgpu-all or igpu-all. These selectors expand to the smallest known compatible code object set instead of one code object per exact GPU. Architectures not built into the library will fail to instantiate the driver at runtime.

The Bazel build exposes the same selector vocabulary through //runtime/src/iree/hal/drivers/amdgpu/device/binaries:targets:

iree-bazel-build --//runtime/src/iree/hal/drivers/amdgpu/device/binaries:targets=igpu-all //runtime/src/iree/hal/drivers/amdgpu:amdgpu

See device/binaries/README.md for the target map update flow, the generated Bazel/CMake/runtime fragments, and the TheRock/LLVM sources that should be checked when adding support for a new architecture.