[HAL/AMDGPU] Centralize physical topology edge selection

Move the physical source/destination topology edge decision table into
physical_device_capabilities.*, separating HSA fact collection from policy
selection. logical_device.c now queries memory-pool access and link-hop
records, then feeds a pure selector that records coarse/fine access,
grant-required peer access, PCIe/xGMI/etc. link flags, coherency, 32-bit and
64-bit atomics, link class/cost/NUMA distance, and the derived HAL topology
modes/capabilities.

This keeps the queue, command-buffer, and copy hot paths free of new
buffer-snooping or recurring validation while giving future SDMA/P2P strategy
selection named cold-path facts to consume. Unsupported copy strategies remain
feature slots rather than implicit queue-path branches.

Add synthetic coverage for xGMI, PCIe without coherent/system-atomic support,
multi-hop worst-case collapse, grant-required peer memory, the no-access
host-staged fallback, and invalid HSA fact inputs.
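To make the split concrete, a rough cold-path sketch of how a caller hands
already-queried HSA facts to the new selector follows; the single-hop xGMI
values are made up for illustration and the snippet is not a standalone
compilation unit:

    // Synthetic facts standing in for what logical_device.c gathers via
    // iree_hsa_amd_agent_memory_pool_get_info (values are illustrative only).
    hsa_amd_memory_pool_link_info_t hop = {0};
    hop.link_type = HSA_AMD_LINK_INFO_TYPE_XGMI;
    hop.numa_distance = 16;
    hop.coherent_support = true;
    hop.atomic_support_32bit = true;
    hop.atomic_support_64bit = true;

    iree_hal_amdgpu_physical_topology_edge_selection_t selection = {
        .memory_access =
            {
                .coarse = HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT,
                .fine = HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT,
            },
        .link = {.hops = &hop, .count = 1},
    };

    // Pure policy selection: no HSA calls, just fact -> edge derivation.
    iree_hal_amdgpu_physical_topology_edge_t edge;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge));

    // edge.link.link_class / edge.link.copy_cost and edge.capabilities.* are
    // now named cold-path facts for future SDMA/P2P strategy selection.

Because the selector touches no HSA handles, the tests in
physical_device_capabilities_test.cc can drive it entirely with synthetic
link-hop records.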
diff --git a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
index 9098d84..b7194f6 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
@@ -1642,31 +1642,6 @@ // Maximum number of HSA memory-pool link hops we will stack-allocate. #define IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS 16 -typedef struct iree_hal_amdgpu_physical_topology_edge_t { - // Source-agent access to the destination coarse-grained memory pool. - hsa_amd_memory_pool_access_t coarse_access; - // Source-agent access to the destination fine-grained memory pool. - hsa_amd_memory_pool_access_t fine_access; - // True when |coarse_access| permits some direct device access. - bool coarse_accessible; - // True when |fine_access| permits some direct device access. - bool fine_accessible; - // True when every HSA-reported link hop supports coherent transactions. - bool all_hops_coherent; - // True when every HSA-reported link hop supports 32-bit atomics. - bool all_hops_atomic_32bit; - // True when every HSA-reported link hop supports 64-bit atomics. - bool all_hops_atomic_64bit; - // Worst physical link class across the reported HSA link hops. - iree_hal_topology_link_class_t link_class; - // Conservative copy-cost class derived from |link_class|. - uint8_t copy_cost; - // Conservative latency class derived from |link_class|. - uint8_t latency_class; - // Worst normalized NUMA distance reported by HSA link hops. - uint8_t numa_distance; -} iree_hal_amdgpu_physical_topology_edge_t; - typedef struct iree_hal_amdgpu_topology_edge_aggregate_t { // Physical capability facts produced by cross-pair aggregation. struct { @@ -1693,133 +1668,11 @@ uint8_t numa_distance; } iree_hal_amdgpu_topology_edge_aggregate_t; -// Maps an HSA link type to a HAL topology link class. -// For multi-hop links, the caller should take the worst (highest) class. -static iree_hal_topology_link_class_t iree_hal_amdgpu_link_type_to_link_class( - hsa_amd_link_info_type_t link_type) { - switch (link_type) { - case HSA_AMD_LINK_INFO_TYPE_XGMI: - return IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF; - case HSA_AMD_LINK_INFO_TYPE_PCIE: - return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT; - case HSA_AMD_LINK_INFO_TYPE_QPI: - case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT: - // Cross-socket interconnects — treat as cross-root PCIe. - return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT; - case HSA_AMD_LINK_INFO_TYPE_INFINBAND: - return IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC; - default: - return IREE_HAL_TOPOLOGY_LINK_CLASS_OTHER; - } -} - -static void iree_hal_amdgpu_topology_costs_from_link_class( - iree_hal_topology_link_class_t link_class, uint8_t* out_copy_cost, - uint8_t* out_latency_class) { - switch (link_class) { - case IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE: - *out_copy_cost = 0; - *out_latency_class = 0; - break; - case IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF: - *out_copy_cost = 3; - *out_latency_class = 3; - break; - case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT: - *out_copy_cost = 7; - *out_latency_class = 7; - break; - case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT: - *out_copy_cost = 9; - *out_latency_class = 9; - break; - case IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED: - *out_copy_cost = 13; - *out_latency_class = 11; - break; - case IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC: - *out_copy_cost = 15; - *out_latency_class = 14; - break; - case IREE_HAL_TOPOLOGY_LINK_CLASS_ISOLATED: - *out_copy_cost = 15; - *out_latency_class = 15; - break; - default: - *out_copy_cost = 11; - *out_latency_class = 10; - break; - } -} - -static uint8_t iree_hal_amdgpu_topology_scale_hsa_numa_distance( - uint32_t hsa_numa_distance) { - if (hsa_numa_distance == 0) return 0; - uint32_t scaled = hsa_numa_distance > 10 ? 
(hsa_numa_distance - 10) / 2 : 0; - return (uint8_t)iree_min(scaled, 15u); -} - -static iree_status_t iree_hal_amdgpu_validate_memory_pool_access( - hsa_amd_memory_pool_access_t access, const char* pool_kind) { - if (IREE_LIKELY(iree_hal_amdgpu_memory_pool_access_is_valid(access))) { - return iree_ok_status(); - } - return iree_make_status(IREE_STATUS_OUT_OF_RANGE, - "HSA reported unknown %s memory pool access mode %u", - pool_kind, (uint32_t)access); -} - -static iree_hal_topology_capability_t -iree_hal_amdgpu_physical_topology_guaranteed_capabilities( - const iree_hal_amdgpu_physical_topology_edge_t* physical_edge) { - iree_hal_topology_capability_t capabilities = - IREE_HAL_TOPOLOGY_CAPABILITY_NONE; - if (!physical_edge->coarse_accessible && !physical_edge->fine_accessible) { - return capabilities; - } - capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY; - if (physical_edge->all_hops_coherent) { - capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT; - } - if (physical_edge->all_hops_atomic_32bit) { - capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE; - } - if (physical_edge->all_hops_atomic_64bit) { - capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM; - } - return capabilities; -} - -static iree_hal_topology_capability_t -iree_hal_amdgpu_physical_topology_required_capabilities( - const iree_hal_amdgpu_physical_topology_edge_t* physical_edge) { - iree_hal_topology_capability_t capabilities = - IREE_HAL_TOPOLOGY_CAPABILITY_NONE; - capabilities |= iree_hal_amdgpu_memory_pool_access_topology_capabilities( - physical_edge->coarse_access); - capabilities |= iree_hal_amdgpu_memory_pool_access_topology_capabilities( - physical_edge->fine_access); - return capabilities; -} - -static void iree_hal_amdgpu_physical_topology_edge_initialize( - iree_hal_amdgpu_physical_topology_edge_t* out_physical_edge) { - memset(out_physical_edge, 0, sizeof(*out_physical_edge)); - out_physical_edge->coarse_access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; - out_physical_edge->fine_access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; - out_physical_edge->all_hops_coherent = true; - out_physical_edge->all_hops_atomic_32bit = true; - out_physical_edge->all_hops_atomic_64bit = true; - out_physical_edge->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE; -} - static iree_status_t iree_hal_amdgpu_query_physical_topology_edge( const iree_hal_amdgpu_libhsa_t* libhsa, const iree_hal_amdgpu_physical_device_t* source_physical_device, const iree_hal_amdgpu_physical_device_t* destination_physical_device, iree_hal_amdgpu_physical_topology_edge_t* out_physical_edge) { - iree_hal_amdgpu_physical_topology_edge_initialize(out_physical_edge); - hsa_agent_t source_agent = source_physical_device->device_agent; hsa_agent_t destination_agent = destination_physical_device->device_agent; @@ -1839,27 +1692,24 @@ "destination agent has neither coarse nor fine global memory pool"); } + iree_hal_amdgpu_physical_topology_edge_selection_t selection = { + .memory_access = + { + .coarse = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, + .fine = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, + }, + }; if (has_coarse_pool) { IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info( IREE_LIBHSA(libhsa), source_agent, dst_coarse_pool, HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &out_physical_edge->coarse_access)); - IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_memory_pool_access( - out_physical_edge->coarse_access, "coarse")); + &selection.memory_access.coarse)); } if (has_fine_pool) { 
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info( IREE_LIBHSA(libhsa), source_agent, dst_fine_pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, - &out_physical_edge->fine_access)); - IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_memory_pool_access( - out_physical_edge->fine_access, "fine")); + HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &selection.memory_access.fine)); } - out_physical_edge->coarse_accessible = - out_physical_edge->coarse_access != - HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; - out_physical_edge->fine_accessible = out_physical_edge->fine_access != - HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; // Query link hop count and topology. The link topology describes the // interconnect between agents and is the same regardless of pool granularity; @@ -1878,51 +1728,21 @@ hop_count, (iree_host_size_t)IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS); } + hsa_amd_memory_pool_link_info_t + link_hops[IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS]; + memset(link_hops, 0, sizeof(link_hops[0]) * hop_count); if (hop_count > 0) { // The LINK_INFO query writes exactly hop_count entries into the caller's // buffer with no separate size parameter. - hsa_amd_memory_pool_link_info_t - link_info[IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS]; - memset(link_info, 0, sizeof(link_info[0]) * hop_count); IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info( IREE_LIBHSA(libhsa), source_agent, link_query_pool, - HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info)); - - for (uint32_t i = 0; i < hop_count; ++i) { - iree_hal_topology_link_class_t hop_class = - iree_hal_amdgpu_link_type_to_link_class(link_info[i].link_type); - if (hop_class > out_physical_edge->link_class) { - out_physical_edge->link_class = hop_class; - } - uint8_t numa_distance = iree_hal_amdgpu_topology_scale_hsa_numa_distance( - link_info[i].numa_distance); - if (numa_distance > out_physical_edge->numa_distance) { - out_physical_edge->numa_distance = numa_distance; - } - if (!link_info[i].coherent_support) { - out_physical_edge->all_hops_coherent = false; - } - if (!link_info[i].atomic_support_32bit) { - out_physical_edge->all_hops_atomic_32bit = false; - } - if (!link_info[i].atomic_support_64bit) { - out_physical_edge->all_hops_atomic_64bit = false; - } - } + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_hops)); } - if (!out_physical_edge->coarse_accessible && - !out_physical_edge->fine_accessible) { - out_physical_edge->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED; - out_physical_edge->all_hops_coherent = false; - out_physical_edge->all_hops_atomic_32bit = false; - out_physical_edge->all_hops_atomic_64bit = false; - } - - iree_hal_amdgpu_topology_costs_from_link_class( - out_physical_edge->link_class, &out_physical_edge->copy_cost, - &out_physical_edge->latency_class); - return iree_ok_status(); + selection.link.hops = link_hops; + selection.link.count = hop_count; + return iree_hal_amdgpu_select_physical_topology_edge(&selection, + out_physical_edge); } static void iree_hal_amdgpu_topology_edge_aggregate_initialize( @@ -1953,38 +1773,31 @@ const iree_hal_amdgpu_physical_topology_edge_t* physical_edge, iree_hal_amdgpu_topology_edge_aggregate_t* aggregate) { aggregate->physical_capabilities.guaranteed &= - iree_hal_amdgpu_physical_topology_guaranteed_capabilities(physical_edge); + physical_edge->capabilities.guaranteed; aggregate->physical_capabilities.required |= - iree_hal_amdgpu_physical_topology_required_capabilities(physical_edge); + physical_edge->capabilities.required; - aggregate->noncoherent_read_mode = - 
iree_max(aggregate->noncoherent_read_mode, - iree_hal_amdgpu_memory_pool_access_topology_mode( - physical_edge->coarse_access)); + aggregate->noncoherent_read_mode = iree_max( + aggregate->noncoherent_read_mode, physical_edge->modes.noncoherent_read); aggregate->noncoherent_write_mode = iree_max(aggregate->noncoherent_write_mode, - iree_hal_amdgpu_memory_pool_access_topology_mode( - physical_edge->coarse_access)); - aggregate->coherent_read_mode = - iree_max(aggregate->coherent_read_mode, - iree_hal_amdgpu_memory_pool_access_topology_mode( - physical_edge->fine_access)); - aggregate->coherent_write_mode = - iree_max(aggregate->coherent_write_mode, - iree_hal_amdgpu_memory_pool_access_topology_mode( - physical_edge->fine_access)); + physical_edge->modes.noncoherent_write); + aggregate->coherent_read_mode = iree_max(aggregate->coherent_read_mode, + physical_edge->modes.coherent_read); + aggregate->coherent_write_mode = iree_max( + aggregate->coherent_write_mode, physical_edge->modes.coherent_write); - if (physical_edge->link_class > aggregate->link_class) { - aggregate->link_class = physical_edge->link_class; + if (physical_edge->link.link_class > aggregate->link_class) { + aggregate->link_class = physical_edge->link.link_class; } - if (physical_edge->copy_cost > aggregate->copy_cost) { - aggregate->copy_cost = physical_edge->copy_cost; + if (physical_edge->link.copy_cost > aggregate->copy_cost) { + aggregate->copy_cost = physical_edge->link.copy_cost; } - if (physical_edge->latency_class > aggregate->latency_class) { - aggregate->latency_class = physical_edge->latency_class; + if (physical_edge->link.latency_class > aggregate->latency_class) { + aggregate->latency_class = physical_edge->link.latency_class; } - if (physical_edge->numa_distance > aggregate->numa_distance) { - aggregate->numa_distance = physical_edge->numa_distance; + if (physical_edge->link.numa_distance > aggregate->numa_distance) { + aggregate->numa_distance = physical_edge->link.numa_distance; } }
diff --git a/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.c b/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.c
index 1f255e3..e528715 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.c
@@ -112,6 +112,229 @@ return IREE_HAL_TOPOLOGY_CAPABILITY_NONE; } +// Maps an HSA link type to a HAL topology link class. +// +// For multi-hop links, callers should take the worst/highest class. +static iree_hal_topology_link_class_t iree_hal_amdgpu_link_type_to_link_class( + hsa_amd_link_info_type_t link_type) { + switch (link_type) { + case HSA_AMD_LINK_INFO_TYPE_XGMI: + return IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF; + case HSA_AMD_LINK_INFO_TYPE_PCIE: + return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT; + case HSA_AMD_LINK_INFO_TYPE_QPI: + case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT: + // Cross-socket interconnects: treat as cross-root PCIe. + return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT; + case HSA_AMD_LINK_INFO_TYPE_INFINBAND: + return IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC; + default: + return IREE_HAL_TOPOLOGY_LINK_CLASS_OTHER; + } +} + +static iree_hal_amdgpu_physical_topology_link_flags_t +iree_hal_amdgpu_link_type_to_physical_topology_link_flags( + hsa_amd_link_info_type_t link_type) { + switch (link_type) { + case HSA_AMD_LINK_INFO_TYPE_PCIE: + return IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_PCIE; + case HSA_AMD_LINK_INFO_TYPE_XGMI: + return IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_XGMI; + case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT: + return IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_HYPERTRANSPORT; + case HSA_AMD_LINK_INFO_TYPE_QPI: + return IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_QPI; + case HSA_AMD_LINK_INFO_TYPE_INFINBAND: + return IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_INFINIBAND; + default: + return IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_OTHER; + } +} + +static void iree_hal_amdgpu_topology_costs_from_link_class( + iree_hal_topology_link_class_t link_class, uint8_t* out_copy_cost, + uint8_t* out_latency_class) { + switch (link_class) { + case IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE: + *out_copy_cost = 0; + *out_latency_class = 0; + break; + case IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF: + *out_copy_cost = 3; + *out_latency_class = 3; + break; + case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT: + *out_copy_cost = 7; + *out_latency_class = 7; + break; + case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT: + *out_copy_cost = 9; + *out_latency_class = 9; + break; + case IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED: + *out_copy_cost = 13; + *out_latency_class = 11; + break; + case IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC: + *out_copy_cost = 15; + *out_latency_class = 14; + break; + case IREE_HAL_TOPOLOGY_LINK_CLASS_ISOLATED: + *out_copy_cost = 15; + *out_latency_class = 15; + break; + default: + *out_copy_cost = 11; + *out_latency_class = 10; + break; + } +} + +static uint8_t iree_hal_amdgpu_topology_scale_hsa_numa_distance( + uint32_t hsa_numa_distance) { + if (hsa_numa_distance == 0) return 0; + uint32_t scaled = hsa_numa_distance > 10 ? 
(hsa_numa_distance - 10) / 2 : 0; + return (uint8_t)iree_min(scaled, 15u); +} + +static iree_status_t iree_hal_amdgpu_validate_physical_topology_edge_access( + hsa_amd_memory_pool_access_t access, const char* pool_kind) { + if (IREE_LIKELY(iree_hal_amdgpu_memory_pool_access_is_valid(access))) { + return iree_ok_status(); + } + return iree_make_status(IREE_STATUS_OUT_OF_RANGE, + "HSA reported unknown %s memory pool access mode %u", + pool_kind, (uint32_t)access); +} + +static void iree_hal_amdgpu_physical_topology_edge_initialize( + iree_hal_amdgpu_physical_topology_edge_t* out_edge) { + memset(out_edge, 0, sizeof(*out_edge)); + out_edge->memory_access.coarse = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + out_edge->memory_access.fine = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + out_edge->coherency.all_hops_coherent = 1; + out_edge->atomics.all_hops_32bit = 1; + out_edge->atomics.all_hops_64bit = 1; + out_edge->link.link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE; + out_edge->modes.noncoherent_read = IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY; + out_edge->modes.noncoherent_write = IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY; + out_edge->modes.coherent_read = IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY; + out_edge->modes.coherent_write = IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY; +} + +static iree_hal_topology_capability_t +iree_hal_amdgpu_physical_topology_guaranteed_capabilities( + const iree_hal_amdgpu_physical_topology_edge_t* edge) { + iree_hal_topology_capability_t capabilities = + IREE_HAL_TOPOLOGY_CAPABILITY_NONE; + if (!edge->memory_access.coarse_accessible && + !edge->memory_access.fine_accessible) { + return capabilities; + } + capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY; + if (edge->coherency.all_hops_coherent) { + capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT; + } + if (edge->atomics.all_hops_32bit) { + capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE; + } + if (edge->atomics.all_hops_64bit) { + capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM; + } + return capabilities; +} + +static iree_hal_topology_capability_t +iree_hal_amdgpu_physical_topology_required_capabilities( + const iree_hal_amdgpu_physical_topology_edge_t* edge) { + iree_hal_topology_capability_t capabilities = + IREE_HAL_TOPOLOGY_CAPABILITY_NONE; + capabilities |= iree_hal_amdgpu_memory_pool_access_topology_capabilities( + edge->memory_access.coarse); + capabilities |= iree_hal_amdgpu_memory_pool_access_topology_capabilities( + edge->memory_access.fine); + return capabilities; +} + +iree_status_t iree_hal_amdgpu_select_physical_topology_edge( + const iree_hal_amdgpu_physical_topology_edge_selection_t* selection, + iree_hal_amdgpu_physical_topology_edge_t* out_edge) { + IREE_ASSERT_ARGUMENT(selection); + IREE_ASSERT_ARGUMENT(out_edge); + iree_hal_amdgpu_physical_topology_edge_initialize(out_edge); + + IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_physical_topology_edge_access( + selection->memory_access.coarse, "coarse")); + IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_physical_topology_edge_access( + selection->memory_access.fine, "fine")); + if (IREE_UNLIKELY(selection->link.count && !selection->link.hops)) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "AMDGPU physical topology edge selection requires link hops when " + "link count is nonzero"); + } + + out_edge->memory_access.coarse = selection->memory_access.coarse; + out_edge->memory_access.fine = selection->memory_access.fine; + out_edge->memory_access.coarse_accessible = + selection->memory_access.coarse != + 
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + out_edge->memory_access.fine_accessible = + selection->memory_access.fine != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + + for (iree_host_size_t i = 0; i < selection->link.count; ++i) { + const hsa_amd_memory_pool_link_info_t* link_hop = &selection->link.hops[i]; + iree_hal_topology_link_class_t link_class = + iree_hal_amdgpu_link_type_to_link_class(link_hop->link_type); + if (link_class > out_edge->link.link_class) { + out_edge->link.link_class = link_class; + } + out_edge->link.flags |= + iree_hal_amdgpu_link_type_to_physical_topology_link_flags( + link_hop->link_type); + uint8_t numa_distance = iree_hal_amdgpu_topology_scale_hsa_numa_distance( + link_hop->numa_distance); + if (numa_distance > out_edge->link.numa_distance) { + out_edge->link.numa_distance = numa_distance; + } + if (!link_hop->coherent_support) { + out_edge->coherency.all_hops_coherent = 0; + } + if (!link_hop->atomic_support_32bit) { + out_edge->atomics.all_hops_32bit = 0; + } + if (!link_hop->atomic_support_64bit) { + out_edge->atomics.all_hops_64bit = 0; + } + } + + if (!out_edge->memory_access.coarse_accessible && + !out_edge->memory_access.fine_accessible) { + out_edge->link.link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED; + out_edge->coherency.all_hops_coherent = 0; + out_edge->atomics.all_hops_32bit = 0; + out_edge->atomics.all_hops_64bit = 0; + } + + iree_hal_amdgpu_topology_costs_from_link_class(out_edge->link.link_class, + &out_edge->link.copy_cost, + &out_edge->link.latency_class); + out_edge->capabilities.guaranteed = + iree_hal_amdgpu_physical_topology_guaranteed_capabilities(out_edge); + out_edge->capabilities.required = + iree_hal_amdgpu_physical_topology_required_capabilities(out_edge); + out_edge->modes.noncoherent_read = + iree_hal_amdgpu_memory_pool_access_topology_mode( + out_edge->memory_access.coarse); + out_edge->modes.noncoherent_write = out_edge->modes.noncoherent_read; + out_edge->modes.coherent_read = + iree_hal_amdgpu_memory_pool_access_topology_mode( + out_edge->memory_access.fine); + out_edge->modes.coherent_write = out_edge->modes.coherent_read; + return iree_ok_status(); +} + static bool iree_hal_amdgpu_gfxip_is_pre_gfx908( iree_hal_amdgpu_gfxip_version_t version) { return version.major < 9 ||
diff --git a/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.h b/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.h
index 5a15452..40951bf 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities.h
@@ -101,6 +101,112 @@ iree_hal_amdgpu_memory_pool_access_topology_capabilities( hsa_amd_memory_pool_access_t access); +typedef enum iree_hal_amdgpu_physical_topology_link_flag_bits_e { + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_NONE = 0u, + // At least one HSA-reported hop uses PCIe. + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_PCIE = 1u << 0, + // At least one HSA-reported hop uses xGMI. + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_XGMI = 1u << 1, + // At least one HSA-reported hop uses HyperTransport. + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_HYPERTRANSPORT = 1u << 2, + // At least one HSA-reported hop uses QPI. + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_QPI = 1u << 3, + // At least one HSA-reported hop uses InfiniBand. + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_INFINIBAND = 1u << 4, + // At least one HSA-reported hop uses an unknown link type. + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_OTHER = 1u << 5, +} iree_hal_amdgpu_physical_topology_link_flag_bits_t; + +typedef uint32_t iree_hal_amdgpu_physical_topology_link_flags_t; + +// Physical source->destination topology edge selected from already-queried HSA +// memory-pool access and link-hop facts. +typedef struct iree_hal_amdgpu_physical_topology_edge_t { + // Source-agent access to the destination memory pools. + struct { + // Source-agent access to the destination coarse-grained memory pool. + hsa_amd_memory_pool_access_t coarse; + // Source-agent access to the destination fine-grained memory pool. + hsa_amd_memory_pool_access_t fine; + // True when |coarse| permits some direct device access. + uint32_t coarse_accessible : 1; + // True when |fine| permits some direct device access. + uint32_t fine_accessible : 1; + } memory_access; + + // HSA link-hop facts collapsed into strategy-friendly topology values. + struct { + // Worst physical link class across HSA-reported link hops. + iree_hal_topology_link_class_t link_class; + // Conservative copy-cost class derived from |link_class|. + uint8_t copy_cost; + // Conservative latency class derived from |link_class|. + uint8_t latency_class; + // Worst normalized NUMA distance reported by HSA link hops. + uint8_t numa_distance; + // Link flags from iree_hal_amdgpu_physical_topology_link_flag_bits_t. + iree_hal_amdgpu_physical_topology_link_flags_t flags; + } link; + + // Link coherency facts. + struct { + // True when every HSA-reported link hop supports coherent transactions. + uint32_t all_hops_coherent : 1; + } coherency; + + // Link atomic-transaction facts. + struct { + // True when every HSA-reported link hop supports 32-bit atomics. + uint32_t all_hops_32bit : 1; + // True when every HSA-reported link hop supports 64-bit atomics. + uint32_t all_hops_64bit : 1; + } atomics; + + // Generic HAL topology capabilities implied by the physical edge. + struct { + // Positive capabilities guaranteed by this physical pair. + iree_hal_topology_capability_t guaranteed; + // Requirement bits imposed by this physical pair. + iree_hal_topology_capability_t required; + } capabilities; + + // Generic HAL buffer interop modes implied by memory-pool access. + struct { + // Noncoherent read mode derived from coarse-grained pool access. + iree_hal_topology_interop_mode_t noncoherent_read; + // Noncoherent write mode derived from coarse-grained pool access. + iree_hal_topology_interop_mode_t noncoherent_write; + // Coherent read mode derived from fine-grained pool access. + iree_hal_topology_interop_mode_t coherent_read; + // Coherent write mode derived from fine-grained pool access. 
+ iree_hal_topology_interop_mode_t coherent_write; + } modes; +} iree_hal_amdgpu_physical_topology_edge_t; + +// Already-queried HSA facts used to select a physical topology edge. +typedef struct iree_hal_amdgpu_physical_topology_edge_selection_t { + // Source-agent access to the destination memory pools. + struct { + // Source-agent access to the destination coarse-grained memory pool. + hsa_amd_memory_pool_access_t coarse; + // Source-agent access to the destination fine-grained memory pool. + hsa_amd_memory_pool_access_t fine; + } memory_access; + + // HSA link-hop facts for the source->destination memory path. + struct { + // HSA-reported link-hop records. + const hsa_amd_memory_pool_link_info_t* hops; + // Number of entries in |hops|. + iree_host_size_t count; + } link; +} iree_hal_amdgpu_physical_topology_edge_selection_t; + +// Selects a physical topology edge from already-queried HSA facts. +iree_status_t iree_hal_amdgpu_select_physical_topology_edge( + const iree_hal_amdgpu_physical_topology_edge_selection_t* selection, + iree_hal_amdgpu_physical_topology_edge_t* out_edge); + // Returns true if the gfx IP family permits HDP kernarg publication. bool iree_hal_amdgpu_gfxip_allows_hdp_kernarg_publication( iree_hal_amdgpu_gfxip_version_t version);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities_test.cc b/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities_test.cc
index 8c3bce9..9d55ab7 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/physical_device_capabilities_test.cc
@@ -44,6 +44,16 @@ return version; } +static hsa_amd_memory_pool_link_info_t LinkInfo( + hsa_amd_link_info_type_t link_type) { + hsa_amd_memory_pool_link_info_t link_info = {}; + link_info.link_type = link_type; + link_info.atomic_support_32bit = true; + link_info.atomic_support_64bit = true; + link_info.coherent_support = true; + return link_info; +} + class PhysicalDeviceCapabilitiesTest : public ::testing::Test { protected: iree_hal_amdgpu_cpu_visible_device_coarse_memory_selection_t @@ -73,6 +83,19 @@ return selection; } + iree_hal_amdgpu_physical_topology_edge_selection_t MakeTopologyEdgeSelection( + const hsa_amd_memory_pool_link_info_t* link_hops, + iree_host_size_t link_hop_count) { + iree_hal_amdgpu_physical_topology_edge_selection_t selection = {}; + selection.memory_access.coarse = + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT; + selection.memory_access.fine = + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT; + selection.link.hops = link_hops; + selection.link.count = link_hop_count; + return selection; + } + std::array<hsa_agent_t, 2> cpu_agents_ = {Agent(1), Agent(2)}; std::array<hsa_amd_memory_pool_access_t, 2> cpu_access_ = { HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT, @@ -225,6 +248,174 @@ (hsa_amd_memory_pool_access_t)99)); } +TEST_F(PhysicalDeviceCapabilitiesTest, SelectsXgmiPhysicalTopologyEdge) { + std::array<hsa_amd_memory_pool_link_info_t, 1> link_hops = { + LinkInfo(HSA_AMD_LINK_INFO_TYPE_XGMI)}; + link_hops[0].numa_distance = 16; + + iree_hal_amdgpu_physical_topology_edge_selection_t selection = + MakeTopologyEdgeSelection(link_hops.data(), link_hops.size()); + iree_hal_amdgpu_physical_topology_edge_t edge; + IREE_ASSERT_OK( + iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge)); + + EXPECT_EQ(edge.memory_access.coarse, + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT); + EXPECT_EQ(edge.memory_access.fine, + HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT); + EXPECT_TRUE(edge.memory_access.coarse_accessible); + EXPECT_TRUE(edge.memory_access.fine_accessible); + EXPECT_TRUE(edge.coherency.all_hops_coherent); + EXPECT_TRUE(edge.atomics.all_hops_32bit); + EXPECT_TRUE(edge.atomics.all_hops_64bit); + EXPECT_TRUE(iree_any_bit_set( + edge.link.flags, IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_XGMI)); + EXPECT_EQ(edge.link.link_class, IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF); + EXPECT_EQ(edge.link.copy_cost, 3); + EXPECT_EQ(edge.link.latency_class, 3); + EXPECT_EQ(edge.link.numa_distance, 3); + EXPECT_TRUE( + iree_all_bits_set(edge.capabilities.guaranteed, + IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY | + IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT | + IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE | + IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM)); + EXPECT_EQ(edge.capabilities.required, IREE_HAL_TOPOLOGY_CAPABILITY_NONE); + EXPECT_EQ(edge.modes.noncoherent_read, IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE); + EXPECT_EQ(edge.modes.coherent_read, IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE); +} + +TEST_F(PhysicalDeviceCapabilitiesTest, + SelectsWorstMultiHopPhysicalTopologyEdge) { + std::array<hsa_amd_memory_pool_link_info_t, 2> link_hops = { + LinkInfo(HSA_AMD_LINK_INFO_TYPE_XGMI), + LinkInfo(HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT)}; + link_hops[0].numa_distance = 12; + link_hops[1].numa_distance = 28; + link_hops[1].atomic_support_32bit = false; + link_hops[1].coherent_support = false; + + iree_hal_amdgpu_physical_topology_edge_selection_t selection = + MakeTopologyEdgeSelection(link_hops.data(), link_hops.size()); + iree_hal_amdgpu_physical_topology_edge_t edge; + IREE_ASSERT_OK( + 
iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge)); + + EXPECT_FALSE(edge.coherency.all_hops_coherent); + EXPECT_FALSE(edge.atomics.all_hops_32bit); + EXPECT_TRUE(edge.atomics.all_hops_64bit); + EXPECT_TRUE(iree_all_bits_set( + edge.link.flags, + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_XGMI | + IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_HYPERTRANSPORT)); + EXPECT_EQ(edge.link.link_class, IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT); + EXPECT_EQ(edge.link.copy_cost, 9); + EXPECT_EQ(edge.link.latency_class, 9); + EXPECT_EQ(edge.link.numa_distance, 9); +} + +TEST_F(PhysicalDeviceCapabilitiesTest, + SelectsPciePhysicalTopologyEdgeWithoutSystemAtomics) { + std::array<hsa_amd_memory_pool_link_info_t, 1> link_hops = { + LinkInfo(HSA_AMD_LINK_INFO_TYPE_PCIE)}; + link_hops[0].atomic_support_64bit = false; + link_hops[0].coherent_support = false; + + iree_hal_amdgpu_physical_topology_edge_selection_t selection = + MakeTopologyEdgeSelection(link_hops.data(), link_hops.size()); + selection.memory_access.fine = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + iree_hal_amdgpu_physical_topology_edge_t edge; + IREE_ASSERT_OK( + iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge)); + + EXPECT_TRUE(edge.memory_access.coarse_accessible); + EXPECT_FALSE(edge.memory_access.fine_accessible); + EXPECT_FALSE(edge.coherency.all_hops_coherent); + EXPECT_TRUE(edge.atomics.all_hops_32bit); + EXPECT_FALSE(edge.atomics.all_hops_64bit); + EXPECT_TRUE(iree_any_bit_set( + edge.link.flags, IREE_HAL_AMDGPU_PHYSICAL_TOPOLOGY_LINK_FLAG_PCIE)); + EXPECT_EQ(edge.link.link_class, IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT); + EXPECT_EQ(edge.link.copy_cost, 7); + EXPECT_EQ(edge.link.latency_class, 7); + EXPECT_TRUE(iree_any_bit_set(edge.capabilities.guaranteed, + IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY)); + EXPECT_FALSE(iree_any_bit_set(edge.capabilities.guaranteed, + IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT)); + EXPECT_TRUE(iree_any_bit_set(edge.capabilities.guaranteed, + IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE)); + EXPECT_FALSE(iree_any_bit_set(edge.capabilities.guaranteed, + IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM)); + EXPECT_EQ(edge.modes.noncoherent_read, IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE); + EXPECT_EQ(edge.modes.coherent_read, IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY); +} + +TEST_F(PhysicalDeviceCapabilitiesTest, + GrantablePhysicalTopologyEdgeRequiresGrant) { + std::array<hsa_amd_memory_pool_link_info_t, 1> link_hops = { + LinkInfo(HSA_AMD_LINK_INFO_TYPE_PCIE)}; + iree_hal_amdgpu_physical_topology_edge_selection_t selection = + MakeTopologyEdgeSelection(link_hops.data(), link_hops.size()); + selection.memory_access.coarse = + HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; + selection.memory_access.fine = + HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; + + iree_hal_amdgpu_physical_topology_edge_t edge; + IREE_ASSERT_OK( + iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge)); + + EXPECT_TRUE(edge.memory_access.coarse_accessible); + EXPECT_TRUE(edge.memory_access.fine_accessible); + EXPECT_TRUE(iree_any_bit_set(edge.capabilities.guaranteed, + IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY)); + EXPECT_TRUE(iree_any_bit_set( + edge.capabilities.required, + IREE_HAL_TOPOLOGY_CAPABILITY_PEER_ACCESS_REQUIRES_GRANT)); + EXPECT_EQ(edge.modes.noncoherent_read, IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY); + EXPECT_EQ(edge.modes.coherent_read, IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY); +} + +TEST_F(PhysicalDeviceCapabilitiesTest, + NeverAllowedPhysicalTopologyEdgeIsHostStaged) { + 
std::array<hsa_amd_memory_pool_link_info_t, 1> link_hops = { + LinkInfo(HSA_AMD_LINK_INFO_TYPE_XGMI)}; + iree_hal_amdgpu_physical_topology_edge_selection_t selection = + MakeTopologyEdgeSelection(link_hops.data(), link_hops.size()); + selection.memory_access.coarse = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + selection.memory_access.fine = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED; + + iree_hal_amdgpu_physical_topology_edge_t edge; + IREE_ASSERT_OK( + iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge)); + + EXPECT_FALSE(edge.memory_access.coarse_accessible); + EXPECT_FALSE(edge.memory_access.fine_accessible); + EXPECT_FALSE(edge.coherency.all_hops_coherent); + EXPECT_FALSE(edge.atomics.all_hops_32bit); + EXPECT_FALSE(edge.atomics.all_hops_64bit); + EXPECT_EQ(edge.link.link_class, IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED); + EXPECT_EQ(edge.link.copy_cost, 13); + EXPECT_EQ(edge.link.latency_class, 11); + EXPECT_EQ(edge.capabilities.guaranteed, IREE_HAL_TOPOLOGY_CAPABILITY_NONE); +} + +TEST_F(PhysicalDeviceCapabilitiesTest, + InvalidPhysicalTopologyEdgeInputsFailLoud) { + iree_hal_amdgpu_physical_topology_edge_selection_t selection = + MakeTopologyEdgeSelection(nullptr, 1); + iree_hal_amdgpu_physical_topology_edge_t edge; + IREE_EXPECT_STATUS_IS( + IREE_STATUS_INVALID_ARGUMENT, + iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge)); + + selection = MakeTopologyEdgeSelection(nullptr, 0); + selection.memory_access.coarse = (hsa_amd_memory_pool_access_t)99; + IREE_EXPECT_STATUS_IS( + IREE_STATUS_OUT_OF_RANGE, + iree_hal_amdgpu_select_physical_topology_edge(&selection, &edge)); +} + TEST_F(PhysicalDeviceCapabilitiesTest, CpuAccessInputsAreRequiredWhenNeeded) { iree_hal_amdgpu_cpu_visible_device_coarse_memory_selection_t selection = MakeCoarseMemorySelection();