| // Copyright 2023 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/base/internal/math.h" |
| #include "iree/task/topology.h" |
| |
| #if defined(IREE_PLATFORM_APPLE) |
| |
| #include <sys/sysctl.h> |
| #include <sys/types.h> |
| |
| //===----------------------------------------------------------------------===// |
| // Platform utilities |
| //===----------------------------------------------------------------------===// |
| |
| // Apple really doesn't want to let applications control things and hides nearly |
| // all query and control of threads on the system. The best we can do here is |
| // estimate counts and cache information but it'll never be correct and barely |
| // do what the user intends. Such is life in Apple land. Think Different(tm). |
| // Unfortunately this lack of APIs means we can't do much of anything besides |
| // request a QoS level and hope the system puts our workers in the right place. |
| // This makes reliable benchmarking near impossible and results in users having |
| // wildly different performance based on the whims of their current scheduler. |
| // |
| // Meager documentation: |
| // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_system_capabilities |
| |
// Queries the sysctl entry |name| as a 32-bit integer and stores it in
// |out_value|. |out_value| is reset to 0 before the query is issued.
// Returns true when sysctlbyname reports success.
static bool iree_task_sysctlbyname_int32(const char* name, int32_t* out_value) {
  size_t value_size = sizeof(int32_t);
  *out_value = 0;
  const int rc = sysctlbyname(name, out_value, &value_size, NULL, 0);
  return rc == 0;
}
| |
// Queries the sysctl entry |name| as a 64-bit integer and stores it in
// |out_value|. |out_value| is reset to 0 before the query is issued.
// Returns true when sysctlbyname reports success.
static bool iree_task_sysctlbyname_int64(const char* name, int64_t* out_value) {
  size_t value_size = sizeof(int64_t);
  *out_value = 0;
  if (sysctlbyname(name, out_value, &value_size, NULL, 0) != 0) {
    return false;
  }
  return true;
}
| |
// Queries the 32-bit integer sysctl entry "hw.perflevel<level>.<key>" and
// stores it in |out_value|.
// Returns false (with |out_value| set to 0) when the composed name does not
// fit in the local buffer or when the underlying sysctl query fails.
static bool iree_task_sysctlbyname_perflevel_int32(int level, const char* key,
                                                   int32_t* out_value) {
  char name[64];
  // Bounded formatting: a long |key| must never write past the stack buffer.
  const int length =
      snprintf(name, sizeof(name), "hw.perflevel%d.%s", level, key);
  if (length < 0 || (size_t)length >= sizeof(name)) {
    *out_value = 0;  // formatting error or truncated name; nothing to query
    return false;
  }
  return iree_task_sysctlbyname_int32(name, out_value);
}
| |
| //===----------------------------------------------------------------------===// |
| // NUMA queries |
| //===----------------------------------------------------------------------===// |
| |
| void iree_task_topology_query_default_caches( |
| iree_task_topology_caches_t* out_caches) { |
| memset(out_caches, 0, sizeof(*out_caches)); |
| // Apple provides system-wide cache size queries that don't require thread |
| // pinning or processor identification. |
| int32_t value = 0; |
| if (iree_task_sysctlbyname_int32("hw.l1dcachesize", &value) && value > 0) { |
| out_caches->l1_data = (uint32_t)value; |
| } |
| if (iree_task_sysctlbyname_int32("hw.l2cachesize", &value) && value > 0) { |
| out_caches->l2_data = (uint32_t)value; |
| } |
| if (iree_task_sysctlbyname_int32("hw.l3cachesize", &value) && value > 0) { |
| out_caches->l3_data = (uint32_t)value; |
| } |
| } |
| |
| iree_host_size_t iree_task_topology_query_node_count(void) { |
| int32_t packages = 1; |
| #if !defined(IREE_PLATFORM_IOS) |
| if (!iree_task_sysctlbyname_int32("hw.packages", &packages) || |
| packages == 0) { |
| packages = 1; // failed to fetch or invalid value |
| } |
| #endif // !IREE_PLATFORM_IOS |
| return packages; |
| } |
| |
| iree_task_topology_node_id_t iree_task_topology_query_current_node(void) { |
| // AFAICT there's no way to query the system for this information. |
| // AFAICT there's also no dual-package systems? Maybe the M2 Ultra? |
| return (iree_task_topology_node_id_t)0; |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Topology initialization helpers |
| //===----------------------------------------------------------------------===// |
| |
| iree_status_t iree_task_topology_fixup_constructive_sharing_masks( |
| iree_task_topology_t* topology) { |
| return iree_make_status(IREE_STATUS_UNIMPLEMENTED, |
| "assignment to individual CPUs is not available on " |
| "Apple platforms due to a lack of APIs"); |
| } |
| |
| iree_status_t iree_task_topology_initialize_from_logical_cpu_set( |
| iree_host_size_t cpu_count, const uint32_t* cpu_ids, |
| iree_task_topology_t* out_topology) { |
| return iree_make_status(IREE_STATUS_UNIMPLEMENTED, |
| "assignment to individual CPUs is not available on " |
| "Apple platforms due to a lack of APIs"); |
| } |
| |
// CPU and cache information for one hardware performance level, populated
// from sysctl entries. A field is 0 when its query failed or was unavailable.
typedef struct iree_task_hw_perflevel_t {
  // Value of the `physicalcpu_max` key.
  int32_t physicalcpu_max;
  // Value of the `logicalcpu_max` key.
  int32_t logicalcpu_max;
  // Value of the `l1dcachesize` key.
  int32_t l1dcachesize;
  // Value of the `l2cachesize` key.
  int32_t l2cachesize;
  // Value of the `l3cachesize` key.
  int32_t l3cachesize;
  // CPUs per L2 cache (`cpusperl2` key or the matching hw.cacheconfig entry).
  int32_t cpusperl2;
  // CPUs per L3 cache (`cpusperl3` key or the matching hw.cacheconfig entry).
  int32_t cpusperl3;
} iree_task_hw_perflevel_t;

// Maximum number of performance levels that are queried and tracked.
#define IREE_TASK_MAX_HW_PERF_LEVELS 2
| |
// Queries CPU and cache information from the system-wide (non-perflevel) hw.*
// sysctl keys. Used when the system reports no performance levels.
// Any field whose query fails is left as 0.
static iree_task_hw_perflevel_t iree_task_query_hw_perflevel_default(void) {
  iree_task_hw_perflevel_t perflevel = {0};

  // Best-effort: the helper zeroes the output on failure so results can be
  // ignored here.
  iree_task_sysctlbyname_int32("hw.physicalcpu_max",
                               &perflevel.physicalcpu_max);
  iree_task_sysctlbyname_int32("hw.logicalcpu_max", &perflevel.logicalcpu_max);
  iree_task_sysctlbyname_int32("hw.l1dcachesize", &perflevel.l1dcachesize);
  iree_task_sysctlbyname_int32("hw.l2cachesize", &perflevel.l2cachesize);
  iree_task_sysctlbyname_int32("hw.l3cachesize", &perflevel.l3cachesize);

  // cpusperX, with [main memory, l1d, l2, l3, ...]
  // First ask for the required buffer size; bail out (leaving cpusperl2/l3 as
  // 0) if the key is unavailable or holds no complete entry.
  size_t sizeof_cacheconfig = 0;
  if (sysctlbyname("hw.cacheconfig", NULL, &sizeof_cacheconfig, NULL, 0) != 0 ||
      sizeof_cacheconfig < sizeof(int64_t)) {
    return perflevel;
  }
  int64_t* cacheconfig = (int64_t*)iree_alloca(sizeof_cacheconfig);
  // If the fetch fails the buffer contents are indeterminate and must not be
  // read.
  if (sysctlbyname("hw.cacheconfig", cacheconfig, &sizeof_cacheconfig, NULL,
                   0) != 0) {
    return perflevel;
  }
  // |sizeof_cacheconfig| now holds the number of bytes actually written.
  const size_t ncacheconfig = sizeof_cacheconfig / sizeof(cacheconfig[0]);
  perflevel.cpusperl2 = ncacheconfig >= 3 ? (int32_t)cacheconfig[2] : 0;
  perflevel.cpusperl3 = ncacheconfig >= 4 ? (int32_t)cacheconfig[3] : 0;

  return perflevel;
}
| |
| static iree_task_hw_perflevel_t iree_task_query_hw_perflevel(int level) { |
| iree_task_hw_perflevel_t perflevel = {0}; |
| iree_task_sysctlbyname_perflevel_int32(level, "physicalcpu_max", |
| &perflevel.physicalcpu_max); |
| iree_task_sysctlbyname_perflevel_int32(level, "logicalcpu_max", |
| &perflevel.logicalcpu_max); |
| iree_task_sysctlbyname_perflevel_int32(level, "l1dcachesize", |
| &perflevel.l1dcachesize); |
| iree_task_sysctlbyname_perflevel_int32(level, "l2cachesize", |
| &perflevel.l2cachesize); |
| iree_task_sysctlbyname_perflevel_int32(level, "l3cachesize", |
| &perflevel.l3cachesize); |
| iree_task_sysctlbyname_perflevel_int32(level, "cpusperl2", |
| &perflevel.cpusperl2); |
| iree_task_sysctlbyname_perflevel_int32(level, "cpusperl3", |
| &perflevel.cpusperl3); |
| return perflevel; |
| } |
| |
| iree_status_t iree_task_topology_initialize_from_physical_cores( |
| iree_task_topology_node_id_t node_id, |
| iree_task_topology_performance_level_t performance_level, |
| iree_task_topology_distribution_t distribution, |
| iree_host_size_t max_core_count, iree_task_topology_t* out_topology) { |
| // NOTE: darwin implementation doesn't currently support cache-domain-aware |
| // distribution strategies. The distribution parameter is accepted for API |
| // compatibility but ignored. |
| (void)distribution; |
| |
| IREE_TRACE_ZONE_BEGIN(z0); |
| IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)node_id); |
| |
| iree_task_topology_initialize(out_topology); |
| out_topology->node_id = node_id; |
| |
| // Total number of physical cores in the system of all types. |
| int32_t total_physicalcpu_max = 0; |
| if (!iree_task_sysctlbyname_int32("hw.physicalcpu_max", |
| &total_physicalcpu_max) || |
| total_physicalcpu_max == 0) { |
| total_physicalcpu_max = 1; // failed to fetch or invalid value |
| } |
| |
| // Query CPU info per performance level type. |
| // NOTE: older systems will report nperflevels=0 and we instead use the |
| // default non-perflevel keys. |
| // NOTE: when present perflevels[0] is performance. |
| int32_t nperflevels = 0; |
| iree_task_sysctlbyname_int32("hw.nperflevels", &nperflevels); |
| nperflevels = iree_min(nperflevels, IREE_TASK_MAX_HW_PERF_LEVELS); |
| iree_task_hw_perflevel_t perflevels[IREE_TASK_MAX_HW_PERF_LEVELS]; |
| if (nperflevels > 0) { |
| // System has multiple perflevels (AMP / asymmetric multiprocessing). |
| for (int32_t i = 0; i < nperflevels; ++i) { |
| perflevels[i] = iree_task_query_hw_perflevel(i); |
| } |
| } else { |
| // Only one perflevel (homogeneous cores). |
| nperflevels = 1; |
| perflevels[0] = iree_task_query_hw_perflevel_default(); |
| } |
| int32_t physicalcpu_max = total_physicalcpu_max; |
| if (nperflevels > 0) { |
| switch (performance_level) { |
| default: |
| case IREE_TASK_TOPOLOGY_PERFORMANCE_LEVEL_ANY: |
| physicalcpu_max = total_physicalcpu_max; |
| break; |
| case IREE_TASK_TOPOLOGY_PERFORMANCE_LEVEL_LOW: |
| physicalcpu_max = perflevels[/*efficiency=*/1].physicalcpu_max; |
| break; |
| case IREE_TASK_TOPOLOGY_PERFORMANCE_LEVEL_HIGH: |
| physicalcpu_max = perflevels[/*performance=*/0].physicalcpu_max; |
| break; |
| } |
| } |
| |
| iree_host_size_t core_count = iree_min(physicalcpu_max, max_core_count); |
| iree_task_topology_initialize_from_group_count(core_count, out_topology); |
| for (iree_host_size_t i = 0; i < out_topology->group_count; ++i) { |
| iree_task_topology_group_t* group = &out_topology->groups[i]; |
| group->processor_index = i; |
| |
| // Assign attributes based on the perflevel of the group; we can't pin cores |
| // on Apple platforms so instead we just treat the first N groups as |
| // perflevel[0], the next as perflevel[1], etc. |
| int perflevel = 0; |
| if (nperflevels > 1) { |
| perflevel = i < perflevels[0].physicalcpu_max ? 0 : 1; |
| } |
| group->caches.l1_data = perflevels[perflevel].l1dcachesize; |
| group->caches.l2_data = perflevels[perflevel].l2cachesize; |
| group->caches.l3_data = perflevels[perflevel].l3cachesize; |
| |
| // We make stuff up as Apple doesn't want us to have nice things. |
| // See iree_thread_affinity_t for more information about how we use the |
| // affinity info. Note that we pack "use efficiency cores only" into the SMT |
| // bit and use that to force a QoS level that ensures only efficiency cores |
| // are used when present. Probably. |
| group->ideal_thread_affinity.group = (uint32_t)node_id; |
| group->ideal_thread_affinity.id_assigned = 1; |
| group->ideal_thread_affinity.id = i; |
| switch (performance_level) { |
| default: |
| case IREE_TASK_TOPOLOGY_PERFORMANCE_LEVEL_ANY: |
| // If heterogeneous then put the first N groups anywhere and the rest on |
| // efficiency cores. |
| group->ideal_thread_affinity.smt = perflevel > 0 ? 1 : 0; |
| break; |
| case IREE_TASK_TOPOLOGY_PERFORMANCE_LEVEL_HIGH: |
| // Try to avoid efficiency cores (but no way to do that). |
| group->ideal_thread_affinity.smt = 0; |
| break; |
| case IREE_TASK_TOPOLOGY_PERFORMANCE_LEVEL_LOW: |
| // Force onto efficiency cores. |
| group->ideal_thread_affinity.smt = 1; |
| break; |
| } |
| |
| // We don't set any sharing mask here as we have no idea where the groups |
| // we be placed by the magical mystical completely unpredictable XNU |
| // scheduler. Cool. |
| // We could use cpusperl2/l3 to at least know how many groups may share a |
| // particular cache but without control it's useless info. |
| // group->constructive_sharing_mask = ...; |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| return iree_ok_status(); |
| } |
| |
| #endif // IREE_PLATFORM_APPLE |