Adding `--task_topology_cpu_ids=` flag. (#14969)
This allows explicitly specifying one or more sets of logical CPU IDs
that should be used for task system topologies. The IDs are in the range
[0, total_cpu_count) to match lscpu/lstopo on Linux and map to a
flattened view of all processor groups on Windows (so with group 0
having 4 logical CPUs an ID of 6 indicates group 1 CPU 2). When the flag
is specified all other topology flags are ignored, as the NUMA node is
implicit in the absolute logical CPU IDs.
For programmatic control (Python/etc.) the
`iree_task_topology_initialize_from_logical_cpu_set_string` helper is
exposed. A hosting application that already knows precisely which thread
affinities it wants can use the new
`iree_task_topology_initialize_from_thread_affinities` to specify them
directly without any interpretation while still getting proper
constructive sharing masks where the platform supports them.
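As a rough illustration (not part of this change), a host that has already
computed its own affinities could call the new helper roughly as below; the
affinity values are made up and only the `iree_thread_affinity_t` fields used
elsewhere in this patch (`specified`, `group`, `id`) are set:

```c
#include <string.h>

#include "iree/base/api.h"
#include "iree/task/topology.h"

// Builds a two-group topology pinned to affinities the host chose itself.
static iree_status_t make_topology_from_known_affinities(
    iree_task_topology_t* out_topology) {
  iree_thread_affinity_t affinities[2];
  memset(affinities, 0, sizeof(affinities));
  affinities[0].specified = 1;
  affinities[0].group = 0;  // processor group (meaningful on Windows)
  affinities[0].id = 0;     // logical processor within the group
  affinities[1].specified = 1;
  affinities[1].group = 0;
  affinities[1].id = 2;
  // Constructive sharing masks are still derived where the platform allows.
  return iree_task_topology_initialize_from_thread_affinities(
      IREE_ARRAYSIZE(affinities), affinities, out_topology);
}
```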
Examples:
`--task_topology_cpu_ids=0`: one topology with logical CPU 0 only
`--task_topology_cpu_ids=0,1`: one topology with logical CPUs 0 and 1
`--task_topology_cpu_ids=0,1 --task_topology_cpu_ids=32,33`: two
topologies with logical CPUs 0,1 and 32,33 respectively
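A minimal sketch of the string-based helper as the programmatic equivalent of
the flag examples above (`iree_make_cstring_view` is the existing IREE base
string-view constructor):

```c
#include "iree/base/api.h"
#include "iree/task/topology.h"

// Programmatic equivalent of --task_topology_cpu_ids=0,1.
static iree_status_t make_topology_for_cpus_0_and_1(
    iree_task_topology_t* out_topology) {
  return iree_task_topology_initialize_from_logical_cpu_set_string(
      iree_make_cstring_view("0,1"), out_topology);
}
```

The resulting topology would then be handed to `iree_task_executor_create` and
released with `iree_task_topology_deinitialize` once the executor has consumed
it, mirroring the flag-driven path in iree/task/api.c below.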
diff --git a/runtime/src/iree/base/alignment.h b/runtime/src/iree/base/alignment.h
index 54c64dd..bbf3b19 100644
--- a/runtime/src/iree/base/alignment.h
+++ b/runtime/src/iree/base/alignment.h
@@ -130,6 +130,22 @@
#define iree_sizeof_struct(t) iree_host_align(sizeof(t), iree_max_align_t)
// Returns the ceil-divide of |lhs| by non-zero |rhs|.
+static inline iree_host_size_t iree_host_size_ceil_div(iree_host_size_t lhs,
+ iree_host_size_t rhs) {
+ return ((lhs != 0) && (lhs > 0) == (rhs > 0))
+ ? ((lhs + ((rhs > 0) ? -1 : 1)) / rhs) + 1
+ : -(-lhs / rhs);
+}
+
+// Returns the floor-divide of |lhs| by non-zero |rhs|.
+static inline iree_host_size_t iree_host_size_floor_div(iree_host_size_t lhs,
+ iree_host_size_t rhs) {
+ return ((lhs != 0) && ((lhs < 0) != (rhs < 0)))
+ ? -((-lhs + ((rhs < 0) ? 1 : -1)) / rhs) - 1
+ : lhs / rhs;
+}
+
+// Returns the ceil-divide of |lhs| by non-zero |rhs|.
static inline iree_device_size_t iree_device_size_ceil_div(
iree_device_size_t lhs, iree_device_size_t rhs) {
return ((lhs != 0) && (lhs > 0) == (rhs > 0))
diff --git a/runtime/src/iree/task/api.c b/runtime/src/iree/task/api.c
index 086d2ae..d55d5b6 100644
--- a/runtime/src/iree/task/api.c
+++ b/runtime/src/iree/task/api.c
@@ -62,21 +62,17 @@
//===----------------------------------------------------------------------===//
IREE_FLAG(
- string, task_topology_nodes, "current",
- "Comma-separated list of NUMA nodes that topologies will be defined for.\n"
- "Each node specified will be configured based on the other topology\n"
- "flags. 'all' can be used to indicate all available NUMA nodes and\n"
- "'current' will inherit the node of the calling thread.");
-
-IREE_FLAG(
string, task_topology_mode, "physical_cores",
"Available modes:\n"
" --task_topology_group_count=non-zero:\n"
" Uses whatever the specified group count is and ignores the set mode.\n"
" All threads will be unpinned and run on system-determined processors.\n"
+ " --task_topology_cpu_ids=0,1,2 [+ --task_topology_cpu_ids=3,4,5]:\n"
+ " Creates one executor per set of logical CPU IDs.\n"
" 'physical_cores':\n"
- " Creates one group per physical core in each NUMA node up to\n"
- " the value specified by --task_topology_max_group_count=.");
+ " Creates one executor per NUMA node in --task_topology_nodes= and one\n"
+ " group per physical core in each NUMA node up to the value specified\n"
+ " by --task_topology_max_group_count=.");
IREE_FLAG(
int32_t, task_topology_group_count, 0,
@@ -87,6 +83,20 @@
"WARNING: setting this flag directly is not recommended; use\n"
"--task_topology_max_group_count= instead.");
+IREE_FLAG_LIST(
+ string, task_topology_cpu_ids,
+ "A list of absolute logical CPU IDs to use for a single topology. One\n"
+ "topology will be created for each repetition of the flag. CPU IDs match\n"
+ "the Linux logical CPU ID scheme (as used by lscpu/lstopo) or a flattened\n"
+ "[0, total_processor_count) range on Windows.");
+
+IREE_FLAG(
+ string, task_topology_nodes, "current",
+ "Comma-separated list of NUMA nodes that topologies will be defined for.\n"
+ "Each node specified will be configured based on the other topology\n"
+ "flags. 'all' can be used to indicate all available NUMA nodes and\n"
+ "'current' will inherit the node of the calling thread.");
+
IREE_FLAG(
int32_t, task_topology_max_group_count, 8,
"Sets a maximum value on the worker count that can be automatically\n"
@@ -178,72 +188,90 @@
fprintf(file, "# --%.*s\n", (int)flag_name.size, flag_name.data);
}
-static iree_status_t iree_task_flags_dump_task_topologies(
- iree_string_view_t flag_name, void* storage, iree_string_view_t value) {
- // Select which nodes in the machine we will be creating topologies for.
- uint64_t node_mask = 0ull;
- IREE_RETURN_IF_ERROR(
- iree_task_topologies_select_nodes_from_flags(&node_mask));
-
- // TODO(benvanik): macros to make this iteration easier (ala cpu_set
- // iterators).
- iree_host_size_t topology_count = iree_math_count_ones_u64(node_mask);
- uint64_t node_mask_bits = node_mask;
- iree_task_topology_node_id_t node_base_id = 0;
- for (iree_host_size_t i = 0; i < topology_count; ++i) {
- int node_offset =
- iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
- iree_task_topology_node_id_t node_id = node_base_id + node_offset;
- node_base_id += node_offset + 1;
- node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
- iree_task_topology_t topology;
- IREE_RETURN_IF_ERROR(
- iree_task_topology_initialize_from_flags(node_id, &topology));
- fprintf(stdout,
- "# "
- "===-------------------------------------------------------------"
- "-----------===\n");
- fprintf(stdout, "# topology[%" PRIhsz "]: %" PRIhsz " worker groups\n", i,
- topology.group_count);
- fprintf(stdout,
- "# "
- "===-------------------------------------------------------------"
- "-----------===\n");
- fprintf(stdout, "#\n");
- for (iree_host_size_t j = 0; j < topology.group_count; ++j) {
- const iree_task_topology_group_t* group = &topology.groups[j];
- fprintf(stdout, "# group[%d]: '%s'\n", group->group_index, group->name);
- fprintf(stdout, "# processor: %u\n", group->processor_index);
- fprintf(stdout, "# affinity: ");
- if (group->ideal_thread_affinity.specified) {
- fprintf(stdout, "group=%u, id=%u, smt=%u",
- group->ideal_thread_affinity.group,
- group->ideal_thread_affinity.id,
- group->ideal_thread_affinity.smt);
- } else {
- fprintf(stdout, "(unspecified)");
+static void iree_task_flags_dump_task_topology(
+ iree_host_size_t topology_id, const iree_task_topology_t* topology) {
+ fprintf(stdout,
+ "# "
+ "===-------------------------------------------------------------"
+ "-----------===\n");
+ fprintf(stdout, "# topology[%" PRIhsz "]: %" PRIhsz " worker groups\n",
+ topology_id, topology->group_count);
+ fprintf(stdout,
+ "# "
+ "===-------------------------------------------------------------"
+ "-----------===\n");
+ fprintf(stdout, "#\n");
+ for (iree_host_size_t j = 0; j < topology->group_count; ++j) {
+ const iree_task_topology_group_t* group = &topology->groups[j];
+ fprintf(stdout, "# group[%d]: '%s'\n", group->group_index, group->name);
+ fprintf(stdout, "# processor: %u\n", group->processor_index);
+ fprintf(stdout, "# affinity: ");
+ if (group->ideal_thread_affinity.specified) {
+ fprintf(
+ stdout, "group=%u, id=%u, smt=%u", group->ideal_thread_affinity.group,
+ group->ideal_thread_affinity.id, group->ideal_thread_affinity.smt);
+ } else {
+ fprintf(stdout, "(unspecified)");
+ }
+ fprintf(stdout, "\n");
+ fprintf(stdout, "# cache sharing: ");
+ if (group->constructive_sharing_mask == 0) {
+ fprintf(stdout, "(none)\n");
+ } else if (group->constructive_sharing_mask ==
+ IREE_TASK_TOPOLOGY_GROUP_MASK_ALL) {
+ fprintf(stdout, "(all/undefined)\n");
+ } else {
+ fprintf(stdout, "%d group(s): ",
+ iree_math_count_ones_u64(group->constructive_sharing_mask));
+ for (iree_host_size_t ic = 0, jc = 0;
+ ic < IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT; ++ic) {
+ if ((group->constructive_sharing_mask >> ic) & 1) {
+ if (jc > 0) fprintf(stdout, ", ");
+ fprintf(stdout, "%" PRIhsz, ic);
+ ++jc;
+ }
}
fprintf(stdout, "\n");
- fprintf(stdout, "# cache sharing: ");
- if (group->constructive_sharing_mask == 0) {
- fprintf(stdout, "(none)\n");
- } else if (group->constructive_sharing_mask ==
- IREE_TASK_TOPOLOGY_GROUP_MASK_ALL) {
- fprintf(stdout, "(all/undefined)\n");
- } else {
- fprintf(stdout, "%d group(s): ",
- iree_math_count_ones_u64(group->constructive_sharing_mask));
- for (iree_host_size_t ic = 0, jc = 0;
- ic < IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT; ++ic) {
- if ((group->constructive_sharing_mask >> ic) & 1) {
- if (jc > 0) fprintf(stdout, ", ");
- fprintf(stdout, "%" PRIhsz, ic);
- ++jc;
- }
- }
- fprintf(stdout, "\n");
- }
- fprintf(stdout, "#\n");
+ }
+ fprintf(stdout, "#\n");
+ }
+}
+
+static iree_status_t iree_task_flags_dump_task_topologies(
+ iree_string_view_t flag_name, void* storage, iree_string_view_t value) {
+ const iree_flag_string_list_t cpu_ids_list =
+ FLAG_task_topology_cpu_ids_list();
+ if (cpu_ids_list.count == 0) {
+ // Select which nodes in the machine we will be creating topologies for.
+ uint64_t node_mask = 0ull;
+ IREE_RETURN_IF_ERROR(
+ iree_task_topologies_select_nodes_from_flags(&node_mask));
+
+ // TODO(benvanik): macros to make this iteration easier (ala cpu_set
+ // iterators).
+ iree_host_size_t topology_count = iree_math_count_ones_u64(node_mask);
+ uint64_t node_mask_bits = node_mask;
+ iree_task_topology_node_id_t node_base_id = 0;
+ for (iree_host_size_t i = 0; i < topology_count; ++i) {
+ int node_offset =
+ iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
+ iree_task_topology_node_id_t node_id = node_base_id + node_offset;
+ node_base_id += node_offset + 1;
+ node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
+ iree_task_topology_t topology;
+ IREE_RETURN_IF_ERROR(
+ iree_task_topology_initialize_from_flags(node_id, &topology));
+ iree_task_flags_dump_task_topology(i, &topology);
+ iree_task_topology_deinitialize(&topology);
+ }
+ } else {
+ for (iree_host_size_t i = 0; i < cpu_ids_list.count; ++i) {
+ iree_task_topology_t topology;
+ IREE_RETURN_IF_ERROR(
+ iree_task_topology_initialize_from_logical_cpu_set_string(
+ cpu_ids_list.values[i], &topology));
+ iree_task_flags_dump_task_topology(i, &topology);
+ iree_task_topology_deinitialize(&topology);
}
}
@@ -275,11 +303,19 @@
IREE_RETURN_IF_ERROR(
iree_task_executor_options_initialize_from_flags(&options));
- // Select which nodes in the machine we will be creating topologies for.
+ // Select which nodes in the machine we will be creating topologies for based
+ // on the topology mode.
+ iree_host_size_t topology_count = 0;
uint64_t node_mask = 0ull;
- IREE_RETURN_IF_ERROR(
- iree_task_topologies_select_nodes_from_flags(&node_mask));
- const iree_host_size_t topology_count = iree_math_count_ones_u64(node_mask);
+ const iree_flag_string_list_t cpu_ids_list =
+ FLAG_task_topology_cpu_ids_list();
+ if (cpu_ids_list.count == 0) {
+ IREE_RETURN_IF_ERROR(
+ iree_task_topologies_select_nodes_from_flags(&node_mask));
+ topology_count = iree_math_count_ones_u64(node_mask);
+ } else {
+ topology_count = cpu_ids_list.count;
+ }
// Since this utility function creates one executor per topology returned by
// the query we can check the executor capacity immediately.
@@ -309,34 +345,58 @@
}
// Create one executor per topology.
- // TODO(benvanik): macros to make this iteration easier (ala cpu_set
- // iterators).
iree_status_t status = iree_ok_status();
- uint64_t node_mask_bits = node_mask;
- iree_task_topology_node_id_t node_base_id = 0;
- for (iree_host_size_t i = 0; i < topology_count; ++i) {
- int node_offset =
- iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
- iree_task_topology_node_id_t node_id = node_base_id + node_offset;
- node_base_id += node_offset + 1;
- node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
+ if (cpu_ids_list.count == 0) {
+ // TODO(benvanik): macros to make this iteration easier (ala cpu_set
+ // iterators).
+ uint64_t node_mask_bits = node_mask;
+ iree_task_topology_node_id_t node_base_id = 0;
+ for (iree_host_size_t i = 0; i < topology_count; ++i) {
+ int node_offset =
+ iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
+ iree_task_topology_node_id_t node_id = node_base_id + node_offset;
+ node_base_id += node_offset + 1;
+ node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
- // Query topology for the node this executor is pinned to.
- iree_task_topology_t topology;
- status = iree_task_topology_initialize_from_flags(node_id, &topology);
- if (!iree_status_is_ok(status)) break;
+ // Query topology for the node this executor is pinned to.
+ iree_task_topology_t topology;
+ status = iree_task_topology_initialize_from_flags(node_id, &topology);
+ if (!iree_status_is_ok(status)) break;
- // TODO(benvanik): if group count is 0 then don't create the executor. Today
- // the executor creation will fail with 0 groups so the program won't get in
- // a weird state but it's probably not what a user would expect.
+ // TODO(benvanik): if group count is 0 then don't create the executor.
+ // Today the executor creation will fail with 0 groups so the program
+ // won't get in a weird state but it's probably not what a user would
+ // expect.
- // Create executor with the given topology.
- status = iree_task_executor_create(options, &topology, host_allocator,
- &executors[i]);
+ // Create executor with the given topology.
+ status = iree_task_executor_create(options, &topology, host_allocator,
+ &executors[i]);
- // Executor has consumed the topology and it can be dropped now.
- iree_task_topology_deinitialize(&topology);
- if (!iree_status_is_ok(status)) break;
+ // Executor has consumed the topology and it can be dropped now.
+ iree_task_topology_deinitialize(&topology);
+ if (!iree_status_is_ok(status)) break;
+ }
+ } else {
+ for (iree_host_size_t i = 0; i < topology_count; ++i) {
+ // Query topology for the node this executor is pinned to.
+ iree_task_topology_t topology;
+ status = iree_task_topology_initialize_from_logical_cpu_set_string(
+ cpu_ids_list.values[i], &topology);
+ if (!iree_status_is_ok(status)) break;
+
+ // TODO(benvanik): if group count is 0 then don't create the executor.
+ // Today the executor creation will fail with 0 groups so the program
+ // won't get in a weird state but it's probably not what a user would
+ // expect.
+
+ // Create executor with the given topology.
+ status = iree_task_executor_create(options, &topology, host_allocator,
+ &executors[i]);
+
+ // Executor has consumed the topology and it can be dropped now.
+ iree_task_topology_deinitialize(&topology);
+ if (!iree_status_is_ok(status)) break;
+ }
}
if (iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/task/topology.c b/runtime/src/iree/task/topology.c
index 1bf11a1..36bfea2 100644
--- a/runtime/src/iree/task/topology.c
+++ b/runtime/src/iree/task/topology.c
@@ -76,11 +76,25 @@
return iree_ok_status();
}
+// Fixes constructive_sharing_mask values such that they represent other chosen
+// topology groups instead of processor indices. We do this so that code using
+// the topology groups doesn't need to know anything about which physical
+// processor IDs a particular group is mapped to.
+//
+// This is implemented by platform-specific logic and may be a no-op if the
+// platform doesn't support querying the required cache information.
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
+ iree_task_topology_t* topology);
+
void iree_task_topology_initialize_from_group_count(
iree_host_size_t group_count, iree_task_topology_t* out_topology) {
+ // Clamp to the maximum we support.
+ group_count = iree_min(group_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, group_count);
+ // Initialize default groups with no affinities specified.
iree_task_topology_initialize(out_topology);
for (iree_host_size_t i = 0; i < group_count; ++i) {
iree_task_topology_group_t* group = &out_topology->groups[i];
@@ -90,3 +104,64 @@
IREE_TRACE_ZONE_END(z0);
}
+
+iree_status_t iree_task_topology_initialize_from_thread_affinities(
+ iree_host_size_t group_count,
+ const iree_thread_affinity_t* group_affinities,
+ iree_task_topology_t* out_topology) {
+ // Today we have a fixed limit on the number of groups within a particular
+ // topology.
+ if (group_count >= IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT) {
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "too many groups specified (%" PRIhsz
+ " provided for a max capacity of %zu)",
+ group_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+ }
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, group_count);
+
+ // Initialize each group with the given affinities.
+ iree_task_topology_initialize(out_topology);
+ for (iree_host_size_t i = 0; i < group_count; ++i) {
+ iree_task_topology_group_t* group = &out_topology->groups[i];
+ iree_task_topology_group_initialize(i, group);
+ group->ideal_thread_affinity = group_affinities[i];
+ }
+ out_topology->group_count = group_count;
+
+ // Try to use platform support to set the constructive sharing masks.
+ // No-op if the platform support is not available.
+ iree_status_t status =
+ iree_task_topology_fixup_constructive_sharing_masks(out_topology);
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set_string(
+ iree_string_view_t cpu_id_set, iree_task_topology_t* out_topology) {
+ if (iree_string_view_is_empty(cpu_id_set)) {
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "at least one CPU ID must be provided");
+ }
+ iree_host_size_t count = 1;
+ for (iree_host_size_t i = 0; i < cpu_id_set.size; ++i) {
+ if (cpu_id_set.data[i] == ',') ++count;
+ }
+ uint32_t* cpu_ids = (uint32_t*)iree_alloca(count * sizeof(uint32_t));
+ memset(cpu_ids, 0, count * sizeof(uint32_t));
+ iree_host_size_t cpu_count = 0;
+ while (!iree_string_view_is_empty(cpu_id_set)) {
+ iree_string_view_t cpu_id_string = iree_string_view_empty();
+ iree_string_view_split(cpu_id_set, ',', &cpu_id_string, &cpu_id_set);
+ if (!iree_string_view_atoi_uint32(iree_string_view_trim(cpu_id_string),
+ &cpu_ids[cpu_count++])) {
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "'%.*s' not a valid CPU ID",
+ (int)cpu_id_string.size, cpu_id_string.data);
+ }
+ }
+ return iree_task_topology_initialize_from_logical_cpu_set(cpu_count, cpu_ids,
+ out_topology);
+}
diff --git a/runtime/src/iree/task/topology.h b/runtime/src/iree/task/topology.h
index a449e2a..c563be2 100644
--- a/runtime/src/iree/task/topology.h
+++ b/runtime/src/iree/task/topology.h
@@ -152,8 +152,32 @@
void iree_task_topology_initialize_from_group_count(
iree_host_size_t group_count, iree_task_topology_t* out_topology);
+// Initializes a topology with the given groups each assigned a platform thread
+// affinity. See `iree_thread_affinity_t` for more information about how to
+// properly initialize the thread affinities for each platform.
+iree_status_t iree_task_topology_initialize_from_thread_affinities(
+ iree_host_size_t group_count,
+ const iree_thread_affinity_t* group_affinities,
+ iree_task_topology_t* out_topology);
+
+// Initializes a topology with one group for each logical CPU specified.
+//
+// The logical CPU IDs are in the platform-defined flattened domain of 0 to
+// the total number of logical processors in the system such as those returned
+// by `lscpu --extended`/lstopo/the bit index in cpu_set_t. The same ID is used
+// on the file-based access in e.g. `/sys/devices/system/cpu/cpu<cpu_id>/`.
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+ iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+ iree_task_topology_t* out_topology);
+
+// Initializes a topology with one group for each logical CPU specified in a
+// comma-delimited list.
+// See iree_task_topology_initialize_from_logical_cpu_set for more information.
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set_string(
+ iree_string_view_t cpu_id_set, iree_task_topology_t* out_topology);
+
// Initializes a topology with one group for each physical core with the given
-// NUMA node ID (usually package or cluster). Up to |max_core_count| physical
+// NUMA |node_id| (usually package or cluster). Up to |max_core_count| physical
// cores will be selected from the node.
iree_status_t iree_task_topology_initialize_from_physical_cores(
iree_task_topology_node_id_t node_id, iree_host_size_t max_core_count,
diff --git a/runtime/src/iree/task/topology_cpuinfo.c b/runtime/src/iree/task/topology_cpuinfo.c
index 2f8198d..4a9dc77 100644
--- a/runtime/src/iree/task/topology_cpuinfo.c
+++ b/runtime/src/iree/task/topology_cpuinfo.c
@@ -18,12 +18,14 @@
iree_host_size_t max_group_count, iree_task_topology_t* out_topology) {
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, max_group_count);
+
// TODO(benvanik): implement our own query... but that seems not so great.
// For now we default to a single group: if a user wants more then they can
// either get cpuinfo working for their platform or manually construct the
// topology themselves.
iree_host_size_t group_count = 1;
iree_task_topology_initialize_from_group_count(group_count, out_topology);
+
IREE_TRACE_ZONE_END(z0);
}
@@ -35,6 +37,50 @@
return 0;
}
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
+ iree_task_topology_t* topology) {
+ // No-op.
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+ iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+ iree_task_topology_t* out_topology) {
+ // Today we have a fixed limit on the number of groups within a particular
+ // topology.
+ if (cpu_count >= IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT) {
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "too many CPUs specified (%" PRIhsz
+ " provided for a max capacity of %zu)",
+ cpu_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+ }
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, cpu_count);
+
+ iree_task_topology_initialize(out_topology);
+
+ out_topology->group_count = cpu_count;
+ for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+ iree_task_topology_group_t* group = &out_topology->groups[i];
+ iree_task_topology_group_initialize(i, group);
+ group->processor_index = cpu_ids[i];
+
+ // NOTE: without cpuinfo we can't get SMT and node info but this isn't
+ // really used on Linux today anyway.
+ iree_thread_affinity_t* affinity = &group->ideal_thread_affinity;
+ memset(affinity, 0, sizeof(*affinity));
+ affinity->specified = 1;
+ affinity->id = cpu_ids[i];
+ }
+
+ iree_status_t status =
+ iree_task_topology_fixup_constructive_sharing_masks(out_topology);
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
iree_status_t iree_task_topology_initialize_from_physical_cores(
iree_task_topology_node_id_t node_id, iree_host_size_t max_core_count,
iree_task_topology_t* out_topology) {
@@ -134,6 +180,30 @@
#endif // cpuinfo-like platform field
}
+// Populates |out_group| with the information from |processor|.
+static void iree_task_topology_group_initialize_from_processor(
+ uint32_t group_index, const struct cpuinfo_processor* processor,
+ iree_task_topology_group_t* out_group) {
+ iree_task_topology_group_initialize(group_index, out_group);
+ out_group->processor_index = processor->linux_id;
+ iree_task_topology_set_affinity_from_processor(
+ processor, &out_group->ideal_thread_affinity);
+}
+
+// Populates |out_group| with the information from |core|.
+static void iree_task_topology_group_initialize_from_core(
+ uint32_t group_index, const struct cpuinfo_core* core,
+ iree_task_topology_group_t* out_group) {
+ // Guess: always pick the first processor in a core.
+ // When pinning to threads we'll take into account whether the core is SMT
+ // and use all threads anyway so this alignment is just helpful for debugging.
+ uint32_t processor_i = core->processor_start;
+ const struct cpuinfo_processor* processor =
+ cpuinfo_get_processor(processor_i);
+ iree_task_topology_group_initialize_from_processor(group_index, processor,
+ out_group);
+}
+
// Returns a bitset with all *processors* that share the same |cache|.
static uint64_t iree_task_topology_calculate_cache_bits(
const struct cpuinfo_cache* cache) {
@@ -163,30 +233,13 @@
return mask;
}
-// Populates |our_group| with the information from |core|.
-static void iree_task_topology_group_initialize_from_core(
- uint32_t group_index, const struct cpuinfo_core* core,
- iree_task_topology_group_t* out_group) {
- iree_task_topology_group_initialize(group_index, out_group);
-
- // Guess: always pick the first processor in a core.
- // When pinning to threads we'll take into account whether the core is SMT
- // and use all threads anyway so this alignment is just helpful for debugging.
- uint32_t processor_i = core->processor_start;
- out_group->processor_index = processor_i;
-
- const struct cpuinfo_processor* processor =
- cpuinfo_get_processor(processor_i);
- iree_task_topology_set_affinity_from_processor(
- processor, &out_group->ideal_thread_affinity);
-}
-
-// Fixes constructive_sharing_mask values such that they represent other chosen
-// topology groups instead of processor indices. We do this so that code using
-// the topology groups doesn't need to know anything about which physical
-// processor IDs a particular group is mapped to.
-static void iree_task_topology_fixup_constructive_sharing_masks(
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
iree_task_topology_t* topology) {
+ if (!iree_task_topology_is_cpuinfo_available()) {
+ // No-op when cpuinfo is unavailable.
+ return iree_ok_status();
+ }
+
// O(n^2), but n is always <= 64 (and often <= 8).
for (iree_host_size_t i = 0; i < topology->group_count; ++i) {
iree_task_topology_group_t* group = &topology->groups[i];
@@ -198,7 +251,6 @@
iree_task_topology_group_mask_t group_mask = 0;
for (iree_host_size_t j = 0; j < topology->group_count; ++j) {
- if (i == j) continue;
const iree_task_topology_group_t* other_group = &topology->groups[j];
uint64_t group_processor_bits =
iree_math_rotl_u64(1ull, other_group->processor_index);
@@ -209,6 +261,56 @@
group->constructive_sharing_mask = group_mask;
}
+
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+ iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+ iree_task_topology_t* out_topology) {
+ // Ensure cpuinfo is available; if not we fall back to random.
+ if (!iree_task_topology_is_cpuinfo_available()) {
+ iree_task_topology_initialize_fallback(cpu_count, out_topology);
+ return iree_ok_status();
+ }
+
+ // Today we have a fixed limit on the number of groups within a particular
+ // topology.
+ if (cpu_count >= IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT) {
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "too many CPUs specified (%" PRIhsz
+ " provided for a max capacity of %zu)",
+ cpu_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+ }
+
+ // Validate the CPU IDs provided.
+ const uint32_t processor_count = cpuinfo_get_processors_count();
+ for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+ if (cpu_ids[i] >= processor_count) {
+ return iree_make_status(
+ IREE_STATUS_OUT_OF_RANGE,
+ "cpu_ids[%" PRIhsz
+ "] %u out of bounds, only %u logical processors available",
+ i, cpu_ids[i], processor_count);
+ }
+ }
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, cpu_count);
+
+ iree_task_topology_initialize(out_topology);
+
+ out_topology->group_count = cpu_count;
+ for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+ const struct cpuinfo_processor* processor =
+ cpuinfo_get_processor(cpu_ids[i]);
+ iree_task_topology_group_initialize_from_processor(
+ i, processor, &out_topology->groups[i]);
+ }
+
+ iree_task_topology_fixup_constructive_sharing_masks(out_topology);
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
}
// Returns true if the given |core| passes the filter and should be included.
@@ -237,12 +339,12 @@
static void iree_task_topology_initialize_from_physical_cores_with_filter(
iree_task_topology_core_filter_t filter_fn, uintptr_t filter_fn_data,
iree_host_size_t max_core_count, iree_task_topology_t* out_topology) {
- max_core_count = iree_min(max_core_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
if (!iree_task_topology_is_cpuinfo_available()) {
iree_task_topology_initialize_fallback(max_core_count, out_topology);
return;
}
+ max_core_count = iree_min(max_core_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, max_core_count);
diff --git a/runtime/src/iree/task/topology_win32.c b/runtime/src/iree/task/topology_win32.c
index 770bcf9..30f62f9 100644
--- a/runtime/src/iree/task/topology_win32.c
+++ b/runtime/src/iree/task/topology_win32.c
@@ -89,6 +89,192 @@
}
}
+// Assigns constructive sharing masks to each topology group. These indicate
+// which other topology groups share L3 caches (if any).
+static void
+iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* relationships,
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* relationships_end,
+ iree_task_topology_t* topology) {
+ for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = relationships;
+ p < relationships_end;
+ p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
+ if (p->Relationship == RelationCache) {
+ if (p->Cache.Level == 3 &&
+ (p->Cache.Type == CacheUnified || p->Cache.Type == CacheData)) {
+ if (p->Cache.GroupCount == 0) {
+ iree_task_topology_assign_constructive_sharing(topology,
+ p->Cache.GroupMask);
+ } else {
+ for (WORD i = 0; i < p->Cache.GroupCount; ++i) {
+ iree_task_topology_assign_constructive_sharing(
+ topology, p->Cache.GroupMasks[i]);
+ }
+ }
+ }
+ }
+ }
+}
+
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
+ iree_task_topology_t* topology) {
+ // Query the total size required for just cache information and allocate
+ // storage for it on the stack - it's generally just a few KB.
+ DWORD cache_relationships_size = 0;
+ if (!GetLogicalProcessorInformationEx(RelationCache, NULL,
+ &cache_relationships_size) &&
+ GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ return iree_make_status(
+ iree_status_code_from_win32_error(GetLastError()),
+ "failed to query logical processor information size (%08X)",
+ GetLastError());
+ }
+ if (cache_relationships_size > 64 * 1024) {
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "logical processor information size overflow (got "
+ "%u which is large for a stack alloc)",
+ cache_relationships_size);
+ }
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* cache_relationships =
+ (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)iree_alloca(
+ cache_relationships_size);
+
+ // Query again to populate the storage with cache relationship information.
+ if (!GetLogicalProcessorInformationEx(RelationCache, cache_relationships,
+ &cache_relationships_size)) {
+ return iree_make_status(
+ iree_status_code_from_win32_error(GetLastError()),
+ "failed to query logical processor information (%08X)", GetLastError());
+ }
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* cache_relationships_end =
+ (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)
+ cache_relationships +
+ cache_relationships_size);
+
+ // Perform the assignment.
+ iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+ cache_relationships, cache_relationships_end, topology);
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+ iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+ iree_task_topology_t* out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)cpu_count);
+
+ iree_task_topology_initialize(out_topology);
+
+ // Query the total size required for all information and allocate storage for
+ // it on the stack - it's generally just a few KB.
+ DWORD all_relationships_size = 0;
+ if (!GetLogicalProcessorInformationEx(RelationAll, NULL,
+ &all_relationships_size) &&
+ GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ IREE_TRACE_ZONE_END(z0);
+ return iree_make_status(
+ iree_status_code_from_win32_error(GetLastError()),
+ "failed to query logical processor information size (%08X)",
+ GetLastError());
+ }
+ if (all_relationships_size > 64 * 1024) {
+ IREE_TRACE_ZONE_END(z0);
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "logical processor information size overflow (got "
+ "%u which is large for a stack alloc)",
+ all_relationships_size);
+ }
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships =
+ (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)iree_alloca(
+ all_relationships_size);
+
+ // Query again to populate the storage with all relationship information.
+ if (!GetLogicalProcessorInformationEx(RelationAll, all_relationships,
+ &all_relationships_size)) {
+ IREE_TRACE_ZONE_END(z0);
+ return iree_make_status(
+ iree_status_code_from_win32_error(GetLastError()),
+ "failed to query logical processor information (%08X)", GetLastError());
+ }
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships_end =
+ (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)all_relationships +
+ all_relationships_size);
+
+ // Count up the total number of logical processors (bits in each core group).
+ uint32_t total_processor_count = 0;
+ for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = all_relationships;
+ p < all_relationships_end;
+ p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
+ if (p->Relationship == RelationProcessorCore) {
+ assert(p->Processor.GroupCount == 1);
+ total_processor_count +=
+ iree_task_count_kaffinity_bits(p->Processor.GroupMask[0].Mask);
+ }
+ }
+
+ // Validate the CPU IDs provided and build a lookup table of processors we
+ // have selected. This could be a bitmap but it's not worth the code today.
+ uint8_t* included_processors =
+ (uint8_t*)iree_alloca(total_processor_count * sizeof(uint8_t));
+ memset(included_processors, 0, total_processor_count * sizeof(uint8_t));
+ for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+ if (cpu_ids[i] >= total_processor_count) {
+ IREE_TRACE_ZONE_END(z0);
+ return iree_make_status(
+ IREE_STATUS_OUT_OF_RANGE,
+ "cpu_ids[%" PRIhsz
+ "] %u out of bounds, only %u logical processors available",
+ i, cpu_ids[i], total_processor_count);
+ }
+ included_processors[cpu_ids[i]] = 1;
+ }
+
+ // Build an on-stack table for random access into all logical processors.
+ // This isn't strictly required but makes it easier to walk the CPU table.
+ PROCESSOR_RELATIONSHIP** all_processors =
+ iree_alloca(sizeof(PROCESSOR_RELATIONSHIP*) * total_processor_count);
+ iree_host_size_t global_processor_count = 0;
+ for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = all_relationships;
+ p < all_relationships_end;
+ p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
+ if (p->Relationship != RelationProcessorCore) continue;
+ assert(p->Processor.GroupCount == 1);
+ KAFFINITY mask = p->Processor.GroupMask[0].Mask;
+ int group_offset = 0;
+ while (mask) {
+ int bit_offset = iree_task_count_trailing_zeros_kaffinity(mask);
+ mask = mask >> (bit_offset + 1);
+ iree_host_size_t global_processor_index = global_processor_count++;
+ if (included_processors[global_processor_index]) {
+ // Setup the group for the processor.
+ uint8_t group_index = (uint8_t)out_topology->group_count++;
+ iree_task_topology_group_t* group = &out_topology->groups[group_index];
+ iree_task_topology_group_initialize(group_index, group);
+ group->processor_index = (uint32_t)global_processor_index;
+ group->constructive_sharing_mask = 0; // set below
+
+ // Pin group to the processor.
+ iree_thread_affinity_t* affinity = &group->ideal_thread_affinity;
+ memset(affinity, 0, sizeof(*affinity));
+ affinity->specified = 1;
+ affinity->smt = (p->Processor.Flags & LTP_PC_SMT) == LTP_PC_SMT;
+ affinity->group = p->Processor.GroupMask[0].Group;
+ affinity->id = group_offset + bit_offset;
+ }
+ group_offset += bit_offset + 1;
+ if (out_topology->group_count >= cpu_count) break;
+ }
+ if (out_topology->group_count >= cpu_count) break;
+ }
+
+ // Assign constructive sharing masks to each topology group.
+ iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+ all_relationships, all_relationships_end, out_topology);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
iree_status_t iree_task_topology_initialize_from_physical_cores(
iree_task_topology_node_id_t node_id, iree_host_size_t max_core_count,
iree_task_topology_t* out_topology) {
@@ -258,26 +444,9 @@
all_cores[adjusted_core_index], &group->ideal_thread_affinity);
}
- // Assign constructive sharing masks to each topology group. These indicate
- // which other topology groups share L3 caches (if any).
- for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = all_relationships;
- p < all_relationships_end;
- p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
- if (p->Relationship == RelationCache) {
- if (p->Cache.Level == 3 &&
- (p->Cache.Type == CacheUnified || p->Cache.Type == CacheData)) {
- if (p->Cache.GroupCount == 0) {
- iree_task_topology_assign_constructive_sharing(out_topology,
- p->Cache.GroupMask);
- } else {
- for (WORD i = 0; i < p->Cache.GroupCount; ++i) {
- iree_task_topology_assign_constructive_sharing(
- out_topology, p->Cache.GroupMasks[i]);
- }
- }
- }
- }
- }
+ // Assign constructive sharing masks to each topology group.
+ iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+ all_relationships, all_relationships_end, out_topology);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();