Adding `--task_topology_cpu_ids=` flag. (#14969)

This allows one or more logical CPU ID sets to be explicitly specified for
use as task system topologies. The IDs are in the range [0, total_cpu_count)
to match lscpu/lstopo on Linux and index a flattened view of all logical
processors on Windows (so with group 0 having 4 logical CPUs an ID of 6
indicates group 1 CPU 2). When the flag is specified all other topology
flags are ignored, as the NUMA node is implicit in the absolute logical
CPU ID.
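
To make the Windows flattening concrete, here is a minimal sketch of the ID
math (illustrative only: it assumes uniformly sized processor groups, whereas
the runtime walks the actual `RelationProcessorCore` entries and group sizes
may vary):

```c
// Hypothetical helper, not part of the runtime: maps an absolute logical
// CPU ID to a Windows (processor group, in-group ID) pair assuming every
// group holds `group_size` logical CPUs.
static void flat_cpu_id_to_group(uint32_t flat_id, uint32_t group_size,
                                 uint32_t* out_group, uint32_t* out_id) {
  *out_group = flat_id / group_size;  // e.g. 6 / 4 == 1 -> group 1
  *out_id = flat_id % group_size;     // e.g. 6 % 4 == 2 -> CPU 2
}
```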

For programmatic control (Python/etc) the
`iree_task_topology_initialize_from_logical_cpu_set_string` helper is
exposed. A hosting application that already knows precisely which thread
affinities it wants can use the new
`iree_task_topology_initialize_from_thread_affinities` to specify them
directly without any interpretation while still getting proper
constructive sharing masks where the platform supports them.
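
A minimal sketch of both entry points (error handling and executor creation
elided; `IREE_SV`/`IREE_ARRAYSIZE` are the existing base macros, and the
affinity field assignment mirrors the Linux-style IDs used by the runtime; on
Windows the `group` field would also need to be set):

```c
#include <string.h>

#include "iree/task/topology.h"

// Sketch: one topology pinned to logical CPUs 0 and 32 via explicit thread
// affinities; constructive sharing masks are still computed where the
// platform supports it.
iree_status_t make_pinned_topology(iree_task_topology_t* out_topology) {
  iree_thread_affinity_t affinities[2];
  memset(affinities, 0, sizeof(affinities));
  affinities[0].specified = 1;
  affinities[0].id = 0;   // logical CPU 0
  affinities[1].specified = 1;
  affinities[1].id = 32;  // logical CPU 32
  return iree_task_topology_initialize_from_thread_affinities(
      IREE_ARRAYSIZE(affinities), affinities, out_topology);
}

// Sketch: the equivalent topology from a comma-delimited CPU ID string.
iree_status_t make_topology_from_string(iree_task_topology_t* out_topology) {
  return iree_task_topology_initialize_from_logical_cpu_set_string(
      IREE_SV("0,32"), out_topology);
}
```

In either case the caller owns the topology and should
`iree_task_topology_deinitialize` it once an executor has consumed it.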

Examples:
`--task_topology_cpu_ids=0`: one topology with logical CPU 0 only
`--task_topology_cpu_ids=0,1`: one topology with logical CPUs 0 and 1
`--task_topology_cpu_ids=0,1 --task_topology_cpu_ids=32,33`: two
topologies with logical CPUs 0,1 and 32,33 respectively
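
These flags compose with any tool that registers the task topology flags; an illustrative invocation (assuming the local-task device): `iree-run-module --device=local-task --task_topology_cpu_ids=0,1 --task_topology_cpu_ids=32,33` creates two executors, each pinned to its own pair of logical CPUs.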
diff --git a/runtime/src/iree/base/alignment.h b/runtime/src/iree/base/alignment.h
index 54c64dd..bbf3b19 100644
--- a/runtime/src/iree/base/alignment.h
+++ b/runtime/src/iree/base/alignment.h
@@ -130,6 +130,22 @@
 #define iree_sizeof_struct(t) iree_host_align(sizeof(t), iree_max_align_t)
 
 // Returns the ceil-divide of |lhs| by non-zero |rhs|.
+static inline iree_host_size_t iree_host_size_ceil_div(iree_host_size_t lhs,
+                                                       iree_host_size_t rhs) {
+  return ((lhs != 0) && (lhs > 0) == (rhs > 0))
+             ? ((lhs + ((rhs > 0) ? -1 : 1)) / rhs) + 1
+             : -(-lhs / rhs);
+}
+
+// Returns the floor-divide of |lhs| by non-zero |rhs|.
+static inline iree_host_size_t iree_host_size_floor_div(iree_host_size_t lhs,
+                                                        iree_host_size_t rhs) {
+  return ((lhs != 0) && ((lhs < 0) != (rhs < 0)))
+             ? -((-lhs + ((rhs < 0) ? 1 : -1)) / rhs) - 1
+             : lhs / rhs;
+}
+
+// Returns the ceil-divide of |lhs| by non-zero |rhs|.
 static inline iree_device_size_t iree_device_size_ceil_div(
     iree_device_size_t lhs, iree_device_size_t rhs) {
   return ((lhs != 0) && (lhs > 0) == (rhs > 0))
diff --git a/runtime/src/iree/task/api.c b/runtime/src/iree/task/api.c
index 086d2ae..d55d5b6 100644
--- a/runtime/src/iree/task/api.c
+++ b/runtime/src/iree/task/api.c
@@ -62,21 +62,17 @@
 //===----------------------------------------------------------------------===//
 
 IREE_FLAG(
-    string, task_topology_nodes, "current",
-    "Comma-separated list of NUMA nodes that topologies will be defined for.\n"
-    "Each node specified will be configured based on the other topology\n"
-    "flags. 'all' can be used to indicate all available NUMA nodes and\n"
-    "'current' will inherit the node of the calling thread.");
-
-IREE_FLAG(
     string, task_topology_mode, "physical_cores",
     "Available modes:\n"
     " --task_topology_group_count=non-zero:\n"
     "   Uses whatever the specified group count is and ignores the set mode.\n"
     "   All threads will be unpinned and run on system-determined processors.\n"
+    " --task_topology_cpu_ids=0,1,2 [+ --task_topology_cpu_ids=3,4,5]:\n"
+    "   Creates one executor per set of logical CPU IDs.\n"
     " 'physical_cores':\n"
-    "   Creates one group per physical core in each NUMA node up to\n"
-    "   the value specified by --task_topology_max_group_count=.");
+    "   Creates one executor per NUMA node in --task_topology_nodes= and one\n"
+    "   group per physical core in each NUMA node up to the value specified\n"
+    "   by --task_topology_max_group_count=.");
 
 IREE_FLAG(
     int32_t, task_topology_group_count, 0,
@@ -87,6 +83,20 @@
     "WARNING: setting this flag directly is not recommended; use\n"
     "--task_topology_max_group_count= instead.");
 
+IREE_FLAG_LIST(
+    string, task_topology_cpu_ids,
+    "A list of absolute logical CPU IDs to use for a single topology. One\n"
+    "topology will be created for each repetition of the flag. CPU IDs match\n"
+    "the Linux logical CPU ID scheme (as used by lscpu/lstopo) or a flattened\n"
+    "[0, total_processor_count) range on Windows.");
+
+IREE_FLAG(
+    string, task_topology_nodes, "current",
+    "Comma-separated list of NUMA nodes that topologies will be defined for.\n"
+    "Each node specified will be configured based on the other topology\n"
+    "flags. 'all' can be used to indicate all available NUMA nodes and\n"
+    "'current' will inherit the node of the calling thread.");
+
 IREE_FLAG(
     int32_t, task_topology_max_group_count, 8,
     "Sets a maximum value on the worker count that can be automatically\n"
@@ -178,72 +188,90 @@
   fprintf(file, "# --%.*s\n", (int)flag_name.size, flag_name.data);
 }
 
-static iree_status_t iree_task_flags_dump_task_topologies(
-    iree_string_view_t flag_name, void* storage, iree_string_view_t value) {
-  // Select which nodes in the machine we will be creating topologies for.
-  uint64_t node_mask = 0ull;
-  IREE_RETURN_IF_ERROR(
-      iree_task_topologies_select_nodes_from_flags(&node_mask));
-
-  // TODO(benvanik): macros to make this iteration easier (ala cpu_set
-  // iterators).
-  iree_host_size_t topology_count = iree_math_count_ones_u64(node_mask);
-  uint64_t node_mask_bits = node_mask;
-  iree_task_topology_node_id_t node_base_id = 0;
-  for (iree_host_size_t i = 0; i < topology_count; ++i) {
-    int node_offset =
-        iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
-    iree_task_topology_node_id_t node_id = node_base_id + node_offset;
-    node_base_id += node_offset + 1;
-    node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
-    iree_task_topology_t topology;
-    IREE_RETURN_IF_ERROR(
-        iree_task_topology_initialize_from_flags(node_id, &topology));
-    fprintf(stdout,
-            "# "
-            "===-------------------------------------------------------------"
-            "-----------===\n");
-    fprintf(stdout, "# topology[%" PRIhsz "]: %" PRIhsz " worker groups\n", i,
-            topology.group_count);
-    fprintf(stdout,
-            "# "
-            "===-------------------------------------------------------------"
-            "-----------===\n");
-    fprintf(stdout, "#\n");
-    for (iree_host_size_t j = 0; j < topology.group_count; ++j) {
-      const iree_task_topology_group_t* group = &topology.groups[j];
-      fprintf(stdout, "# group[%d]: '%s'\n", group->group_index, group->name);
-      fprintf(stdout, "#      processor: %u\n", group->processor_index);
-      fprintf(stdout, "#       affinity: ");
-      if (group->ideal_thread_affinity.specified) {
-        fprintf(stdout, "group=%u, id=%u, smt=%u",
-                group->ideal_thread_affinity.group,
-                group->ideal_thread_affinity.id,
-                group->ideal_thread_affinity.smt);
-      } else {
-        fprintf(stdout, "(unspecified)");
+static void iree_task_flags_dump_task_topology(
+    iree_host_size_t topology_id, const iree_task_topology_t* topology) {
+  fprintf(stdout,
+          "# "
+          "===-------------------------------------------------------------"
+          "-----------===\n");
+  fprintf(stdout, "# topology[%" PRIhsz "]: %" PRIhsz " worker groups\n",
+          topology_id, topology->group_count);
+  fprintf(stdout,
+          "# "
+          "===-------------------------------------------------------------"
+          "-----------===\n");
+  fprintf(stdout, "#\n");
+  for (iree_host_size_t j = 0; j < topology->group_count; ++j) {
+    const iree_task_topology_group_t* group = &topology->groups[j];
+    fprintf(stdout, "# group[%d]: '%s'\n", group->group_index, group->name);
+    fprintf(stdout, "#      processor: %u\n", group->processor_index);
+    fprintf(stdout, "#       affinity: ");
+    if (group->ideal_thread_affinity.specified) {
+      fprintf(
+          stdout, "group=%u, id=%u, smt=%u", group->ideal_thread_affinity.group,
+          group->ideal_thread_affinity.id, group->ideal_thread_affinity.smt);
+    } else {
+      fprintf(stdout, "(unspecified)");
+    }
+    fprintf(stdout, "\n");
+    fprintf(stdout, "#  cache sharing: ");
+    if (group->constructive_sharing_mask == 0) {
+      fprintf(stdout, "(none)\n");
+    } else if (group->constructive_sharing_mask ==
+               IREE_TASK_TOPOLOGY_GROUP_MASK_ALL) {
+      fprintf(stdout, "(all/undefined)\n");
+    } else {
+      fprintf(stdout, "%d group(s): ",
+              iree_math_count_ones_u64(group->constructive_sharing_mask));
+      for (iree_host_size_t ic = 0, jc = 0;
+           ic < IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT; ++ic) {
+        if ((group->constructive_sharing_mask >> ic) & 1) {
+          if (jc > 0) fprintf(stdout, ", ");
+          fprintf(stdout, "%" PRIhsz, ic);
+          ++jc;
+        }
       }
       fprintf(stdout, "\n");
-      fprintf(stdout, "#  cache sharing: ");
-      if (group->constructive_sharing_mask == 0) {
-        fprintf(stdout, "(none)\n");
-      } else if (group->constructive_sharing_mask ==
-                 IREE_TASK_TOPOLOGY_GROUP_MASK_ALL) {
-        fprintf(stdout, "(all/undefined)\n");
-      } else {
-        fprintf(stdout, "%d group(s): ",
-                iree_math_count_ones_u64(group->constructive_sharing_mask));
-        for (iree_host_size_t ic = 0, jc = 0;
-             ic < IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT; ++ic) {
-          if ((group->constructive_sharing_mask >> ic) & 1) {
-            if (jc > 0) fprintf(stdout, ", ");
-            fprintf(stdout, "%" PRIhsz, ic);
-            ++jc;
-          }
-        }
-        fprintf(stdout, "\n");
-      }
-      fprintf(stdout, "#\n");
+    }
+    fprintf(stdout, "#\n");
+  }
+}
+
+static iree_status_t iree_task_flags_dump_task_topologies(
+    iree_string_view_t flag_name, void* storage, iree_string_view_t value) {
+  const iree_flag_string_list_t cpu_ids_list =
+      FLAG_task_topology_cpu_ids_list();
+  if (cpu_ids_list.count == 0) {
+    // Select which nodes in the machine we will be creating topologies for.
+    uint64_t node_mask = 0ull;
+    IREE_RETURN_IF_ERROR(
+        iree_task_topologies_select_nodes_from_flags(&node_mask));
+
+    // TODO(benvanik): macros to make this iteration easier (ala cpu_set
+    // iterators).
+    iree_host_size_t topology_count = iree_math_count_ones_u64(node_mask);
+    uint64_t node_mask_bits = node_mask;
+    iree_task_topology_node_id_t node_base_id = 0;
+    for (iree_host_size_t i = 0; i < topology_count; ++i) {
+      int node_offset =
+          iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
+      iree_task_topology_node_id_t node_id = node_base_id + node_offset;
+      node_base_id += node_offset + 1;
+      node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
+      iree_task_topology_t topology;
+      IREE_RETURN_IF_ERROR(
+          iree_task_topology_initialize_from_flags(node_id, &topology));
+      iree_task_flags_dump_task_topology(i, &topology);
+      iree_task_topology_deinitialize(&topology);
+    }
+  } else {
+    for (iree_host_size_t i = 0; i < cpu_ids_list.count; ++i) {
+      iree_task_topology_t topology;
+      IREE_RETURN_IF_ERROR(
+          iree_task_topology_initialize_from_logical_cpu_set_string(
+              cpu_ids_list.values[i], &topology));
+      iree_task_flags_dump_task_topology(i, &topology);
+      iree_task_topology_deinitialize(&topology);
     }
   }
 
@@ -275,11 +303,19 @@
   IREE_RETURN_IF_ERROR(
       iree_task_executor_options_initialize_from_flags(&options));
 
-  // Select which nodes in the machine we will be creating topologies for.
+  // Select which nodes in the machine we will be creating topologies for based
+  // on the topology mode.
+  iree_host_size_t topology_count = 0;
   uint64_t node_mask = 0ull;
-  IREE_RETURN_IF_ERROR(
-      iree_task_topologies_select_nodes_from_flags(&node_mask));
-  const iree_host_size_t topology_count = iree_math_count_ones_u64(node_mask);
+  const iree_flag_string_list_t cpu_ids_list =
+      FLAG_task_topology_cpu_ids_list();
+  if (cpu_ids_list.count == 0) {
+    IREE_RETURN_IF_ERROR(
+        iree_task_topologies_select_nodes_from_flags(&node_mask));
+    topology_count = iree_math_count_ones_u64(node_mask);
+  } else {
+    topology_count = cpu_ids_list.count;
+  }
 
   // Since this utility function creates one executor per topology returned by
   // the query we can check the executor capacity immediately.
@@ -309,34 +345,58 @@
   }
 
   // Create one executor per topology.
-  // TODO(benvanik): macros to make this iteration easier (ala cpu_set
-  // iterators).
   iree_status_t status = iree_ok_status();
-  uint64_t node_mask_bits = node_mask;
-  iree_task_topology_node_id_t node_base_id = 0;
-  for (iree_host_size_t i = 0; i < topology_count; ++i) {
-    int node_offset =
-        iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
-    iree_task_topology_node_id_t node_id = node_base_id + node_offset;
-    node_base_id += node_offset + 1;
-    node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
+  if (cpu_ids_list.count == 0) {
+    // TODO(benvanik): macros to make this iteration easier (ala cpu_set
+    // iterators).
+    uint64_t node_mask_bits = node_mask;
+    iree_task_topology_node_id_t node_base_id = 0;
+    for (iree_host_size_t i = 0; i < topology_count; ++i) {
+      int node_offset =
+          iree_task_affinity_set_count_trailing_zeros(node_mask_bits);
+      iree_task_topology_node_id_t node_id = node_base_id + node_offset;
+      node_base_id += node_offset + 1;
+      node_mask_bits = iree_shr(node_mask_bits, node_offset + 1);
 
-    // Query topology for the node this executor is pinned to.
-    iree_task_topology_t topology;
-    status = iree_task_topology_initialize_from_flags(node_id, &topology);
-    if (!iree_status_is_ok(status)) break;
+      // Query topology for the node this executor is pinned to.
+      iree_task_topology_t topology;
+      status = iree_task_topology_initialize_from_flags(node_id, &topology);
+      if (!iree_status_is_ok(status)) break;
 
-    // TODO(benvanik): if group count is 0 then don't create the executor. Today
-    // the executor creation will fail with 0 groups so the program won't get in
-    // a weird state but it's probably not what a user would expect.
+      // TODO(benvanik): if group count is 0 then don't create the executor.
+      // Today the executor creation will fail with 0 groups so the program
+      // won't get in a weird state but it's probably not what a user would
+      // expect.
 
-    // Create executor with the given topology.
-    status = iree_task_executor_create(options, &topology, host_allocator,
-                                       &executors[i]);
+      // Create executor with the given topology.
+      status = iree_task_executor_create(options, &topology, host_allocator,
+                                         &executors[i]);
 
-    // Executor has consumed the topology and it can be dropped now.
-    iree_task_topology_deinitialize(&topology);
-    if (!iree_status_is_ok(status)) break;
+      // Executor has consumed the topology and it can be dropped now.
+      iree_task_topology_deinitialize(&topology);
+      if (!iree_status_is_ok(status)) break;
+    }
+  } else {
+    for (iree_host_size_t i = 0; i < topology_count; ++i) {
+      // Query topology for the node this executor is pinned to.
+      iree_task_topology_t topology;
+      status = iree_task_topology_initialize_from_logical_cpu_set_string(
+          cpu_ids_list.values[i], &topology);
+      if (!iree_status_is_ok(status)) break;
+
+      // TODO(benvanik): if group count is 0 then don't create the executor.
+      // Today the executor creation will fail with 0 groups so the program
+      // won't get in a weird state but it's probably not what a user would
+      // expect.
+
+      // Create executor with the given topology.
+      status = iree_task_executor_create(options, &topology, host_allocator,
+                                         &executors[i]);
+
+      // Executor has consumed the topology and it can be dropped now.
+      iree_task_topology_deinitialize(&topology);
+      if (!iree_status_is_ok(status)) break;
+    }
   }
 
   if (iree_status_is_ok(status)) {
diff --git a/runtime/src/iree/task/topology.c b/runtime/src/iree/task/topology.c
index 1bf11a1..36bfea2 100644
--- a/runtime/src/iree/task/topology.c
+++ b/runtime/src/iree/task/topology.c
@@ -76,11 +76,25 @@
   return iree_ok_status();
 }
 
+// Fixes constructive_sharing_mask values such that they represent other chosen
+// topology groups instead of processor indices. We do this so that code using
+// the topology groups doesn't need to know anything about which physical
+// processor IDs a particular group is mapped to.
+//
+// This is implemented by platform-specific logic and may be a no-op if the
+// platform doesn't support querying the required cache information.
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
+    iree_task_topology_t* topology);
+
 void iree_task_topology_initialize_from_group_count(
     iree_host_size_t group_count, iree_task_topology_t* out_topology) {
+  // Clamp to the maximum we support.
+  group_count = iree_min(group_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+
   IREE_TRACE_ZONE_BEGIN(z0);
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, group_count);
 
+  // Initialize default groups with no affinities specified.
   iree_task_topology_initialize(out_topology);
   for (iree_host_size_t i = 0; i < group_count; ++i) {
     iree_task_topology_group_t* group = &out_topology->groups[i];
@@ -90,3 +104,64 @@
 
   IREE_TRACE_ZONE_END(z0);
 }
+
+iree_status_t iree_task_topology_initialize_from_thread_affinities(
+    iree_host_size_t group_count,
+    const iree_thread_affinity_t* group_affinities,
+    iree_task_topology_t* out_topology) {
+  // Today we have a fixed limit on the number of groups within a particular
+  // topology.
+  if (group_count >= IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "too many groups specified (%" PRIhsz
+                            " provided for a max capacity of %zu)",
+                            group_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, group_count);
+
+  // Initialize each group with the given affinities.
+  iree_task_topology_initialize(out_topology);
+  for (iree_host_size_t i = 0; i < group_count; ++i) {
+    iree_task_topology_group_t* group = &out_topology->groups[i];
+    iree_task_topology_group_initialize(i, group);
+    group->ideal_thread_affinity = group_affinities[i];
+  }
+  out_topology->group_count = group_count;
+
+  // Try to use platform support to set the constructive sharing masks.
+  // No-op if the platform support is not available.
+  iree_status_t status =
+      iree_task_topology_fixup_constructive_sharing_masks(out_topology);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set_string(
+    iree_string_view_t cpu_id_set, iree_task_topology_t* out_topology) {
+  if (iree_string_view_is_empty(cpu_id_set)) {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "at least one CPU ID must be provided");
+  }
+  iree_host_size_t count = 1;
+  for (iree_host_size_t i = 0; i < cpu_id_set.size; ++i) {
+    if (cpu_id_set.data[i] == ',') ++count;
+  }
+  uint32_t* cpu_ids = (uint32_t*)iree_alloca(count * sizeof(uint32_t));
+  memset(cpu_ids, 0, count * sizeof(uint32_t));
+  iree_host_size_t cpu_count = 0;
+  while (!iree_string_view_is_empty(cpu_id_set)) {
+    iree_string_view_t cpu_id_string = iree_string_view_empty();
+    iree_string_view_split(cpu_id_set, ',', &cpu_id_string, &cpu_id_set);
+    if (!iree_string_view_atoi_uint32(iree_string_view_trim(cpu_id_string),
+                                      &cpu_ids[cpu_count++])) {
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "'%.*s' not a valid CPU ID",
+                              (int)cpu_id_string.size, cpu_id_string.data);
+    }
+  }
+  return iree_task_topology_initialize_from_logical_cpu_set(cpu_count, cpu_ids,
+                                                            out_topology);
+}
diff --git a/runtime/src/iree/task/topology.h b/runtime/src/iree/task/topology.h
index a449e2a..c563be2 100644
--- a/runtime/src/iree/task/topology.h
+++ b/runtime/src/iree/task/topology.h
@@ -152,8 +152,32 @@
 void iree_task_topology_initialize_from_group_count(
     iree_host_size_t group_count, iree_task_topology_t* out_topology);
 
+// Initializes a topology with the given groups each assigned a platform thread
+// affinity. See `iree_thread_affinity_t` for more information about how to
+// properly initialize the thread affinities for each platform.
+iree_status_t iree_task_topology_initialize_from_thread_affinities(
+    iree_host_size_t group_count,
+    const iree_thread_affinity_t* group_affinities,
+    iree_task_topology_t* out_topology);
+
+// Initializes a topology with one group for each logical CPU specified.
+//
+// The logical CPU IDs are in the platform-defined flattened range of
+// [0, total logical processor count), such as those returned by
+// `lscpu --extended`/lstopo or the bit index in cpu_set_t. The same IDs are
+// used for file-based access, e.g. `/sys/devices/system/cpu/cpu<cpu_id>/`.
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+    iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+    iree_task_topology_t* out_topology);
+
+// Initializes a topology with one group for each logical CPU specified in a
+// comma-delimited list.
+// See iree_task_topology_initialize_from_logical_cpu_set for more information.
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set_string(
+    iree_string_view_t cpu_id_set, iree_task_topology_t* out_topology);
+
 // Initializes a topology with one group for each physical core with the given
-// NUMA node ID (usually package or cluster). Up to |max_core_count| physical
+// NUMA |node_id| (usually package or cluster). Up to |max_core_count| physical
 // cores will be selected from the node.
 iree_status_t iree_task_topology_initialize_from_physical_cores(
     iree_task_topology_node_id_t node_id, iree_host_size_t max_core_count,
diff --git a/runtime/src/iree/task/topology_cpuinfo.c b/runtime/src/iree/task/topology_cpuinfo.c
index 2f8198d..4a9dc77 100644
--- a/runtime/src/iree/task/topology_cpuinfo.c
+++ b/runtime/src/iree/task/topology_cpuinfo.c
@@ -18,12 +18,14 @@
     iree_host_size_t max_group_count, iree_task_topology_t* out_topology) {
   IREE_TRACE_ZONE_BEGIN(z0);
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, max_group_count);
+
   // TODO(benvanik): implement our own query... but that seems not so great.
   // For now we default to a single group: if a user wants more then they can
   // either get cpuinfo working for their platform or manually construct the
   // topology themselves.
   iree_host_size_t group_count = 1;
   iree_task_topology_initialize_from_group_count(group_count, out_topology);
+
   IREE_TRACE_ZONE_END(z0);
 }
 
@@ -35,6 +37,50 @@
   return 0;
 }
 
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
+    iree_task_topology_t* topology) {
+  // No-op.
+  return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+    iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+    iree_task_topology_t* out_topology) {
+  // Today we have a fixed limit on the number of groups within a particular
+  // topology.
+  if (cpu_count >= IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "too many CPUs specified (%" PRIhsz
+                            " provided for a max capacity of %zu)",
+                            cpu_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, cpu_count);
+
+  iree_task_topology_initialize(out_topology);
+
+  out_topology->group_count = cpu_count;
+  for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+    iree_task_topology_group_t* group = &out_topology->groups[i];
+    iree_task_topology_group_initialize(i, group);
+    group->processor_index = cpu_ids[i];
+
+    // NOTE: without cpuinfo we can't get SMT and node info but this isn't
+    // really used on Linux today anyway.
+    iree_thread_affinity_t* affinity = &group->ideal_thread_affinity;
+    memset(affinity, 0, sizeof(*affinity));
+    affinity->specified = 1;
+    affinity->id = cpu_ids[i];
+  }
+
+  iree_status_t status =
+      iree_task_topology_fixup_constructive_sharing_masks(out_topology);
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
 iree_status_t iree_task_topology_initialize_from_physical_cores(
     iree_task_topology_node_id_t node_id, iree_host_size_t max_core_count,
     iree_task_topology_t* out_topology) {
@@ -134,6 +180,30 @@
 #endif  // cpuinfo-like platform field
 }
 
+// Populates |out_group| with the information from |processor|.
+static void iree_task_topology_group_initialize_from_processor(
+    uint32_t group_index, const struct cpuinfo_processor* processor,
+    iree_task_topology_group_t* out_group) {
+  iree_task_topology_group_initialize(group_index, out_group);
+  out_group->processor_index = processor->linux_id;
+  iree_task_topology_set_affinity_from_processor(
+      processor, &out_group->ideal_thread_affinity);
+}
+
+// Populates |out_group| with the information from |core|.
+static void iree_task_topology_group_initialize_from_core(
+    uint32_t group_index, const struct cpuinfo_core* core,
+    iree_task_topology_group_t* out_group) {
+  // Guess: always pick the first processor in a core.
+  // When pinning to threads we'll take into account whether the core is SMT
+  // and use all threads anyway so this alignment is just helpful for debugging.
+  uint32_t processor_i = core->processor_start;
+  const struct cpuinfo_processor* processor =
+      cpuinfo_get_processor(processor_i);
+  iree_task_topology_group_initialize_from_processor(group_index, processor,
+                                                     out_group);
+}
+
 // Returns a bitset with all *processors* that share the same |cache|.
 static uint64_t iree_task_topology_calculate_cache_bits(
     const struct cpuinfo_cache* cache) {
@@ -163,30 +233,13 @@
   return mask;
 }
 
-// Populates |our_group| with the information from |core|.
-static void iree_task_topology_group_initialize_from_core(
-    uint32_t group_index, const struct cpuinfo_core* core,
-    iree_task_topology_group_t* out_group) {
-  iree_task_topology_group_initialize(group_index, out_group);
-
-  // Guess: always pick the first processor in a core.
-  // When pinning to threads we'll take into account whether the core is SMT
-  // and use all threads anyway so this alignment is just helpful for debugging.
-  uint32_t processor_i = core->processor_start;
-  out_group->processor_index = processor_i;
-
-  const struct cpuinfo_processor* processor =
-      cpuinfo_get_processor(processor_i);
-  iree_task_topology_set_affinity_from_processor(
-      processor, &out_group->ideal_thread_affinity);
-}
-
-// Fixes constructive_sharing_mask values such that they represent other chosen
-// topology groups instead of processor indices. We do this so that code using
-// the topology groups doesn't need to know anything about which physical
-// processor IDs a particular group is mapped to.
-static void iree_task_topology_fixup_constructive_sharing_masks(
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
     iree_task_topology_t* topology) {
+  if (!iree_task_topology_is_cpuinfo_available()) {
+    // No-op when cpuinfo is unavailable.
+    return iree_ok_status();
+  }
+
   // O(n^2), but n is always <= 64 (and often <= 8).
   for (iree_host_size_t i = 0; i < topology->group_count; ++i) {
     iree_task_topology_group_t* group = &topology->groups[i];
@@ -198,7 +251,6 @@
 
     iree_task_topology_group_mask_t group_mask = 0;
     for (iree_host_size_t j = 0; j < topology->group_count; ++j) {
-      if (i == j) continue;
       const iree_task_topology_group_t* other_group = &topology->groups[j];
       uint64_t group_processor_bits =
           iree_math_rotl_u64(1ull, other_group->processor_index);
@@ -209,6 +261,56 @@
 
     group->constructive_sharing_mask = group_mask;
   }
+
+  return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+    iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+    iree_task_topology_t* out_topology) {
+  // Ensure cpuinfo is available; if not we fall back to unpinned groups.
+  if (!iree_task_topology_is_cpuinfo_available()) {
+    iree_task_topology_initialize_fallback(cpu_count, out_topology);
+    return iree_ok_status();
+  }
+
+  // Today we have a fixed limit on the number of groups within a particular
+  // topology.
+  if (cpu_count >= IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "too many CPUs specified (%" PRIhsz
+                            " provided for a max capacity of %zu)",
+                            cpu_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
+  }
+
+  // Validate the CPU IDs provided.
+  const uint32_t processor_count = cpuinfo_get_processors_count();
+  for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+    if (cpu_ids[i] >= processor_count) {
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "cpu_ids[%" PRIhsz
+          "] %u out of bounds, only %u logical processors available",
+          i, cpu_ids[i], processor_count);
+    }
+  }
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, cpu_count);
+
+  iree_task_topology_initialize(out_topology);
+
+  out_topology->group_count = cpu_count;
+  for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+    const struct cpuinfo_processor* processor =
+        cpuinfo_get_processor(cpu_ids[i]);
+    iree_task_topology_group_initialize_from_processor(
+        i, processor, &out_topology->groups[i]);
+  }
+
+  iree_status_t status =
+      iree_task_topology_fixup_constructive_sharing_masks(out_topology);
+  IREE_TRACE_ZONE_END(z0);
+  return status;
 }
 
 // Returns true if the given |core| passes the filter and should be included.
@@ -237,12 +339,12 @@
 static void iree_task_topology_initialize_from_physical_cores_with_filter(
     iree_task_topology_core_filter_t filter_fn, uintptr_t filter_fn_data,
     iree_host_size_t max_core_count, iree_task_topology_t* out_topology) {
-  max_core_count = iree_min(max_core_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
   if (!iree_task_topology_is_cpuinfo_available()) {
     iree_task_topology_initialize_fallback(max_core_count, out_topology);
     return;
   }
 
+  max_core_count = iree_min(max_core_count, IREE_TASK_TOPOLOGY_GROUP_BIT_COUNT);
   IREE_TRACE_ZONE_BEGIN(z0);
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, max_core_count);
 
diff --git a/runtime/src/iree/task/topology_win32.c b/runtime/src/iree/task/topology_win32.c
index 770bcf9..30f62f9 100644
--- a/runtime/src/iree/task/topology_win32.c
+++ b/runtime/src/iree/task/topology_win32.c
@@ -89,6 +89,192 @@
   }
 }
 
+// Assigns constructive sharing masks to each topology group. These indicate
+// which other topology groups share L3 caches (if any).
+static void
+iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* relationships,
+    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* relationships_end,
+    iree_task_topology_t* topology) {
+  for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = relationships;
+       p < relationships_end;
+       p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
+    if (p->Relationship == RelationCache) {
+      if (p->Cache.Level == 3 &&
+          (p->Cache.Type == CacheUnified || p->Cache.Type == CacheData)) {
+        if (p->Cache.GroupCount == 0) {
+          iree_task_topology_assign_constructive_sharing(topology,
+                                                         p->Cache.GroupMask);
+        } else {
+          for (WORD i = 0; i < p->Cache.GroupCount; ++i) {
+            iree_task_topology_assign_constructive_sharing(
+                topology, p->Cache.GroupMasks[i]);
+          }
+        }
+      }
+    }
+  }
+}
+
+iree_status_t iree_task_topology_fixup_constructive_sharing_masks(
+    iree_task_topology_t* topology) {
+  // Query the total size required for just cache information and allocate
+  // storage for it on the stack - it's generally just a few KB.
+  DWORD cache_relationships_size = 0;
+  if (!GetLogicalProcessorInformationEx(RelationCache, NULL,
+                                        &cache_relationships_size) &&
+      GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+    return iree_make_status(
+        iree_status_code_from_win32_error(GetLastError()),
+        "failed to query logical processor information size (%08X)",
+        GetLastError());
+  }
+  if (cache_relationships_size > 64 * 1024) {
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "logical processor information size overflow (got "
+                            "%u which is large for a stack alloc)",
+                            cache_relationships_size);
+  }
+  SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* cache_relationships =
+      (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)iree_alloca(
+          cache_relationships_size);
+
+  // Query again to populate the storage with cache relationship information.
+  if (!GetLogicalProcessorInformationEx(RelationCache, cache_relationships,
+                                        &cache_relationships_size)) {
+    return iree_make_status(
+        iree_status_code_from_win32_error(GetLastError()),
+        "failed to query logical processor information (%08X)", GetLastError());
+  }
+  SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* cache_relationships_end =
+      (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)
+                                                     cache_relationships +
+                                                 cache_relationships_size);
+
+  // Perform the assignment.
+  iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+      cache_relationships, cache_relationships_end, topology);
+  return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_initialize_from_logical_cpu_set(
+    iree_host_size_t cpu_count, const uint32_t* cpu_ids,
+    iree_task_topology_t* out_topology) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)cpu_count);
+
+  iree_task_topology_initialize(out_topology);
+
+  // Query the total size required for all information and allocate storage for
+  // it on the stack - it's generally just a few KB.
+  DWORD all_relationships_size = 0;
+  if (!GetLogicalProcessorInformationEx(RelationAll, NULL,
+                                        &all_relationships_size) &&
+      GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        iree_status_code_from_win32_error(GetLastError()),
+        "failed to query logical processor information size (%08X)",
+        GetLastError());
+  }
+  if (all_relationships_size > 64 * 1024) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                            "logical processor information size overflow (got "
+                            "%u which is large for a stack alloc)",
+                            all_relationships_size);
+  }
+  SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships =
+      (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)iree_alloca(
+          all_relationships_size);
+
+  // Query again to populate the storage with all relationship information.
+  if (!GetLogicalProcessorInformationEx(RelationAll, all_relationships,
+                                        &all_relationships_size)) {
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(
+        iree_status_code_from_win32_error(GetLastError()),
+        "failed to query logical processor information (%08X)", GetLastError());
+  }
+  SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* all_relationships_end =
+      (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)all_relationships +
+                                                 all_relationships_size);
+
+  // Count up the total number of logical processors (bits in each core group).
+  uint32_t total_processor_count = 0;
+  for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = all_relationships;
+       p < all_relationships_end;
+       p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
+    if (p->Relationship == RelationProcessorCore) {
+      assert(p->Processor.GroupCount == 1);
+      total_processor_count +=
+          iree_task_count_kaffinity_bits(p->Processor.GroupMask[0].Mask);
+    }
+  }
+
+  // Validate the CPU IDs provided and build a lookup table of processors we
+  // have selected. This could be a bitmap but it's not worth the code today.
+  uint8_t* included_processors =
+      (uint8_t*)iree_alloca(total_processor_count * sizeof(uint8_t));
+  memset(included_processors, 0, total_processor_count * sizeof(uint8_t));
+  for (iree_host_size_t i = 0; i < cpu_count; ++i) {
+    if (cpu_ids[i] >= total_processor_count) {
+      IREE_TRACE_ZONE_END(z0);
+      return iree_make_status(
+          IREE_STATUS_OUT_OF_RANGE,
+          "cpu_ids[%" PRIhsz
+          "] %u out of bounds, only %u logical processors available",
+          i, cpu_ids[i], total_processor_count);
+    }
+    included_processors[cpu_ids[i]] = 1;
+  }
+
+  // Build an on-stack table for random access into all logical processors.
+  // This isn't strictly required but makes it easier to walk the CPU table.
+  PROCESSOR_RELATIONSHIP** all_processors =
+      iree_alloca(sizeof(PROCESSOR_RELATIONSHIP*) * total_processor_count);
+  iree_host_size_t global_processor_count = 0;
+  for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = all_relationships;
+       p < all_relationships_end;
+       p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
+    if (p->Relationship != RelationProcessorCore) continue;
+    assert(p->Processor.GroupCount == 1);
+    KAFFINITY mask = p->Processor.GroupMask[0].Mask;
+    int group_offset = 0;
+    while (mask) {
+      int bit_offset = iree_task_count_trailing_zeros_kaffinity(mask);
+      mask = iree_shr(mask, bit_offset + 1);  // shift may equal bit width
+      iree_host_size_t global_processor_index = global_processor_count++;
+      if (included_processors[global_processor_index]) {
+        // Setup the group for the processor.
+        uint8_t group_index = (uint8_t)out_topology->group_count++;
+        iree_task_topology_group_t* group = &out_topology->groups[group_index];
+        iree_task_topology_group_initialize(group_index, group);
+        group->processor_index = (uint32_t)global_processor_index;
+        group->constructive_sharing_mask = 0;  // set below
+
+        // Pin group to the processor.
+        iree_thread_affinity_t* affinity = &group->ideal_thread_affinity;
+        memset(affinity, 0, sizeof(*affinity));
+        affinity->specified = 1;
+        affinity->smt = (p->Processor.Flags & LTP_PC_SMT) == LTP_PC_SMT;
+        affinity->group = p->Processor.GroupMask[0].Group;
+        affinity->id = group_offset + bit_offset;
+      }
+      group_offset += bit_offset + 1;
+      if (out_topology->group_count >= cpu_count) break;
+    }
+    if (out_topology->group_count >= cpu_count) break;
+  }
+
+  // Assign constructive sharing masks to each topology group.
+  iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+      all_relationships, all_relationships_end, out_topology);
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
 iree_status_t iree_task_topology_initialize_from_physical_cores(
     iree_task_topology_node_id_t node_id, iree_host_size_t max_core_count,
     iree_task_topology_t* out_topology) {
@@ -258,26 +444,9 @@
         all_cores[adjusted_core_index], &group->ideal_thread_affinity);
   }
 
-  // Assign constructive sharing masks to each topology group. These indicate
-  // which other topology groups share L3 caches (if any).
-  for (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* p = all_relationships;
-       p < all_relationships_end;
-       p = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)((uintptr_t)p + p->Size)) {
-    if (p->Relationship == RelationCache) {
-      if (p->Cache.Level == 3 &&
-          (p->Cache.Type == CacheUnified || p->Cache.Type == CacheData)) {
-        if (p->Cache.GroupCount == 0) {
-          iree_task_topology_assign_constructive_sharing(out_topology,
-                                                         p->Cache.GroupMask);
-        } else {
-          for (WORD i = 0; i < p->Cache.GroupCount; ++i) {
-            iree_task_topology_assign_constructive_sharing(
-                out_topology, p->Cache.GroupMasks[i]);
-          }
-        }
-      }
-    }
-  }
+  // Assign constructive sharing masks to each topology group.
+  iree_task_topology_fixup_constructive_sharing_masks_from_relationships(
+      all_relationships, all_relationships_end, out_topology);
 
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();