Adding an entire RTOS-scale task scheduling system.
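For context during review, a minimal usage sketch of the new API (not part of the diff): only the iree_task_executor_* entry points below appear in this change; the topology helpers are assumed from iree/task/topology.h and their exact signatures may differ.

// Hedged sketch: create an executor from the machine topology and release it.
#include "iree/task/executor.h"
#include "iree/task/topology.h"

static iree_status_t example_create_executor(iree_allocator_t allocator) {
  // Build a topology with one group per physical core (assumed helper).
  iree_task_topology_t topology;
  iree_task_topology_initialize_from_physical_cores(/*max_core_count=*/8,
                                                    &topology);

  // Workers are created suspended and only wake on first submission.
  iree_task_executor_t* executor = NULL;
  iree_status_t status = iree_task_executor_create(
      IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP, &topology, allocator,
      &executor);
  iree_task_topology_deinitialize(&topology);  // only needed during creation
  if (!iree_status_is_ok(status)) return status;

  // ... build an iree_task_submission_t, iree_task_executor_submit(), and
  // iree_task_executor_flush() as described in iree/task/executor.h ...

  iree_task_executor_release(executor);
  return iree_ok_status();
}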
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9f3888..98c1268 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -499,6 +499,7 @@
add_subdirectory(iree/hal)
add_subdirectory(iree/modules)
add_subdirectory(iree/schemas)
+add_subdirectory(iree/task)
add_subdirectory(iree/testing)
add_subdirectory(iree/test)
diff --git a/iree/base/debugging.h b/iree/base/debugging.h
index d6cb377..538ba08 100644
--- a/iree/base/debugging.h
+++ b/iree/base/debugging.h
@@ -12,10 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// NOTE: the best kind of synchronization is no synchronization; always try to
-// design your algorithm so that you don't need anything from this file :)
-// See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html
-
#ifndef IREE_BASE_DEBUGGING_H_
#define IREE_BASE_DEBUGGING_H_
diff --git a/iree/task/BUILD b/iree/task/BUILD
new file mode 100644
index 0000000..aa31386
--- /dev/null
+++ b/iree/task/BUILD
@@ -0,0 +1,133 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+ licenses = ["notice"], # Apache 2.0
+)
+
+cc_library(
+ name = "task",
+ srcs = [
+ "executor.c",
+ "executor_impl.h",
+ "list.c",
+ "pool.c",
+ "post_batch.c",
+ "post_batch.h",
+ "queue.c",
+ "scope.c",
+ "submission.c",
+ "task.c",
+ "task_impl.h",
+ "topology.c",
+ "worker.c",
+ "worker.h",
+ ],
+ hdrs = [
+ "affinity_set.h",
+ "executor.h",
+ "list.h",
+ "pool.h",
+ "queue.h",
+ "scope.h",
+ "submission.h",
+ "task.h",
+ "topology.h",
+ "tuning.h",
+ ],
+ deps = [
+ "//iree/base:api",
+ "//iree/base:atomic_slist",
+ "//iree/base:core_headers",
+ "//iree/base:synchronization",
+ "//iree/base:threading",
+ "//iree/base:tracing",
+ "//iree/base:wait_handle",
+ "@com_github_pytorch_cpuinfo//:cpuinfo",
+ ],
+)
+
+cc_test(
+ name = "executor_test",
+ srcs = ["executor_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/base:core_headers",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "list_test",
+ srcs = ["list_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "pool_test",
+ srcs = ["pool_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "queue_test",
+ srcs = ["queue_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "scope_test",
+ srcs = ["scope_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "topology_test",
+ srcs = ["topology_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
diff --git a/iree/task/CMakeLists.txt b/iree/task/CMakeLists.txt
new file mode 100644
index 0000000..f365ccf
--- /dev/null
+++ b/iree/task/CMakeLists.txt
@@ -0,0 +1,134 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+ NAME
+ task
+ HDRS
+ "affinity_set.h"
+ "executor.h"
+ "list.h"
+ "pool.h"
+ "queue.h"
+ "scope.h"
+ "submission.h"
+ "task.h"
+ "topology.h"
+ "tuning.h"
+ SRCS
+ "executor.c"
+ "executor_impl.h"
+ "list.c"
+ "pool.c"
+ "post_batch.c"
+ "post_batch.h"
+ "queue.c"
+ "scope.c"
+ "submission.c"
+ "task.c"
+ "task_impl.h"
+ "topology.c"
+ "worker.c"
+ "worker.h"
+ DEPS
+ cpuinfo
+ iree::base::api
+ iree::base::atomic_slist
+ iree::base::core_headers
+ iree::base::synchronization
+ iree::base::threading
+ iree::base::tracing
+ iree::base::wait_handle
+ PUBLIC
+)
+
+iree_cc_test(
+ NAME
+ executor_test
+ SRCS
+ "executor_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::base::core_headers
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ list_test
+ SRCS
+ "list_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ pool_test
+ SRCS
+ "pool_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ queue_test
+ SRCS
+ "queue_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ scope_test
+ SRCS
+ "scope_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ topology_test
+ SRCS
+ "topology_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
diff --git a/iree/task/affinity_set.h b/iree/task/affinity_set.h
new file mode 100644
index 0000000..5e8dee1
--- /dev/null
+++ b/iree/task/affinity_set.h
@@ -0,0 +1,93 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_AFFINITY_SET_H_
+#define IREE_TASK_AFFINITY_SET_H_
+
+#include "iree/base/atomics.h"
+#include "iree/base/math.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// TODO(benvanik): if IREE_TASK_EXECUTOR_MAX_WORKER_COUNT <= 32 then switch
+// these to using the 32-bit primitives. No real effect on larger 64-bit systems
+// but if we were on a smaller 32-bit system with 2 cores it's kind of silly to
+// be doing expensive 64-bit atomics on a 32-bit bus all for just 2 bits of
+// data :)
+
+//===----------------------------------------------------------------------===//
+// iree_task_affinity_set_t
+//===----------------------------------------------------------------------===//
+
+typedef uint64_t iree_task_affinity_set_t;
+
+// Allows for only a specific worker to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_worker(
+ uint8_t worker_index) {
+ return 1ull << worker_index;
+}
+
+// Allows for a range of workers to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_worker_range(
+ uint8_t worker_start, uint8_t worker_end) {
+ return ((1ull << (worker_start - 1)) - 1) ^ ((1ull << worker_end) - 1);
+}
+
+// Allows for any worker to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_any_worker() {
+ return UINT64_MAX;
+}
+
+#define iree_task_affinity_set_count_trailing_zeros \
+ iree_math_count_trailing_zeros_u64
+#define iree_task_affinity_set_count_ones iree_math_count_ones_u64
+#define iree_task_affinity_set_rotr iree_math_rotr_u64
+
+//===----------------------------------------------------------------------===//
+// iree_atomic_task_affinity_set_t
+//===----------------------------------------------------------------------===//
+
+typedef iree_atomic_int64_t iree_atomic_task_affinity_set_t;
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_load(
+ iree_atomic_task_affinity_set_t* set, iree_memory_order_t order) {
+ return iree_atomic_load_int64(set, order);
+}
+
+static inline void iree_atomic_task_affinity_set_store(
+ iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+ iree_memory_order_t order) {
+ iree_atomic_store_int64(set, value, order);
+}
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_fetch_and(
+ iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+ iree_memory_order_t order) {
+ return iree_atomic_fetch_and_int64(set, value, order);
+}
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_fetch_or(
+ iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+ iree_memory_order_t order) {
+ return iree_atomic_fetch_or_int64(set, value, order);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_AFFINITY_SET_H_
diff --git a/iree/task/executor.c b/iree/task/executor.c
new file mode 100644
index 0000000..b4af62e
--- /dev/null
+++ b/iree/task/executor.c
@@ -0,0 +1,725 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/executor.h"
+
+#include "iree/base/debugging.h"
+#include "iree/base/math.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/task_impl.h"
+
+static void iree_task_executor_destroy(iree_task_executor_t* executor);
+
+iree_status_t iree_task_executor_create(
+ iree_task_scheduling_mode_t scheduling_mode,
+ const iree_task_topology_t* topology, iree_allocator_t allocator,
+ iree_task_executor_t** out_executor) {
+ iree_host_size_t worker_count = iree_task_topology_group_count(topology);
+ if (worker_count > IREE_TASK_EXECUTOR_MAX_WORKER_COUNT) {
+ return iree_make_status(
+ IREE_STATUS_RESOURCE_EXHAUSTED,
+ "requested %zu workers but a maximum of %d is allowed", worker_count,
+ IREE_TASK_EXECUTOR_MAX_WORKER_COUNT);
+ }
+
+ // TODO(benvanik): support a threadless mode where we have one dummy worker
+ // that just holds the lists but is pumped from donate_caller.
+ if (worker_count == 0) {
+ return iree_make_status(
+ IREE_STATUS_UNIMPLEMENTED,
+ "threadless donate-only executor mode not yet implemented");
+ }
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_ASSERT_ARGUMENT(out_executor);
+ *out_executor = NULL;
+
+ iree_host_size_t executor_size =
+ sizeof(iree_task_executor_t) + worker_count * sizeof(iree_task_worker_t);
+
+ iree_task_executor_t* executor = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_allocator_malloc(allocator, executor_size, (void**)&executor));
+ memset(executor, 0, executor_size);
+ iree_atomic_ref_count_init(&executor->ref_count);
+ executor->allocator = allocator;
+ executor->scheduling_mode = scheduling_mode;
+ iree_atomic_task_slist_initialize(&executor->incoming_ready_slist);
+ iree_atomic_task_slist_initialize(&executor->incoming_waiting_slist);
+ iree_slim_mutex_initialize(&executor->coordinator_mutex);
+ iree_slim_mutex_initialize(&executor->wait_mutex);
+
+ // Simple PRNG used to generate seeds for the per-worker PRNGs used to
+ // distribute work. This isn't strong (and doesn't need to be); it's just
+ // enough to ensure each worker gets a sufficiently random seed for itself to
+ // then generate entropy with. As a hack we use out_executor's address, as
+ // that should live on the caller stack and with ASLR that's likely pretty
+ // random itself. I'm sure somewhere a mathematician just cringed :)
+ iree_prng_splitmix64_state_t seed_prng;
+ iree_prng_splitmix64_initialize(/*seed=*/(uint64_t)(out_executor),
+ &seed_prng);
+ iree_prng_minilcg128_initialize(iree_prng_splitmix64_next(&seed_prng),
+ &executor->donation_theft_prng);
+
+ iree_status_t status = iree_ok_status();
+
+ // Wait set used to batch syscalls for polling/waiting on wait handles.
+ // This is currently limited to a relatively small max to make bad behavior
+ // clearer with nice RESOURCE_EXHAUSTED errors.
+ if (iree_status_is_ok(status)) {
+ status = iree_wait_set_allocate(IREE_TASK_EXECUTOR_MAX_OUTSTANDING_WAITS,
+ allocator, &executor->wait_set);
+ }
+
+ // Pool used for all dispatch->slice fanout tasks. These only live within the
+ // executor and since we know the precise lifetime of them we can keep them
+ // entirely within the system here.
+ if (iree_status_is_ok(status)) {
+ status = iree_task_pool_initialize(
+ allocator, sizeof(iree_task_dispatch_slice_t),
+ worker_count * IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER,
+ &executor->slice_task_pool);
+ }
+ if (iree_status_is_ok(status)) {
+ status = iree_task_pool_initialize(
+ allocator, sizeof(iree_task_dispatch_shard_t),
+ worker_count * IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,
+ &executor->shard_task_pool);
+ }
+
+ // Bring up the workers; the threads will be created here but be suspended
+ // (if the platform supports it) awaiting the first tasks getting scheduled.
+ if (iree_status_is_ok(status)) {
+ executor->worker_count = worker_count;
+ executor->workers = (iree_task_worker_t*)(executor + 1);
+ iree_task_affinity_set_t worker_idle_mask = 0;
+ iree_task_affinity_set_t worker_live_mask = 0;
+ iree_task_affinity_set_t worker_suspend_mask = 0;
+ for (iree_host_size_t i = 0; i < worker_count; ++i) {
+ iree_task_affinity_set_t worker_bit = iree_task_affinity_for_worker(i);
+ worker_idle_mask |= worker_bit;
+ worker_live_mask |= worker_bit;
+ if (executor->scheduling_mode &
+ IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP) {
+ worker_suspend_mask |= worker_bit;
+ }
+
+ iree_task_worker_t* worker = &executor->workers[i];
+ status = iree_task_worker_initialize(
+ executor, i, iree_task_topology_get_group(topology, i), &seed_prng,
+ worker);
+ if (!iree_status_is_ok(status)) break;
+ }
+ iree_atomic_task_affinity_set_store(&executor->worker_live_mask,
+ worker_live_mask,
+ iree_memory_order_relaxed);
+ iree_atomic_task_affinity_set_store(&executor->worker_suspend_mask,
+ worker_suspend_mask,
+ iree_memory_order_relaxed);
+ iree_atomic_task_affinity_set_store(&executor->worker_idle_mask,
+ worker_idle_mask,
+ iree_memory_order_relaxed);
+ }
+
+ if (!iree_status_is_ok(status)) {
+ // NOTE: destroy will ensure that any workers we have initialized are
+ // properly cleaned up.
+ iree_task_executor_destroy(executor);
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+ }
+
+ *out_executor = executor;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+static void iree_task_executor_destroy(iree_task_executor_t* executor) {
+ if (!executor) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // First ask all workers to exit. We do this prior to waiting on them to exit
+ // so that we parallelize the shutdown logic (which may flush pending tasks).
+ for (iree_host_size_t i = 0; i < executor->worker_count; ++i) {
+ iree_task_worker_t* worker = &executor->workers[i];
+ iree_task_worker_request_exit(worker);
+ }
+
+ // Now that all workers should be in the process of exiting we can join with
+ // them. Some may take longer than others to exit but that's fine as we can't
+ // return from here until they do anyway.
+ for (iree_host_size_t i = 0; i < executor->worker_count; ++i) {
+ iree_task_worker_t* worker = &executor->workers[i];
+ iree_task_worker_deinitialize(worker);
+ }
+
+ iree_wait_set_free(executor->wait_set);
+ iree_slim_mutex_deinitialize(&executor->wait_mutex);
+ iree_slim_mutex_deinitialize(&executor->coordinator_mutex);
+ iree_atomic_task_slist_deinitialize(&executor->incoming_ready_slist);
+ iree_atomic_task_slist_deinitialize(&executor->incoming_waiting_slist);
+ iree_task_pool_deinitialize(&executor->slice_task_pool);
+ iree_task_pool_deinitialize(&executor->shard_task_pool);
+ iree_allocator_free(executor->allocator, executor);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_executor_retain(iree_task_executor_t* executor) {
+ if (executor) {
+ iree_atomic_ref_count_inc(&executor->ref_count);
+ }
+}
+
+void iree_task_executor_release(iree_task_executor_t* executor) {
+ if (executor && iree_atomic_ref_count_dec(&executor->ref_count) == 1) {
+ iree_task_executor_destroy(executor);
+ }
+}
+
+// Schedules a generic task to a worker matching its affinity.
+// The task will be posted to the worker mailbox and available for the worker to
+// begin processing as soon as the |post_batch| is submitted.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_relay_to_worker(
+ iree_task_executor_t* executor, iree_task_post_batch_t* post_batch,
+ iree_task_t* task) {
+ iree_host_size_t worker_index =
+ iree_task_post_batch_select_worker(post_batch, task->affinity_set);
+ iree_task_post_batch_enqueue(post_batch, worker_index, task);
+}
+
+// Schedules all ready tasks in the |pending_submission| list.
+// Tasks may enqueue zero or more new tasks (or newly-ready/waiting tasks) to
+// |pending_submission| or queue work for posting to workers via the
+// |post_batch|.
+//
+// NOTE: the pending submission list we walk here is in FIFO order and the
+// post batch we are building is in LIFO; this means that as we pop off the
+// least recently added tasks from the submission (nice in-order traversal) we
+// are pushing them as what will become the least recent tasks in the batch.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_executor_schedule_ready_tasks(
+ iree_task_executor_t* executor, iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch) {
+ if (iree_task_list_is_empty(&pending_submission->ready_list)) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_task_t* task = NULL;
+ while ((task = iree_task_list_pop_front(&pending_submission->ready_list))) {
+ switch (task->type) {
+ case IREE_TASK_TYPE_NOP:
+ case IREE_TASK_TYPE_CALL:
+ case IREE_TASK_TYPE_DISPATCH_SLICE: {
+ // Generic routing to workers for tasks that should always run there.
+ iree_task_executor_relay_to_worker(executor, post_batch, task);
+ break;
+ }
+ case IREE_TASK_TYPE_BARRIER: {
+ // Retire the barrier to (possibly) ready up all dependent tasks.
+ // This acts as a fan-out in cases where the dependent task count >1.
+ iree_task_barrier_retire((iree_task_barrier_t*)task,
+ pending_submission);
+ break;
+ }
+ case IREE_TASK_TYPE_FENCE: {
+ // Scope fence hit; notifies the scope so that anyone waiting on the
+ // fence can be notified without us having to do so explicitly.
+ iree_task_fence_retire((iree_task_fence_t*)task, pending_submission);
+ break;
+ }
+ case IREE_TASK_TYPE_WAIT: {
+ // Waits may need to be moved into the wait list (not completed) or
+ // retired (after the wait condition is met).
+ if (task->flags & IREE_TASK_FLAG_WAIT_COMPLETED) {
+ iree_task_wait_retire((iree_task_wait_t*)task, pending_submission);
+ } else {
+ iree_task_submission_enqueue(pending_submission, task);
+ }
+ break;
+ }
+ case IREE_TASK_TYPE_DISPATCH: {
+ // Dispatches may need to be issued (fanning out the tiles to workers)
+ // or retired (after all tiles have completed).
+ if (task->flags & IREE_TASK_FLAG_DISPATCH_RETIRE) {
+ iree_task_dispatch_retire((iree_task_dispatch_t*)task,
+ pending_submission);
+ } else {
+ if (task->flags & IREE_TASK_FLAG_DISPATCH_SLICED) {
+ iree_task_dispatch_issue_sliced((iree_task_dispatch_t*)task,
+ &executor->slice_task_pool,
+ pending_submission, post_batch);
+ } else {
+ iree_task_dispatch_issue_sharded((iree_task_dispatch_t*)task,
+ &executor->shard_task_pool,
+ pending_submission, post_batch);
+ }
+ }
+ break;
+ }
+ }
+ }
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_executor_merge_submission(iree_task_executor_t* executor,
+ iree_task_submission_t* submission) {
+ // Concatenate all of the submission's tasks onto the executor's incoming lists.
+ // Note that the submission stores tasks in LIFO order such that when they are
+ // put into the LIFO atomic slist they match the order across all concats
+ // (earlier concats are later in the LIFO list).
+ iree_atomic_task_slist_concat(&executor->incoming_ready_slist,
+ submission->ready_list.head,
+ submission->ready_list.tail);
+ iree_atomic_task_slist_concat(&executor->incoming_waiting_slist,
+ submission->waiting_list.head,
+ submission->waiting_list.tail);
+
+ // NOTE: after concatenating the intrusive next_task pointers may immediately
+ // be modified by other threads. We can no longer assume anything about the
+ // submission lists and can only discard them.
+ iree_task_submission_reset(submission);
+}
+
+iree_status_t iree_task_executor_submit(iree_task_executor_t* executor,
+ iree_task_submission_t* submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Concatenate the submitted tasks onto our primary LIFO incoming lists.
+ iree_task_executor_merge_submission(executor, submission);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_executor_flush(iree_task_executor_t* executor) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Mostly a no-op today as we aren't deferring submission with the scheduling
+ // mode. Instead, we'll just run the coordinator inline to ensure all tasks
+ // are pushed to workers.
+ iree_task_executor_coordinate(executor, /*current_worker=*/NULL,
+ /*speculative=*/false);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+// Merges incoming likely-unresolved wait tasks into the primary executor lists.
+// The handle of each task will be inserted into the wait_set (where it may be
+// a duplicate).
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_merge_wait_list(
+ iree_task_executor_t* executor, iree_task_list_t* incoming_waiting_list) {
+ if (iree_task_list_is_empty(incoming_waiting_list)) return;
+
+ iree_slim_mutex_lock(&executor->wait_mutex);
+
+ // Walk the list of incoming wait tasks and add them to our wait_set.
+ iree_task_wait_t* wait_task =
+ (iree_task_wait_t*)iree_task_list_front(incoming_waiting_list);
+ do {
+ iree_status_t status =
+ iree_wait_set_insert(executor->wait_set, wait_task->wait_handle);
+ // TODO(#4026): propagate failure to the task scope.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+ wait_task = (iree_task_wait_t*)wait_task->header.next_task;
+ } while (wait_task);
+
+ iree_slim_mutex_unlock(&executor->wait_mutex);
+
+ // Add (in undefined order) to the primary wait list used for tracking the
+ // root wait tasks until they are ready.
+ iree_task_list_append(&executor->waiting_list, incoming_waiting_list);
+}
+
+// Finds the waiting task corresponding to |wake_handle| and retires it.
+// Any dependent tasks will be enqueued in the |pending_submission| for issuing.
+// If multiple tasks were waiting on the same wait handle all will be readied.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+// The wait lock must be held as the wait_set is modified.
+static void iree_task_executor_wake_waiting_task(
+ iree_task_executor_t* executor, iree_wait_handle_t wake_handle,
+ iree_task_submission_t* pending_submission) {
+ // Walk through the waiting_list and find all waits with this handle.
+ // Some may not have resolved yet and need to remain in the list.
+ iree_task_t* prev_task = NULL;
+ iree_task_t* task = iree_task_list_front(&executor->waiting_list);
+ while (task != NULL) {
+ iree_task_t* next_task = task->next_task;
+ iree_task_wait_t* wait_task = (iree_task_wait_t*)task;
+ if (wake_handle.type == wait_task->wait_handle.type &&
+ memcmp(&wake_handle.value, &wait_task->wait_handle.value,
+ sizeof(wake_handle.value)) == 0) {
+ // Found one of possibly many. If its condition is met then remove from
+ // the wait set and ready up.
+ if (iree_task_wait_check_condition(wait_task)) {
+ iree_wait_set_erase(executor->wait_set, wake_handle);
+ iree_task_list_erase(&executor->waiting_list, prev_task, task);
+ iree_task_submission_enqueue(pending_submission, task);
+ task = prev_task;
+ }
+ }
+ prev_task = task;
+ task = next_task;
+ }
+}
+
+// Polls all waiting tasks to see if they have completed and adds any newly
+// ready dependencies to |pending_submission|.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_poll_waiting_tasks(
+ iree_task_executor_t* executor,
+ iree_task_submission_t* pending_submission) {
+ if (iree_task_list_is_empty(&executor->waiting_list)) return;
+
+ // Hold the wait lock for the duration we use the wait_set.
+ if (!iree_slim_mutex_try_lock(&executor->wait_mutex)) {
+ return;
+ }
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Poll all root waiting tasks (infinite-past duration) to see if any have
+ // completed. If one or more have resolved then wake_handle will contain an
+ // unspecified wake handle.
+ int woken_tasks = 0;
+ do {
+ iree_wait_handle_t wake_handle;
+ iree_status_t status = iree_wait_any(executor->wait_set,
+ IREE_TIME_INFINITE_PAST, &wake_handle);
+ if (iree_status_is_ok(status)) {
+ // One or more waiters are ready. We don't support multi-wake right now so
+ // we'll just take the one we got back and try again.
+ iree_task_executor_wake_waiting_task(executor, wake_handle,
+ pending_submission);
+ ++woken_tasks;
+ continue;
+ } else if (iree_status_is_deadline_exceeded(status)) {
+ // Indicates nothing was woken. Gracefully bail for now.
+ break;
+ } else {
+ // (Spurious?) error during poll.
+ // TODO(#4026): propagate failure to all scopes involved.
+ // It may be ok to ignore when polling as the eventual wait will handle
+ // the full propagation. For now we assert so it's easy to see if we have
+ // tried to perform a bad iree_wait_any.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+ break;
+ }
+ } while (!iree_task_list_is_empty(&executor->waiting_list));
+
+ iree_slim_mutex_unlock(&executor->wait_mutex);
+
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, woken_tasks);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Waits for one or more waiting tasks to be ready to execute.
+// If a wait task retires, any newly-ready tasks will be added to
+// |pending_submission|.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_wait_any_task(
+ iree_task_executor_t* executor, iree_task_worker_t* current_worker,
+ iree_task_submission_t* pending_submission) {
+ if (iree_task_list_is_empty(&executor->waiting_list)) return;
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_slim_mutex_unlock(&executor->coordinator_mutex);
+
+ // We can't hold the coordinator lock during the wait but also need to ensure
+ // no other coordination messes with the wait set. We have a dedicated wait
+ // mutex and guard wait-set accesses (polling/waiting/etc) with that. Polls
+ // may try-lock and bail if the lock is held indicating that someone else has
+ // a non-polling wait active.
+
+ // TODO(benvanik): ensure coordinator wake semantics are modeled:
+ // - donator:
+ // attempt 0:
+ // try steal
+ // if fail to steal: coordinate
+ // attempt 1:
+ // try steal
+ // if fail to steal: await any-posted notification?
+ // - worker:
+ // attempt 0:
+ // try steal
+ // if fail to steal: coordinate
+
+ iree_slim_mutex_lock(&executor->wait_mutex);
+
+ iree_time_t deadline_ns = IREE_TIME_INFINITE_FUTURE;
+ iree_wait_handle_t wake_handle;
+ iree_status_t status =
+ iree_wait_any(executor->wait_set, deadline_ns, &wake_handle);
+
+ iree_slim_mutex_unlock(&executor->wait_mutex);
+
+ // TODO(#4026): propagate failure to all scopes involved.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+
+ iree_slim_mutex_lock(&executor->coordinator_mutex);
+
+ int woken_tasks = 0;
+ if (iree_status_is_ok(status)) {
+ // One or more waiters are ready. We don't support multi-wake right now so
+ // we'll just take the one we got back and try again.
+ iree_task_executor_wake_waiting_task(executor, wake_handle,
+ pending_submission);
+ ++woken_tasks;
+ } else if (iree_status_is_deadline_exceeded(status)) {
+ // Indicates nothing was woken. Gracefully bail and return to the
+ // coordinator to see if we should wait again.
+ } else {
+ // (Spurious?) error during wait.
+ // TODO(#4026): propagate failure to all scopes involved.
+ // Failures during waits are serious: ignoring them could lead to live-lock
+ // as tasks further in the pipeline expect them to have completed or - even
+ // worse - user code/other processes/drivers/etc may expect them to
+ // complete.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+ }
+
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, woken_tasks);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Dispatches tasks in the global submission queue to workers.
+// This is called by users upon submission of new tasks or by workers when they
+// run out of tasks to process. |speculative| indicates whether the coordination
+// request is done as a fallback in the event of there possibly being new work
+// available.
+//
+// If a coordination run ends up with no ready tasks and one or more waiting
+// tasks then the coordinator will wait for one of the tasks to become ready.
+// This only happens in the speculative case (so it's always a worker) as in
+// those cases the next step for the worker would have been to wait anyway. In
+// the non-speculative case the coordinator polls the wait handles to see if
+// they have resolved instead, possibly readying more tasks immediately.
+void iree_task_executor_coordinate(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ bool speculative) {
+ if (speculative) {
+ if (!iree_slim_mutex_try_lock(&executor->coordinator_mutex)) {
+ // Another thread is already holding the coordination lock.
+ // Return to the caller to wait for it to finish.
+ // TODO(benvanik): spin here if it's likely we'll have work after the
+ // other coordinator finishes - that way we don't enter the wait.
+ return;
+ }
+ } else {
+ iree_slim_mutex_lock(&executor->coordinator_mutex);
+ }
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Check for incoming submissions and move their posted tasks into our
+ // local lists. Any of the tasks here are ready to execute immediately and
+ // ones we should be able to distribute to workers without delay. The
+ // waiting tasks are to the best of the caller's knowledge not ready yet.
+ //
+ // Note that we only do this once per coordination; that's so we don't
+ // starve if submissions come in faster than we can schedule them.
+ // Coordination will run again when workers become idle and will pick up
+ // any changes then.
+ //
+ // As we schedule tasks we may spawn new ones (like a dispatch -> many
+ // dispatch slices) and we keep track of those here. By doing a pass through
+ // all ready tasks and only then merging in the new submission we get
+ // breadth-first traversal of task graphs even if they originate from
+ // various places and have no relation - hopefully leading to better average
+ // latency.
+ iree_task_submission_t pending_submission;
+ iree_task_submission_initialize_from_lifo_slist(
+ &executor->incoming_ready_slist, &pending_submission);
+ iree_task_list_append_from_fifo_slist(&pending_submission.waiting_list,
+ &executor->incoming_waiting_slist);
+
+ // Scratch coordinator submission batch used during scheduling to batch up
+ // all tasks that will be posted to each worker. We could stash this on the
+ // executor, but since which thread plays the coordinator role is effectively
+ // random it's better to ensure that these bytes never incur a cache miss by
+ // keeping them here on the stack of the chosen thread.
+ iree_task_post_batch_t* post_batch =
+ iree_alloca(sizeof(iree_task_post_batch_t) +
+ executor->worker_count * sizeof(iree_task_list_t));
+ iree_task_post_batch_initialize(executor, current_worker, post_batch);
+
+ // Poll the waiting tasks to see if any have resolved. This dramatically
+ // cuts latency in cases where the wait handle completes prior to us
+ // entering the real wait. When we have semaphores sequencing back-to-back
+ // work this ensures that we pack in future dispatch work earlier vs.
+ // waiting for a full thread hop.
+ //
+ // If any waits have resolved then they'll be moved to the ready list here
+ // and then get processed FIFO with the tasks that were ready in the
+ // request.
+ iree_task_executor_poll_waiting_tasks(executor, &pending_submission);
+
+ // Schedule all ready tasks in this batch. Some may complete inline (such
+ // as ready barriers with all their dependencies resolved) while others may
+ // be scheduled on workers via the post batch.
+ iree_task_executor_schedule_ready_tasks(executor, &pending_submission,
+ post_batch);
+
+ // Merge any newly waiting tasks into the global wait list.
+ iree_task_executor_merge_wait_list(executor,
+ &pending_submission.waiting_list);
+
+ // Post all new work to workers; they may wake and begin executing
+ // immediately. Returns whether this worker has new tasks for it to work on.
+ bool did_post = iree_task_post_batch_submit(post_batch);
+ if (!did_post && speculative) {
+ // No work was found; wait on one or more of our wait handles.
+ // This will block the calling thread but that's fine as they were going
+ // to wait anyway and were just speculatively seeing if there was work first
+ // by requesting coordination. If work completes here we'll catch it on
+ // the poll next loop around.
+ iree_task_executor_wait_any_task(executor, current_worker,
+ &pending_submission);
+ }
+
+ // Merge any new work into the submission list for future coordinators to
+ // deal with - we don't want the possibility of starvation by looping on this.
+ iree_task_executor_merge_submission(executor, &pending_submission);
+
+ iree_slim_mutex_unlock(&executor->coordinator_mutex);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_task_t* iree_task_executor_try_steal_task_from_affinity_set(
+ iree_task_executor_t* executor, iree_task_affinity_set_t victim_mask,
+ uint32_t max_theft_attempts, int rotation_offset,
+ iree_task_queue_t* local_task_queue) {
+ if (!victim_mask) return NULL;
+ max_theft_attempts = iree_min(max_theft_attempts,
+ iree_task_affinity_set_count_ones(victim_mask));
+ victim_mask = iree_task_affinity_set_rotr(victim_mask, rotation_offset);
+
+ int worker_index = rotation_offset;
+ iree_task_affinity_set_t mask =
+ iree_task_affinity_set_rotr(victim_mask, worker_index);
+ for (uint32_t i = 0; i < max_theft_attempts; ++i) {
+ // Find the last set bit and skip to it. This avoids the need for doing
+ // a full O(n) scan and instead gets us at O(popcnt) * O(ctz).
+ //
+ // Example: sharing mask = 0b01010101
+ // mask_rotation = 3 (randomly selected)
+ // mask = 0b01010101 rotr 3 = 0b10101010
+ // for (i = 0; i < 4; ++i)
+ // offset = ctz(0b10101010) = 1
+ // mask_rotation += 1 = 4
+ // mask >>= 1 = 0b01010101
+ // victim_index = 4 % 64 = 4
+ int offset = iree_task_affinity_set_count_trailing_zeros(mask);
+ int victim_index = (worker_index + offset) % executor->worker_count;
+ worker_index += offset + 1;
+ mask = mask >> (offset + 1);
+ iree_task_worker_t* victim_worker = &executor->workers[victim_index];
+
+ // Policy: steal a chunk of tasks at the tail of the victim queue.
+ // This will steal multiple tasks from the victim up to the specified max
+ // and move them into our local task queue. Not all tasks will be stolen
+ // and the assumption is that over a large-enough random distribution of
+ // thievery taking ~half of the tasks each time (across all queues) will
+ // lead to a relatively even distribution.
+ iree_task_t* task = iree_task_worker_try_steal_task(
+ victim_worker, local_task_queue,
+ /*max_tasks=*/IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT);
+ if (task) return task;
+ }
+
+ // No tasks found in victim_mask.
+ return NULL;
+}
+
+// Tries to steal an entire task from a sibling worker (based on topology).
+// Returns a task that is available (has not yet begun processing at all).
+// May steal multiple tasks and add them to the |local_task_queue|.
+//
+// We do a scan through ideal victims indicated by the
+// |constructive_sharing_mask|; these are the workers most likely to have some
+// cache benefits to taking their work as they share some level of the cache
+// hierarchy and should be better to steal from than any random worker.
+//
+// To prevent biasing any particular victim we use a fast prng function to
+// select where in the set of potential victims defined by the topology
+// group we steal. We (probably) don't need anything super complex here so
+// instead of bouncing around at random we just select the starting point in
+// our search and then go in-order.
+iree_task_t* iree_task_executor_try_steal_task(
+ iree_task_executor_t* executor,
+ iree_task_affinity_set_t constructive_sharing_mask,
+ uint32_t max_theft_attempts, iree_prng_minilcg128_state_t* theft_prng,
+ iree_task_queue_t* local_task_queue) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ const int worker_count = executor->worker_count;
+ iree_task_worker_t* workers = executor->workers;
+
+ // Limit the workers we will steal from to the ones that are currently live
+ // and not idle.
+ iree_task_affinity_set_t victim_mask =
+ iree_atomic_task_affinity_set_load(&executor->worker_live_mask,
+ iree_memory_order_relaxed) &
+ ~iree_atomic_task_affinity_set_load(&executor->worker_idle_mask,
+ iree_memory_order_relaxed);
+
+ // TODO(benvanik): it may be possible to rework this such that we better
+ // use the prng; for example, instead of all this rotating stuff we could just
+ // generate an 8-bit number (or even split it into two 4-bit numbers) per
+ // theft attempt. The current rotation strategy is biased toward the same try
+ // ordering vs. what we may really want with an unbiased random selection.
+ int rotation_offset = iree_prng_minilcg128_next_uint8(theft_prng) &
+ (8 * sizeof(iree_task_affinity_set_t) - 1);
+
+ // Try first with the workers we may have some caches shared with. This
+ // helps to prevent cache invalidations/availability updates as it's likely
+ // that we won't need to go back to main memory (or higher cache tiers) in the
+ // event that the thief and victim are running close to each other in time.
+ iree_task_t* task = iree_task_executor_try_steal_task_from_affinity_set(
+ executor, victim_mask & constructive_sharing_mask, max_theft_attempts,
+ rotation_offset, local_task_queue);
+ if (task) {
+ IREE_TRACE_ZONE_APPEND_TEXT(z0, "local");
+ } else {
+ task = iree_task_executor_try_steal_task_from_affinity_set(
+ executor, victim_mask & ~constructive_sharing_mask, max_theft_attempts,
+ rotation_offset, local_task_queue);
+ if (task) {
+ IREE_TRACE_ZONE_APPEND_TEXT(z0, "non-local");
+ }
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return task;
+}
+
+iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
+ iree_wait_handle_t* wait_handle,
+ iree_time_t deadline_ns) {
+ // Not implemented; just wait. Unclear we want this yet and it's complex.
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_status_t status = iree_wait_one(wait_handle, deadline_ns);
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
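Since iree_task_executor_donate_caller currently reduces to a plain wait, a caller-side sketch might look like the following (hedged: iree_event_t and the iree_event_* helpers come from iree/base/wait_handle.h and are assumptions here, as is the idea of a final call task signaling the event):

// Sketch: block the calling thread until a task in the submitted DAG signals
// |event| (e.g. a trailing call task that runs iree_event_set(&event)).
iree_event_t event;
IREE_RETURN_IF_ERROR(iree_event_initialize(/*initial_state=*/false, &event));
// ... build + submit + flush a submission whose final task signals |event| ...
iree_status_t wait_status = iree_task_executor_donate_caller(
    executor, &event, IREE_TIME_INFINITE_FUTURE);
// handle |wait_status| (ok, deadline exceeded, etc) as appropriate
iree_event_deinitialize(&event);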
diff --git a/iree/task/executor.h b/iree/task/executor.h
new file mode 100644
index 0000000..0f49878
--- /dev/null
+++ b/iree/task/executor.h
@@ -0,0 +1,384 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_EXECUTOR_H_
+#define IREE_TASK_EXECUTOR_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomics.h"
+#include "iree/base/wait_handle.h"
+#include "iree/task/scope.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+#include "iree/task/topology.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+//==============================================================================
+// IREE Task Executor
+//==============================================================================
+//
+// Roughly models wavefront-style GPU dispatch. Users submit task DAGs with
+// fine-grained dependency information for the executor to schedule across a set
+// of workers. As tasks become ready to execute they are placed into per-worker
+// FIFOs and workers run through them in a breadth-first fashion executing and
+// resolving tasks and building up new waves of ready tasks. Workers will always
+// make forward progress and only when they run out of work will they attempt to
+// self-nominate to play the role of coordinator and schedule any newly-
+// submitted or readied tasks. Only once all tasks have been retired and only
+// waits on external resources remain does the task system suspend itself until
+// more tasks are submitted or an external wait resolves.
+//
+// Our goal is to do the minimal amount of work to get the maximum amount of
+// concurrency the user requests or allows (by way of their dependencies).
+// Whether on a single core where you want to timeshare with an application or
+// across hundreds the same architecture holds. Where there is inefficiency it's
+// almost always surmountable with properly constructed tasks: choose the right
+// granularity for dispatches, choose the right fan-out for tiles within those
+// dispatches, choose the right places to insert barriers to force fan-in to
+// reduce memory utilization or right places to batch barriers to allow less
+// synchronization with the work queue, etc. All of those choices are ones this
+// system is designed to handle dynamically via the task graphs provided that
+// are themselves (in the IREE world) mapped 1:1 with the GPU-esque grid
+// dispatch and command buffer model. It's a super-power if a human is authoring
+// all that information but what makes it particularly powerful here is that we
+// are authoring that in the compiler based on a tremendous amount of
+// higher-level information we can derive from the whole program. Every bit of
+// dynamism here is matched with the ability to tighten down the screws and gain
+// back anything lost by way of compiler improvements while also being able to
+// generalize out to far more complex systems (higher parallelism, higher and
+// more efficient concurrency, etc).
+//
+// The design of this system allows for a spectrum of dynamic behavior based on
+// desired usage scenarios:
+// - variable number of persistent workers based on compute/memory topology
+// - per-task scope and per-task worker affinity to control for:
+// - power islands on multi-core systems with fine-grained power management
+// - heterogeneous microarchitectures in big.LITTLE/etc compute complexes
+// - task isolation between multiple active requests or users
+// - latency prioritization by partitioning workloads by priority
+// - scheduling overhead tradeoffs by varying:
+// - coordination/flush frequency to reduce cross-thread communication
+// - by statically inserting dispatch slices to avoid dynamic fan-out
+// - thread donation to avoid likely context switches upon submit+wait
+// - multi-wait across all users by sharing a wait set
+// - per-worker work-stealing specification of victim workers in the topology
+// - limited work-stealing to prevent chained stealing/cascading theft
+//
+// Required reading:
+// https://www.usenix.org/conference/osdi20/presentation/ma
+// (closest equivalent to this scheduling model)
+// https://www.cister-labs.pt/summer2017/w3/Parallelism%20-%20Dag%20Model.pdf
+// (good overall, our worker local lists/mailboxes are work-stealing queues)
+// http://people.csail.mit.edu/shanir/publications/Flat%20Combining%20SPAA%2010.pdf
+// (what we model with the coordinator)
+// http://mcg.cs.tau.ac.il/papers/opodis2010-quasi.pdf
+// (we exploit relaxed consistency for all our cross-thread queuing, see ^)
+// https://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++.htm
+// (moodycamel is the state of the art on scaling queues; read it all)
+// https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
+// https://blog.molecular-matters.com/2015/09/08/job-system-2-0-lock-free-work-stealing-part-2-a-specialized-allocator/
+// https://blog.molecular-matters.com/2015/09/25/job-system-2-0-lock-free-work-stealing-part-3-going-lock-free/
+// https://blog.molecular-matters.com/2015/11/09/job-system-2-0-lock-free-work-stealing-part-4-parallel_for/
+// https://blog.molecular-matters.com/2016/04/04/job-system-2-0-lock-free-work-stealing-part-5-dependencies/
+// (fantastic 5 part blog series; very similar to this)
+// http://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/jacm99.pdf
+// (provably optimal dynamic nested parallelism in 1999; basically: GPUs)
+// http://www.cs.cmu.edu/~blelloch/papers/locality2000.pdf
+// (followup to jacm99; using locality now to guide work stealing)
+// https://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/CGK07.pdf
+// (worker affinity and task locality for constructive cache sharing)
+//
+//==============================================================================
+// Life of an iree_task_t / high level algorithm
+//==============================================================================
+//
+// 1. Users allocate (from iree_task_pool_t, slice from arenas, etc) and
+// construct a DAG of iree_task_ts.
+//
+// a. Task dependency information is setup via completion_tasks for simple
+// dependencies, implicit fan-out/fan-in (dispatches), or explicit fan-in
+// (barriers).
+//
+// b. Tasks are pushed into iree_task_submission_t (LIFO, thread-local list).
+// If the task has no unmet initial dependencies it is placed into
+// the ready_list. If it is initially waiting on an external resource such
+// as iree_wait_handle_t then it is placed into the waiting_list.
+//
+// 2. iree_task_executor_submit (LIFO, atomic slist)
+// Submissions have their task thread-local lists concatenated into a LIFO
+// incoming_ready_slist or incoming_waiting_slist shared by the executor.
+//
+// 3. iree_task_executor_flush (or a worker puts on its coordinator hat 🎩)
+//
+// a. Tasks are flushed from the incoming_ready_slist into a coordinator-local
+// FIFO task queue and incoming_waiting_slist is concatenated into the
+// primary executor waitlist.
+//
+// b. iree_task_executor_poll_waiting_tasks: finds waiting tasks that are now
+// ready and they are moved into the coordinator-local FIFO task queue.
+//
+// c. iree_task_executor_schedule_ready_tasks: walks the FIFO task queue and
+// builds a iree_task_post_batch_t containing the per-worker tasks
+// in LIFO order.
+//
+// d. iree_task_post_batch_submit: per-worker tasks are pushed to their
+// respective iree_task_worker_t mailbox_slist and the workers with new
+// tasks are notified to wake up (if not already awake).
+//
+// 4. iree_task_worker_main_pump_once (LIFO mailbox -> FIFO thread-local list)
+// When either woken or after completing all available thread-local work
+// each worker will check its mailbox_slist to see if any tasks have been
+// posted.
+//
+// a. Tasks are flushed from the LIFO mailbox into the local_task_queue FIFO
+// for the particular worker.
+//
+// b. If the mailbox is empty the worker *may* attempt to steal work from
+// another nearby worker in the topology.
+//
+// c. Any tasks in the local_task_queue are executed until empty.
+// Tasks are retired and dependent tasks (via completion_task or barriers)
+// are made ready and placed in the executor incoming_ready_slist or
+// incoming_waiting_slist as with iree_task_executor_submit.
+//
+// d. If no more thread-local work is available and the mailbox_slist is
+// empty the worker will self-nominate for coordination and attempt to don
+// the coordinator hat with iree_task_executor_coordinate. If new work
+// becomes available after coordination, step 4 repeats.
+//
+// e. If another worker (or iree_task_executor_flush) is already wearing the
+// coordinator hat then the worker will go to sleep.
+//
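To make steps 1-3 concrete, a minimal hedged sketch of building and submitting a small fan-in DAG follows (not part of the diff). It assumes the task/scope/submission constructors from task.h, scope.h, and submission.h (iree_task_call_initialize, iree_task_fence_initialize, iree_task_barrier_initialize, iree_task_set_completion_task, iree_task_scope_wait_idle, and the iree_task_call_closure_t type), none of which are shown in this excerpt, so treat the exact signatures as approximate:

// Hedged sketch of steps 1-3: two calls fan in to a barrier that readies a
// fence which signals the scope when retired.
//
//   call_a --\
//             +--> barrier --> fence
//   call_b --/
static iree_status_t example_submit_dag(iree_task_executor_t* executor,
                                        iree_task_scope_t* scope,
                                        iree_task_call_closure_t closure) {
  iree_task_call_t call_a, call_b;
  iree_task_fence_t fence;
  iree_task_barrier_t barrier;
  iree_task_call_initialize(scope, closure, &call_a);
  iree_task_call_initialize(scope, closure, &call_b);
  iree_task_fence_initialize(scope, &fence);
  iree_task_t* barrier_successors[1] = {&fence.header};
  iree_task_barrier_initialize(scope, IREE_ARRAYSIZE(barrier_successors),
                               barrier_successors, &barrier);
  iree_task_set_completion_task(&call_a.header, &barrier.header);  // fan-in
  iree_task_set_completion_task(&call_b.header, &barrier.header);  // fan-in

  // Step 1b: only the initially-ready roots go into the submission.
  iree_task_submission_t submission;
  iree_task_submission_initialize(&submission);
  iree_task_submission_enqueue(&submission, &call_a.header);
  iree_task_submission_enqueue(&submission, &call_b.header);

  // Steps 2-3: hand off to the executor and kick coordination.
  IREE_RETURN_IF_ERROR(iree_task_executor_submit(executor, &submission));
  IREE_RETURN_IF_ERROR(iree_task_executor_flush(executor));

  // The tasks above live on this stack frame, so block until the scope drains
  // before returning (assumed scope helper).
  return iree_task_scope_wait_idle(scope, IREE_TIME_INFINITE_FUTURE);
}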
+//==============================================================================
+// Scaling Down
+//==============================================================================
+//
+// IREE is built at all levels - and both in the compiler and runtime - to scale
+// to different needs. Everything that IREE imposes on the runtime performance
+// and binary size is a spectrum of choices made that allows a user to only pay
+// for what they use.
+//
+// If a deployment scenario does not need complex multithreading and
+// out-of-order execution then this task system can be used in single-threaded
+// mode to at least allow for offloading from the main application thread. In
+// even more constrained scenarios (or embeddings within other systems that have
+// thread pools of their own) it can be used in zero-threaded mode with only
+// donated threads from the user performing work when the user wants it to
+// happen within its control. It still gives the benefits of wave-style
+// scheduling, multi-waiting, locality-aware work distribution, etc as well as
+// giving us a single target interface from the compiler to communicate
+// fine-grained dependency information to the runtime.
+//
+// If the cost of a few KB of data structures and some cheap uncontended atomic
+// linked list concatenations is still scary (it shouldn't be for 95% of uses)
+// then it's also possible to have a HAL driver that doesn't use this task
+// system at all and instead just executes the command buffers directly just
+// like our Vulkan/Metal/etc GPU backends do. Even though I don't recommend that
+// (one wouldn't be saving as much as they think and be losing a lot instead)
+// the layering holds and it can be useful if there's an existing external
+// sophisticated task execution system (ala taskflow) that is already present
+// in an application.
+//
+// One assertion of IREE is that for models that take more than milliseconds to
+// execute, asynchronous scheduling is almost always worth it even on
+// systems with single cores. The ability to cooperatively schedule model
+// execution allows applications significant control over their total program
+// scheduling behavior; just as on a Commodore 64 you'd have to interrupt work
+// on vsync to begin scanning out pixels to the screen and then resume afterward
+// it's rare to see any system even scaling down to double-digit MHz
+// microcontrollers that doesn't benefit from the ability to cleanly suspend and
+// resume execution.
+//
+// But even if *all* of that is too much, the compile-time representations in
+// the HAL IR are designed to be lowered away: execution modeling does not need
+// to bottom out on a hal.command_buffer.dispatch that maps 1:1 with the runtime
+// iree_hal_command_buffer_dispatch call: dispatch can be lowered into LLVM
+// IR calls and finally into native code to do precisely what you want. The HAL
+// at runtime is a useful abstraction to allow for switching your target
+// execution system (statically or dynamically across deployments) and to share
+// the same execution system across multiple models that may be executing
+// simultaneously but it is _not_ a requirement that the IREE HAL runtime
+// implementation is used. It's called multi-level IR for a reason and the HAL
+// IR is just one level that may have many more below it.
+//
+// So yeah: don't worry. It's almost certain that the thing making or breaking
+// the performance of models over 1ms of execution time is not the HAL, and that
+// in models at or above that scale the benefits we get from being able to
+// holistically schedule the work far outstrip any specialization that can be
+// done by hand. That's to say: only worry about this if your model is literally
+// 4 floats coming from an IMU and a few hundred scalar instructions to predict
+// whether the user is walking, and that shouldn't be using the runtime HAL at
+// all and really likely doesn't benefit from using IREE at any scale - just go
+// straight to LLVM IR from the source.
+//
+//==============================================================================
+// Scaling Up
+//==============================================================================
+//
+// The task system has an implicit limit of 64 workers. This intentional
+// limitation simplifies several parts of the code while also preventing misuse:
+// it rarely (if ever) makes sense to have more than 64 compute-dominated
+// threads working on a single problem. Achieving high performance in such
+// situations requires extremely careful control over the OS scheduler, memory
+// bandwidth consumption, and synchronization. It's always possible to make the
+// problem more compute-bound or very carefully try to fit in specific cache
+// sizes to avoid more constrained bandwidth paths but it's a non-portable
+// whack-a-mole style solution that is in conflict with a lot of what IREE seeks
+// to do with respect to low-latency and multi-tenant workloads.
+//
+// If more than 64 unique L1/L2 caches (or realistically more than probably ~32)
+// are available *and* all of them are attached to the same memory controllers
+// (no NUMA involved) then the solution is straightforward: use multiple IREE
+// task executors. Either within a process or in separate processes the
+// granularity is coarse enough to not be a burden and changes the problem from
+// needing 100% perfect work scaling of a single task to needing a naive
+// distributed workload solution at the algorithm level.
+//
+// Many useful effects also fall out of solving the work distribution problem.
+// Even for single-tenant workloads being able to split work between two
+// executors allows for natural mappings on NUMA systems or completely
+// independent machines. When supporting multi-tenant workloads (even if the
+// same program is acting as multiple tenants in a minibatched-style algorithm)
+// the improvements of isolation both in memory access patterns and in variance
+// from potentially bad system behavior dramatically improve: there aren't many
+// opportunities for contention in this system but one can guarantee zero
+// contention by simply not sharing the resources!
+
+// A bitfield specifying the scheduling mode used for configuring how (or if)
+// work is balanced across queues.
+enum iree_task_scheduling_mode_e {
+ // TODO(benvanik): batch, round-robin, FCFS, SJF, etc.
+ // We can also allow for custom scheduling, though I'm skeptical of the value
+ // of that. We should look into what GPUs do in hardware for balancing things
+ // (if anything this sophisticated at all). The potential benefit here is that
+ // we can optimize for offline workloads by allowing each queue to be drained
+ // until blocking - hopefully optimizing cache coherency and reducing the
+ // total memory high-water mark - or optimize for latency across all queues by
+ // taking tasks from all queues equally. There are other more interesting
+ // scheduling strategies such as preferring the widest tasks available from
+ // any queue such that we are keeping as many workers active as possible to
+ // reach peak utilization or artificially limiting which tasks we allow
+ // through to keep certain CPU cores asleep unless absolutely required.
+ IREE_TASK_SCHEDULING_MODE_RESERVED = 0u,
+
+  // Creates all workers suspended and resumes them only once work is first
+  // scheduled to them. This avoids the blocking startup cost of waking all
+  // threads during executor creation in exchange for potential added latency
+  // later on as threads take longer to wake on their first use.
+  //
+  // Prefer this setting in systems where startup time is the priority and work
+  // may not be scheduled for a while (or may be scheduled unevenly to start);
+  // otherwise executor creation takes longer and a thundering herd occurs,
+  // forcing context switches even if no work is needed.
+  //
+  // Avoid in systems where the latency from initial submission to worker
+  // execution is critical: starting the workers eagerly ensures all worker
+  // threads are already waiting on their respective wake notifications, the
+  // kernel can then schedule all worker quantums much faster, and in many
+  // cases all workers will begin processing simultaneously immediately after
+  // the submission is made.
+ IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP = 1u << 0,
+
+ // TODO(#4027): implement IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD.
+ // Creates a dedicated thread performing waits on root wait handles.
+ // On workloads with many short-duration waits this will reduce total latency
+ // as the waits are aggressively processed and dependent tasks are scheduled.
+ // It also keeps any wait-related syscalls off the worker threads that would
+ // otherwise need to perform the syscalls during coordination.
+ IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD = 1u << 1,
+};
+typedef uint32_t iree_task_scheduling_mode_t;
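+
+// Example (illustrative sketch; status handling elided): scheduling modes are
+// bit flags and may be OR'd together when creating an executor, e.g. deferring
+// worker startup to keep executor creation cheap:
+//
+//   iree_task_executor_t* executor = NULL;
+//   iree_task_executor_create(IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP,
+//                             topology, allocator, &executor);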
+
+// Base task system executor interface.
+typedef struct iree_task_executor_s iree_task_executor_t;
+
+// Creates a task executor using the specified topology.
+// |topology| is only used during creation and need not live beyond this call.
+// |out_executor| must be released by the caller.
+iree_status_t iree_task_executor_create(
+ iree_task_scheduling_mode_t scheduling_mode,
+ const iree_task_topology_t* topology, iree_allocator_t allocator,
+ iree_task_executor_t** out_executor);
+
+// Retains the given |executor| for the caller.
+void iree_task_executor_retain(iree_task_executor_t* executor);
+
+// Releases the given |executor| from the caller.
+void iree_task_executor_release(iree_task_executor_t* executor);
+
+// TODO(benvanik): scheduling mode mutation, compute quota control, etc.
+
+// Submits a batch of tasks for execution.
+// The submission represents a DAG of tasks all reachable from the initial
+// submission lists.
+//
+// Ownership of the tasks remains with the caller for the lifetime of the
+// submission unless tasks have a custom pool specified that they can be
+// returned to.
+//
+// Safe to call from any thread. Wait-free but may block for a small duration
+// during initial scheduling of the submitted tasks.
+//
+// NOTE: it's possible for all work in the submission to complete prior to this
+// function returning.
+iree_status_t iree_task_executor_submit(iree_task_executor_t* executor,
+ iree_task_submission_t* submission);
+
+// Flushes any pending task batches for execution.
+//
+// Safe to call from any thread. Wait-free but may block for a small duration
+// during initial scheduling of the submitted tasks.
+//
+// NOTE: due to races it's possible for new work to arrive from other threads
+// after the flush has occurred but prior to this call returning.
+iree_status_t iree_task_executor_flush(iree_task_executor_t* executor);
+
+// Donates the calling thread to the executor until either |wait_handle|
+// resolves or |deadline_ns| is exceeded.
+//
+// If there are no tasks available then the calling thread will block as if
+// iree_wait_one had been used on |wait_handle|. If tasks are ready then the
+// caller will not block prior to starting to perform work on behalf of the
+// executor.
+//
+// Donation is intended as an optimization to elide context switches when the
+// caller would have waited anyway; now instead of performing a kernel wait and
+// most certainly incurring a context switch the caller immediately begins
+// taking work from the queue - likely even prior to any of the executor workers
+// waking (assuming they were idle).
+//
+// Note that donation may not always be strictly a win: the caller may have an
+// arbitrary thread affinity that may cause oversubscription of resources within
+// the topology. This can cause additional contention for compute resources and
+// increase kernel scheduling overhead as threads are swapped or migrated.
+// Measure, measure, measure! If there is any IO that can be performed during
+// the time the caller would otherwise donate itself to the executor then that
+// should always be preferred, as should smaller computation (again, to not
+// oversubscribe resources). Treat donation as a hail mary to prevent a kernel
+// wait and not something that will magically make things execute faster.
+// Especially in large applications it's almost certainly better to do something
+// useful with the calling thread (even if that's go to sleep).
+//
+// Safe to call from any thread (though bad to reentrantly call from workers).
+iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
+ iree_wait_handle_t* wait_handle,
+ iree_time_t deadline_ns);
+
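+// Example of the overall flow (an illustrative sketch mirroring executor_test;
+// status handling is elided and the topology/scope/call/submission helpers are
+// declared in their respective iree/task/ headers):
+//
+//   iree_task_topology_t* topology = NULL;
+//   iree_task_topology_from_physical_cores(/*max_core_count=*/6, allocator,
+//                                          &topology);
+//   iree_task_executor_t* executor = NULL;
+//   iree_task_executor_create(IREE_TASK_SCHEDULING_MODE_RESERVED, topology,
+//                             allocator, &executor);
+//   iree_task_topology_free(topology);
+//
+//   iree_task_scope_t scope;
+//   iree_task_scope_initialize(iree_make_cstring_view("scope"), &scope);
+//   iree_task_call_t call;  // closure built via iree_task_make_closure.
+//   iree_task_call_initialize(&scope, closure, &call);
+//
+//   iree_task_submission_t submission;
+//   iree_task_submission_initialize(&submission);
+//   iree_task_submission_enqueue(&submission, &call.header);
+//   iree_task_executor_submit(executor, &submission);
+//   iree_task_executor_flush(executor);
+//   iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE);
+//
+//   iree_task_scope_deinitialize(&scope);
+//   iree_task_executor_release(executor);
+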
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_EXECUTOR_H_
diff --git a/iree/task/executor_impl.h b/iree/task/executor_impl.h
new file mode 100644
index 0000000..dbb17c5
--- /dev/null
+++ b/iree/task/executor_impl.h
@@ -0,0 +1,146 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_EXECUTOR_IMPL_H_
+#define IREE_TASK_EXECUTOR_IMPL_H_
+
+#include "iree/base/math.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/tracing.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/queue.h"
+#include "iree/task/tuning.h"
+#include "iree/task/worker.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct iree_task_executor_s {
+ iree_atomic_ref_count_t ref_count;
+ iree_allocator_t allocator;
+
+ // Defines how work is selected across queues.
+ // TODO(benvanik): make mutable; currently always the same reserved value.
+ iree_task_scheduling_mode_t scheduling_mode;
+
+ // State used by the work-stealing operations performed by donated threads.
+  // This is **NOT SYNCHRONIZED** and relies on the fact that we don't care
+  // about the precise selection of workers enough to mind any tears
+ // we get in the PRNG state that lives inside. Cache write-back order and
+ // incidental cache line availability/visibility update frequency is like an
+ // extra layer of PRNG anyway ;)
+ iree_prng_minilcg128_state_t donation_theft_prng;
+
+ // Pools of transient dispatch tasks shared across all workers.
+ // Depending on configuration the task pool may allocate after creation using
+ // the allocator provided upon executor creation.
+ iree_task_pool_t slice_task_pool;
+ iree_task_pool_t shard_task_pool;
+
+ // A list of incoming tasks that are ready to execute immediately.
+ // The list is LIFO and we require that task lists are reversed by the
+ // submitter so we can use iree_atomic_slist_concat to quickly prepend the
+ // LIFO list to the atomic slist. By doing this we can construct the task
+ // lists in LIFO order prior to submission, concat with a pointer swap into
+ // this list, flush from the list in LIFO order during coordination, and do a
+ // single LIFO->FIFO conversion while distributing work. What could have been
+ // half a dozen task list pointer walks and inverted sequential memory access
+ // becomes one.
+ //
+ // Example:
+ // existing tasks: C B A
+ // new tasks: 1 2 3
+ // updated tasks: 3 2 1 C B A
+ iree_atomic_task_slist_t incoming_ready_slist;
+ // A list of incoming wait tasks that need to be waited on. Order doesn't
+ // really matter here as all tasks will be waited on simultaneously.
+ iree_atomic_task_slist_t incoming_waiting_slist;
+
+ // Guards coordination logic; only one thread at a time may be acting as the
+ // coordinator.
+ iree_slim_mutex_t coordinator_mutex;
+ // A list of wait tasks with external handles that need to be waited on.
+ // Coordinators can choose to poll/wait on these.
+ iree_task_list_t waiting_list;
+ // Guards manipulation and use of the wait_set.
+ // coordinator_mutex may be held when taking this lock.
+ iree_slim_mutex_t wait_mutex;
+ // Wait set containing all the tasks in waiting_list. Coordinator manages
+ // keeping the waiting_list and wait_set in sync.
+ iree_wait_set_t* wait_set;
+
+ // A bitset indicating which workers are live and usable; all attempts to
+ // push work onto a particular worker should check first with this mask. This
+  // may change over time either automatically or by user request ("don't use
+  // these cores for a while, I'm going to be using them" etc).
+ iree_atomic_task_affinity_set_t worker_live_mask;
+
+ // A bitset indicating which workers may be suspended and need to be resumed
+ // via iree_thread_resume prior to them being able to execute work.
+ iree_atomic_task_affinity_set_t worker_suspend_mask;
+
+ // A bitset indicating which workers are currently idle. Used to bias incoming
+ // tasks to workers that aren't doing much else. This is a balance of latency
+ // to wake the idle workers vs. latency to wait for existing work to complete
+ // on already woken workers.
+ iree_atomic_task_affinity_set_t worker_idle_mask;
+
+  // Specifies how many worker threads there are.
+  // For now this number is fixed per executor; if we wanted to enable live
+  // join/leave behavior we could change this to a registration mechanism.
+ iree_host_size_t worker_count;
+ iree_task_worker_t* workers; // [worker_count]
+};
+
+// Merges a submission into the primary FIFO queues.
+// Coordinators will fetch items from here as workers demand them but otherwise
+// not be notified of the changes (waiting until coordination runs again).
+//
+// May be called from any thread.
+void iree_task_executor_merge_submission(iree_task_executor_t* executor,
+ iree_task_submission_t* submission);
+
+// Schedules all ready tasks in the |pending_submission| list.
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_executor_schedule_ready_tasks(
+ iree_task_executor_t* executor, iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch);
+
+// Dispatches tasks in the global submission queue to workers.
+// |current_worker| will be NULL if called from a non-worker thread and
+// otherwise be the current worker; used to avoid round-tripping through the
+// whole system to post to oneself.
+void iree_task_executor_coordinate(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ bool speculative);
+
+// Tries to steal an entire task from a sibling worker (based on topology).
+// Returns a task that is available (has not yet begun processing at all).
+// May steal multiple tasks and add them to the |local_task_queue|.
+iree_task_t* iree_task_executor_try_steal_task(
+ iree_task_executor_t* executor,
+ iree_task_affinity_set_t constructive_sharing_mask,
+ uint32_t max_theft_attempts, iree_prng_minilcg128_state_t* theft_prng,
+ iree_task_queue_t* local_task_queue);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_EXECUTOR_IMPL_H_
diff --git a/iree/task/executor_test.cc b/iree/task/executor_test.cc
new file mode 100644
index 0000000..6641edb
--- /dev/null
+++ b/iree/task/executor_test.cc
@@ -0,0 +1,181 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/executor.h"
+
+#include <thread>
+
+#include "iree/base/math.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+static thread_local volatile uint64_t xxx = 0;
+
+static void simulate_work(iree_task_tile_context_t* tile_context) {
+ iree_prng_splitmix64_state_t state;
+ iree_prng_splitmix64_initialize(xxx, &state);
+ bool slow = false; // tile_context->workgroup_xyz[0] % 3 == 1;
+ if (tile_context->workgroup_xyz[0] == 128 ||
+ tile_context->workgroup_xyz[0] == 1023) {
+ // Introduce big variance to highlight work stealing.
+ // std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ for (int i = 0; i < 256 * 1024; ++i) {
+ uint64_t value = iree_prng_splitmix64_next(&state);
+ xxx += value;
+ if (slow) {
+ for (int j = 0; j < 4; ++j) {
+ value = iree_prng_splitmix64_next(&state);
+ xxx += value;
+ }
+ }
+ }
+}
+
+TEST(ExecutorTest, Any) {
+ IREE_TRACE_SCOPE0("ExecutorTest::Any");
+
+ iree_allocator_t allocator = iree_allocator_system();
+
+ iree_task_topology_t* topology = NULL;
+#if 1
+ IREE_CHECK_OK(iree_task_topology_from_physical_cores(
+ /*max_core_count=*/6, allocator, &topology));
+#elif 0
+ IREE_CHECK_OK(iree_task_topology_from_unique_l2_cache_groups(
+ /*max_group_count=*/6, allocator, &topology));
+#else
+ IREE_CHECK_OK(iree_task_topology_from_group_count(/*group_count=*/6,
+ allocator, &topology));
+#endif
+ iree_task_executor_t* executor = NULL;
+ iree_task_scheduling_mode_t scheduling_mode =
+ IREE_TASK_SCHEDULING_MODE_RESERVED;
+ IREE_CHECK_OK(iree_task_executor_create(scheduling_mode, topology, allocator,
+ &executor));
+ iree_task_topology_free(topology);
+
+ //
+ iree_task_scope_t scope_a;
+ iree_task_scope_initialize(iree_make_cstring_view("a"), &scope_a);
+
+ //
+ iree_task_call_t call0;
+ iree_task_call_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("call0");
+ EXPECT_EQ(0, user_context);
+ return iree_ok_status();
+ },
+ 0),
+ &call0);
+
+ const uint32_t workgroup_size_0[3] = {256, 1, 1};
+ const uint32_t workgroup_count_0[3] = {32, 4, 2};
+ iree_task_dispatch_t dispatch0;
+ iree_task_dispatch_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("tile0");
+ iree_task_tile_context_t* tile_context =
+ (iree_task_tile_context_t*)task_context;
+ EXPECT_EQ(0, user_context);
+ simulate_work(tile_context);
+ iree_atomic_fetch_add_int32(&tile_context->statistics->reserved, 1,
+ iree_memory_order_relaxed);
+ return iree_ok_status();
+ },
+ 0),
+ workgroup_size_0, workgroup_count_0, &dispatch0);
+ // dispatch0.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
+
+ const uint32_t workgroup_size_1[3] = {128, 1, 1};
+ const uint32_t workgroup_count_1[3] = {16, 2, 1};
+ iree_task_dispatch_t dispatch1;
+ iree_task_dispatch_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("tile1");
+ iree_task_tile_context_t* tile_context =
+ (iree_task_tile_context_t*)task_context;
+ EXPECT_EQ(0, user_context);
+ simulate_work(tile_context);
+ iree_atomic_fetch_add_int32(&tile_context->statistics->reserved, 1,
+ iree_memory_order_relaxed);
+ return iree_ok_status();
+ },
+ 0),
+ workgroup_size_1, workgroup_count_1, &dispatch1);
+ // dispatch1.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
+
+ //
+ iree_task_call_t call1;
+ iree_task_call_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("call1");
+ EXPECT_EQ(1, user_context);
+ return iree_ok_status();
+ },
+ 1),
+ &call1);
+
+#if 1
+ // no barrier between dispatches; fanout
+ iree_task_t* barrier0_tasks[2] = {&dispatch0.header, &dispatch1.header};
+ iree_task_barrier_t barrier0;
+ iree_task_barrier_initialize(&scope_a, IREE_ARRAYSIZE(barrier0_tasks),
+ barrier0_tasks, &barrier0);
+ iree_task_set_completion_task(&call0.header, &barrier0.header);
+ iree_task_set_completion_task(&dispatch0.header, &call1.header);
+ iree_task_set_completion_task(&dispatch1.header, &call1.header);
+#else
+ // barrier between dispatches
+ iree_task_set_completion_task(&call0.header, &dispatch0.header);
+ iree_task_set_completion_task(&dispatch0.header, &dispatch1.header);
+ iree_task_set_completion_task(&dispatch1.header, &call1.header);
+#endif
+
+ // fence
+ iree_task_fence_t fence0;
+ iree_task_fence_initialize(&scope_a, &fence0);
+ iree_task_set_completion_task(&call1.header, &fence0.header);
+
+ //
+ iree_task_submission_t sub0;
+ iree_task_submission_initialize(&sub0);
+ iree_task_submission_enqueue(&sub0, &call0.header);
+ IREE_CHECK_OK(iree_task_executor_submit(executor, &sub0));
+
+ //
+ // iree_task_submission_t sub1;
+ // iree_task_submission_initialize(&sub1);
+ // IREE_CHECK_OK(iree_task_executor_submit(executor, &sub1));
+
+ IREE_CHECK_OK(iree_task_executor_flush(executor));
+
+ IREE_CHECK_OK(iree_task_scope_wait_idle(&scope_a, IREE_TIME_INFINITE_FUTURE));
+
+ iree_task_scope_deinitialize(&scope_a);
+ iree_task_executor_release(executor);
+}
+
+} // namespace
diff --git a/iree/task/list.c b/iree/task/list.c
new file mode 100644
index 0000000..6e70000
--- /dev/null
+++ b/iree/task/list.c
@@ -0,0 +1,227 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/list.h"
+
+void iree_atomic_task_slist_discard(iree_atomic_task_slist_t* slist) {
+ iree_task_list_t discard_list;
+ iree_task_list_initialize(&discard_list);
+
+ // Flush the entire slist and walk it discarding the tasks as we go. This
+ // avoids the need to do more than one walk if we were simply trying to flush
+ // and discard into an iree_task_list_t.
+ //
+ // Note that we may accumulate additional tasks that need to be discarded;
+ // that's when we use the iree_task_list_t discard logic which works because
+ // we have a head/tail and are no longer in atomic land.
+ iree_task_t* task_head = NULL;
+ iree_atomic_task_slist_flush(
+ slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &task_head, NULL);
+ while (task_head != NULL) {
+ iree_task_t* next_task = task_head->next_task;
+ iree_task_discard(task_head, &discard_list);
+ task_head = next_task;
+ }
+
+ iree_task_list_discard(&discard_list);
+}
+
+void iree_task_list_initialize(iree_task_list_t* out_list) {
+ memset(out_list, 0, sizeof(*out_list));
+}
+
+void iree_task_list_move(iree_task_list_t* list, iree_task_list_t* out_list) {
+ memcpy(out_list, list, sizeof(*out_list));
+ memset(list, 0, sizeof(*list));
+}
+
+void iree_task_list_discard(iree_task_list_t* list) {
+ // Fixed point iteration over the task list and all its transitive dependent
+ // tasks that get discarded. This is in contrast to a recursive discard that
+ // could potentially be thousands of calls deep in a large graph.
+ while (!iree_task_list_is_empty(list)) {
+ iree_task_t* task = iree_task_list_pop_front(list);
+ iree_task_discard(task, list);
+ }
+}
+
+bool iree_task_list_is_empty(const iree_task_list_t* list) {
+ return list->head == NULL;
+}
+
+iree_host_size_t iree_task_list_calculate_size(const iree_task_list_t* list) {
+ iree_host_size_t count = 0;
+ iree_task_t* p = list->head;
+ while (p) {
+ ++count;
+ p = p->next_task;
+ }
+ return count;
+}
+
+iree_task_t* iree_task_list_front(iree_task_list_t* list) { return list->head; }
+
+iree_task_t* iree_task_list_back(iree_task_list_t* list) { return list->tail; }
+
+void iree_task_list_push_back(iree_task_list_t* list, iree_task_t* task) {
+ if (!list->head) {
+ list->head = task;
+ }
+ if (list->tail) {
+ list->tail->next_task = task;
+ }
+ list->tail = task;
+ task->next_task = NULL;
+}
+
+void iree_task_list_push_front(iree_task_list_t* list, iree_task_t* task) {
+ task->next_task = list->head;
+ list->head = task;
+ if (!list->tail) {
+ list->tail = task;
+ }
+}
+
+iree_task_t* iree_task_list_pop_front(iree_task_list_t* list) {
+ if (!list->head) return NULL;
+ iree_task_t* task = list->head;
+ list->head = task->next_task;
+ if (list->tail == task) {
+ list->tail = NULL;
+ }
+ task->next_task = NULL;
+ return task;
+}
+
+void iree_task_list_erase(iree_task_list_t* list, iree_task_t* prev_task,
+ iree_task_t* task) {
+  if (task == list->head) {
+    // Removing head (and also the tail if this was the only task).
+    list->head = task->next_task;
+    if (list->tail == task) list->tail = NULL;
+ } else if (task == list->tail) {
+ // Removing tail.
+ list->tail = prev_task;
+ prev_task->next_task = NULL;
+ } else {
+ // Removing inner.
+ prev_task->next_task = task->next_task;
+ }
+ task->next_task = NULL;
+}
+
+void iree_task_list_prepend(iree_task_list_t* list, iree_task_list_t* prefix) {
+ if (iree_task_list_is_empty(prefix)) return;
+ if (iree_task_list_is_empty(list)) {
+ list->head = prefix->head;
+ list->tail = prefix->tail;
+ } else {
+ prefix->tail->next_task = list->head;
+ list->head = prefix->head;
+ }
+ memset(prefix, 0, sizeof(*prefix));
+}
+
+void iree_task_list_append(iree_task_list_t* list, iree_task_list_t* suffix) {
+ if (iree_task_list_is_empty(suffix)) return;
+ if (iree_task_list_is_empty(list)) {
+ list->head = suffix->head;
+ list->tail = suffix->tail;
+ } else {
+ list->tail->next_task = suffix->head;
+ list->tail = suffix->tail;
+ }
+ memset(suffix, 0, sizeof(*suffix));
+}
+
+void iree_task_list_append_from_fifo_slist(iree_task_list_t* list,
+ iree_atomic_task_slist_t* slist) {
+ iree_task_list_t suffix;
+ iree_task_list_initialize(&suffix);
+ if (!iree_atomic_task_slist_flush(
+ slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO, &suffix.head,
+ &suffix.tail)) {
+ return; // empty
+ }
+ iree_task_list_append(list, &suffix);
+}
+
+void iree_task_list_reverse(iree_task_list_t* list) {
+ if (iree_task_list_is_empty(list)) return;
+ iree_task_t* tail = list->head;
+ iree_task_t* head = list->tail;
+ iree_task_t* p = list->head;
+ do {
+ iree_task_t* next = p->next_task;
+ p->next_task = head;
+ head = p;
+ p = next;
+ } while (p != NULL);
+ tail->next_task = NULL;
+ list->head = head;
+ list->tail = tail;
+}
+
+void iree_task_list_split(iree_task_list_t* head_list,
+ iree_host_size_t max_tasks,
+ iree_task_list_t* out_tail_list) {
+ iree_task_list_initialize(out_tail_list);
+ if (head_list->head == NULL) return;
+ if (head_list->head == head_list->tail) {
+ // 1 task in the source list; always prefer to steal it.
+ // This is because the victim is likely working on their last item and we
+ // can help them out by popping this off. It also has the side-effect of
+ // handling cases of donated workers wanting to steal all tasks to
+ // synchronously execute things.
+ iree_task_list_move(head_list, out_tail_list);
+ return;
+ }
+
+ // Walk through the |head_list| with two iterators; one at double-rate.
+ // If we ever notice this function showing up in profiling then we should
+ // build an acceleration structure to avoid the full walk of the first half
+ // (e.g. skip list).
+ iree_task_t* p_x1_m1 = head_list->head; // p_x1 - 1 (previous to p_x1)
+ iree_task_t* p_x1 = head_list->head; // x1 speed ptr
+ iree_task_t* p_x2 = head_list->head; // x2 speed ptr
+ while (p_x2->next_task != NULL) {
+ p_x1_m1 = p_x1;
+ p_x1 = p_x1->next_task;
+ p_x2 = p_x2->next_task;
+ if (p_x2->next_task) p_x2 = p_x2->next_task;
+ }
+
+  // p_x1 now points at the halfway point in the head_list. This is where we
+  // *start* our windowed walk for pulling out max_tasks, implicitly limiting
+  // us to take at most half of the tasks from the list.
+
+  // Slide a window of up to max_tasks tasks toward the end of the list; when
+  // the window tail hits the end we have the head and tail of the list to
+  // split off.
+ iree_task_t* p_window_prev = p_x1_m1;
+ iree_task_t* p_window_head = p_x1;
+ iree_task_t* p_window_tail = p_x1;
+ while (p_window_tail->next_task != NULL && --max_tasks > 0) {
+ p_window_tail = p_window_tail->next_task;
+ }
+ while (p_window_tail->next_task != NULL) {
+ p_window_prev = p_window_head;
+ p_window_head = p_window_head->next_task;
+ p_window_tail = p_window_tail->next_task;
+ }
+
+ head_list->tail = p_window_prev;
+ p_window_prev->next_task = NULL;
+
+ out_tail_list->head = p_window_head;
+ out_tail_list->tail = p_window_tail;
+}
diff --git a/iree/task/list.h b/iree/task/list.h
new file mode 100644
index 0000000..a52c2e6
--- /dev/null
+++ b/iree/task/list.h
@@ -0,0 +1,113 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_LIST_H_
+#define IREE_TASK_LIST_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomic_slist.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// iree_atomic_task_slist_t, an atomic approximately LIFO singly-linked list.
+// iree_task_list_t should be preferred when working with
+// uncontended/thread-local lists as it has no overhead, while the
+// iree_atomic_task_slist_t should be used when multiple threads may need to
+// share lists of tasks (free lists, mailboxes, etc).
+IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_atomic_task, iree_task_t,
+ offsetof(iree_task_t, next_task));
+
+// Discards a task list; should be used for failure cleanup during list
+// construction to ensure intrusive pointers are reset.
+void iree_atomic_task_slist_discard(iree_atomic_task_slist_t* slist);
+
+// A singly-linked list of tasks using the embedded task next_task pointer.
+//
+// Thread-compatible; designed to be used from a single thread manipulating a
+// list for passing to an API that accepts lists.
+typedef struct iree_task_list_s {
+ iree_task_t* head;
+ iree_task_t* tail;
+} iree_task_list_t;
+
+// Initializes an empty task list.
+void iree_task_list_initialize(iree_task_list_t* out_list);
+
+// Moves |list| into |out_list|, leaving |list| empty.
+void iree_task_list_move(iree_task_list_t* list, iree_task_list_t* out_list);
+
+// Discards a task list; should be used for failure cleanup during list
+// construction to ensure intrusive pointers are reset.
+void iree_task_list_discard(iree_task_list_t* list);
+
+// Returns true if the list is empty.
+bool iree_task_list_is_empty(const iree_task_list_t* list);
+
+// Counts the total number of tasks in the list.
+// WARNING: this requires an O(n) walk of the entire list; use this only for
+// debugging or when the list is known to be small and hot in cache.
+iree_host_size_t iree_task_list_calculate_size(const iree_task_list_t* list);
+
+// Returns the first task in the list, if any.
+iree_task_t* iree_task_list_front(iree_task_list_t* list);
+
+// Returns the last task in the list, if any.
+iree_task_t* iree_task_list_back(iree_task_list_t* list);
+
+// Pushes a task onto the back of the task list. The task list takes ownership
+// of |task|.
+void iree_task_list_push_back(iree_task_list_t* list, iree_task_t* task);
+
+// Pushes a task onto the front of the task list. The task list takes ownership
+// of |task|.
+void iree_task_list_push_front(iree_task_list_t* list, iree_task_t* task);
+
+// Pops a task from the front of the task list or returns NULL if the list is
+// empty. Caller takes ownership of the returned task.
+iree_task_t* iree_task_list_pop_front(iree_task_list_t* list);
+
+// Erases |task| from the list.
+// |prev_task| must point to the task immediately prior to |task| in the list
+// or NULL if the task was at the head.
+void iree_task_list_erase(iree_task_list_t* list, iree_task_t* prev_task,
+ iree_task_t* task);
+
+// Prepends |prefix| onto the beginning of |list|. |prefix| will be reset.
+void iree_task_list_prepend(iree_task_list_t* list, iree_task_list_t* prefix);
+
+// Appends |suffix| onto the end of |list|. |suffix| will be reset.
+void iree_task_list_append(iree_task_list_t* list, iree_task_list_t* suffix);
+
+// Flushes the given |slist| and appends all tasks to the list in FIFO order.
+void iree_task_list_append_from_fifo_slist(iree_task_list_t* list,
+ iree_atomic_task_slist_t* slist);
+
+// Reverses the list in-place.
+// Requires a full O(n) traversal.
+void iree_task_list_reverse(iree_task_list_t* list);
+
+// Splits |head_list| roughly in half, moving at most |max_tasks| tasks from
+// the back of the list into |out_tail_list| and retaining the rest in
+// |head_list|.
+void iree_task_list_split(iree_task_list_t* head_list,
+ iree_host_size_t max_tasks,
+ iree_task_list_t* out_tail_list);
+
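+// Example (illustrative sketch): building a list and draining it in FIFO
+// order; where the tasks come from (a pool, an arena, etc) is up to the
+// caller:
+//
+//   iree_task_list_t list;
+//   iree_task_list_initialize(&list);
+//   iree_task_list_push_back(&list, task_a);
+//   iree_task_list_push_back(&list, task_b);
+//   while (!iree_task_list_is_empty(&list)) {
+//     iree_task_t* task = iree_task_list_pop_front(&list);
+//     // ... process task ...
+//   }
+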
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_LIST_H_
diff --git a/iree/task/list_test.cc b/iree/task/list_test.cc
new file mode 100644
index 0000000..2140e21
--- /dev/null
+++ b/iree/task/list_test.cc
@@ -0,0 +1,574 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/list.h"
+
+#include "iree/task/testing/test_util.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(TaskListTest, Empty) {
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+ EXPECT_EQ(0, iree_task_list_calculate_size(&list));
+ iree_task_list_discard(&list);
+}
+
+TEST(TaskListTest, CalculateSize) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+ EXPECT_EQ(0, iree_task_list_calculate_size(&list));
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ EXPECT_FALSE(iree_task_list_is_empty(&list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&list));
+
+ iree_task_list_push_back(&list, task1);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list));
+ iree_task_list_push_back(&list, task2);
+ EXPECT_EQ(3, iree_task_list_calculate_size(&list));
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+}
+
+TEST(TaskListTest, Move) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+ iree_task_list_push_back(&list_a, task0);
+ iree_task_list_push_back(&list_a, task1);
+ iree_task_list_push_back(&list_a, task2);
+ iree_task_list_push_back(&list_a, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+
+ iree_task_list_move(&list_a, &list_b);
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_b));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_b));
+}
+
+TEST(TaskListTest, PushFront) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_front(&list, task0);
+ iree_task_list_push_front(&list, task1);
+ iree_task_list_push_front(&list, task2);
+ iree_task_list_push_front(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+
+ EXPECT_EQ(3, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(2, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(1, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(0, iree_task_list_pop_front(&list)->flags);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, PopFront) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ iree_task_list_push_back(&list, task2);
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+
+ EXPECT_EQ(0, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(1, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(2, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(3, iree_task_list_pop_front(&list)->flags);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, Erase) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ iree_task_list_push_back(&list, task2);
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+
+ // Remove head.
+ iree_task_list_erase(&list, NULL, task0);
+ EXPECT_EQ(3, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ EXPECT_EQ(task1, iree_task_list_front(&list));
+
+ // Remove tail.
+ iree_task_list_erase(&list, task2, task3);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ EXPECT_EQ(task2, iree_task_list_back(&list));
+
+ // Remove the rest.
+ iree_task_list_erase(&list, task1, task2);
+ EXPECT_EQ(1, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ EXPECT_EQ(task1, iree_task_list_front(&list));
+ EXPECT_EQ(task1, iree_task_list_back(&list));
+
+ iree_task_list_erase(&list, NULL, task1);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, PrependEmpty) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_a, task0);
+ iree_task_list_push_back(&list_a, task1);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+ iree_task_list_prepend(&list_a, &list_b);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+}
+
+TEST(TaskListTest, PrependIntoEmpty) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+ iree_task_list_push_back(&list_b, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_b));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_b));
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ iree_task_list_prepend(&list_a, &list_b);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, PrependInto1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+
+ iree_task_list_push_back(&list_a, task3);
+ iree_task_list_prepend(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, PrependInto2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_a, task2);
+ iree_task_list_push_back(&list_a, task3);
+ iree_task_list_prepend(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, AppendIntoEmpty) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+ iree_task_list_push_back(&list_b, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_b));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_b));
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ iree_task_list_append(&list_a, &list_b);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, AppendInto1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+
+ iree_task_list_push_back(&list_b, task3);
+ iree_task_list_push_back(&list_a, task0);
+
+ iree_task_list_append(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, AppendInto2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task2);
+ iree_task_list_push_back(&list_b, task3);
+
+ iree_task_list_push_back(&list_a, task0);
+ iree_task_list_push_back(&list_a, task1);
+
+ iree_task_list_append(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, Reverse0) {
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, Reverse1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+
+ iree_task_list_push_back(&list, task0);
+ EXPECT_EQ(1, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+}
+
+TEST(TaskListTest, Reverse2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+}
+
+TEST(TaskListTest, Reverse4) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ iree_task_list_push_back(&list, task2);
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+}
+
+TEST(TaskListTest, SplitEmpty) {
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&head_list));
+ EXPECT_TRUE(iree_task_list_is_empty(&tail_list));
+}
+
+TEST(TaskListTest, Split1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ iree_task_list_push_back(&head_list, task0);
+ EXPECT_EQ(1, iree_task_list_calculate_size(&head_list));
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&head_list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&tail_list));
+}
+
+TEST(TaskListTest, Split2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_EQ(1, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, Split3) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_EQ(1, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(2, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, Split4) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+ iree_task_list_push_back(&head_list, task3);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_EQ(2, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(2, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, SplitMaxTasks1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+ iree_task_list_push_back(&head_list, task3);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/1, &tail_list);
+
+ EXPECT_EQ(3, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, SplitMaxTasks2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+ iree_task_list_push_back(&head_list, task3);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/2, &tail_list);
+
+ EXPECT_EQ(2, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(2, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+} // namespace
diff --git a/iree/task/pool.c b/iree/task/pool.c
new file mode 100644
index 0000000..172a2ce
--- /dev/null
+++ b/iree/task/pool.c
@@ -0,0 +1,294 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/pool.h"
+
+#include "iree/base/math.h"
+
+// Minimum byte size of a block in bytes, including the tasks as well as the
+// allocation header. This is here to allow us to reduce the number of times
+// we go to the allocator and amortize the overhead of our block header.
+#define IREE_TASK_POOL_MIN_BLOCK_SIZE (4 * 1024)
+
+// Alignment for block allocations; roughly a (likely) page size.
+// Since many allocators round allocations up once past the small byte range
+// (~thousands of bytes) this just prevents us from being 1 byte over the
+// allocator block size and wasting space in a larger bucket.
+#define IREE_TASK_POOL_BLOCK_ALIGNMENT (4 * 1024)
+
+// The minimum number of tasks that will be allocated when growth is needed.
+// The total number may be larger once rounded to meet block size and alignment
+// requirements. Note that we leave a bit of room here for the block header
+// such that we don't always allocate a nice round number + N bytes that then
+// bumps us into the next power of two bucket.
+#define IREE_TASK_POOL_MIN_GROWTH_CAPACITY (255)
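+
+// As a worked example (the numbers are hypothetical): with a 192-byte
+// task_size, a 64-byte aligned allocation header, and the default growth of
+// 255 tasks the requested size is 64 + 255 * 192 = 49024 bytes. Rounding up to
+// a power of two gives 65536 bytes (already 4KB-aligned and above the minimum
+// block size), which actually holds (65536 - 64) / 192 = 341 tasks - the extra
+// tasks beyond the requested 255 are simply added to the pool as well.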
+
+// Grows the task pool by at least |minimum_capacity| on top of its current
+// capacity. The actual number of tasks available may be rounded up to make the
+// allocated blocks more allocator-friendly sizes.
+//
+// As an optimization for on-demand growth cases an |out_task| can be specified
+// to receive a task without the need for acquiring one from the pool
+// immediately after the growth completes. This avoids a race condition where
+// another thread could snipe the tasks we just allocated for the caller prior
+// to the caller getting a chance to acquire one.
+static iree_status_t iree_task_pool_grow(iree_task_pool_t* pool,
+ iree_host_size_t minimum_capacity,
+ iree_task_t** out_task) {
+ if (IREE_UNLIKELY(!minimum_capacity)) return iree_ok_status();
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Allocate a new block of tasks. To try to prevent the allocator from
+ // fragmenting we try to always allocate blocks that are page-aligned and
+ // powers of two.
+ //
+ // Note that we pad out our header to iree_max_align_t bytes so that all tasks
+ // are aligned on the same boundaries as required by atomic operations.
+ iree_host_size_t header_size =
+ iree_math_align(sizeof(iree_task_allocation_header_t), iree_max_align_t);
+ iree_host_size_t pow2_block_size = iree_math_round_up_to_pow2_u64(
+ header_size + minimum_capacity * pool->task_size);
+ iree_host_size_t aligned_block_size =
+ iree_math_align(pow2_block_size, IREE_TASK_POOL_BLOCK_ALIGNMENT);
+ if (aligned_block_size < IREE_TASK_POOL_MIN_BLOCK_SIZE) {
+ aligned_block_size = IREE_TASK_POOL_MIN_BLOCK_SIZE;
+ }
+ iree_task_allocation_header_t* allocation = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_allocator_malloc(pool->allocator, aligned_block_size,
+ (void**)&allocation));
+
+ // Insert the allocation into the tracking list. Nothing reads the list until
+ // the pool is trimmed/deinitialized so it's safe to do now prior to
+ // populating anything. It's all just empty data anyway.
+ iree_atomic_task_allocation_slist_push(&pool->allocations_slist, allocation);
+
+ // Since we may have rounded up the allocation we may have gotten more space
+ // for tasks than we were asked for. Ensure we actually make use of them.
+ iree_host_size_t actual_capacity =
+ (aligned_block_size - header_size) / pool->task_size;
+
+ // Stitch together the tasks by setting all next pointers.
+ // Since we are going to be touching all the pages the order here is important
+ // as once we insert these new tasks into the available_slist they'll be
+ // popped out head->tail. To ensure the head that gets popped first is still
+ // warm in cache we construct the list backwards, with the tail tasks being
+ // fine to be evicted.
+ //
+ // The nice thing about this walk is that it ensures that if there were any
+ // zero-fill-on-demand trickery going on the pages are all wired here vs.
+ // when the tasks are first acquired from the list where it'd be harder to
+ // track.
+  uintptr_t p = ((uintptr_t)allocation + aligned_block_size) - pool->task_size;
+  iree_task_t* head = (iree_task_t*)p;
+  iree_task_t* tail = head;
+  head->next_task = NULL;  // tail of the new free list terminates with NULL.
+  for (iree_host_size_t i = 1; i < actual_capacity; ++i) {
+    p -= pool->task_size;
+    iree_task_t* task = (iree_task_t*)p;
+    task->next_task = head;
+    head = task;
+  }
+
+ // If the caller needs a task we can slice off the head to return prior to
+ // adding it to the slist where it may get stolen.
+ if (out_task) {
+ *out_task = head;
+ head = head->next_task;
+ }
+
+ // Concatenate the list of new free tasks into the pool.
+ iree_atomic_task_slist_concat(&pool->available_slist, head, tail);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_pool_initialize(iree_allocator_t allocator,
+ iree_host_size_t task_size,
+ iree_host_size_t initial_capacity,
+ iree_task_pool_t* out_pool) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, task_size);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, initial_capacity);
+
+ out_pool->allocator = allocator;
+ out_pool->task_size = task_size;
+ iree_atomic_task_allocation_slist_initialize(&out_pool->allocations_slist);
+ iree_atomic_task_slist_initialize(&out_pool->available_slist);
+ iree_status_t status =
+ iree_task_pool_grow(out_pool, initial_capacity, /*out_task=*/NULL);
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+void iree_task_pool_deinitialize(iree_task_pool_t* pool) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_allocation_header_t* allocation = NULL;
+ if (iree_atomic_task_allocation_slist_flush(
+ &pool->allocations_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &allocation, NULL)) {
+ while (allocation) {
+ iree_task_allocation_header_t* next =
+ iree_atomic_task_allocation_slist_get_next(allocation);
+ iree_allocator_free(pool->allocator, allocation);
+ allocation = next;
+ }
+ }
+ iree_atomic_task_allocation_slist_deinitialize(&pool->allocations_slist);
+ iree_atomic_task_slist_deinitialize(&pool->available_slist);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_pool_trim(iree_task_pool_t* pool) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ // NOTE: this is only safe if there are no outstanding tasks.
+ // Hopefully the caller read the docstring!
+
+ // We only need to flush the list to empty it - these are just references into
+ // the allocations and don't need to be released.
+ iree_task_t* task_head = NULL;
+ iree_atomic_task_slist_flush(&pool->available_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+ &task_head, /*tail=*/NULL);
+
+ iree_task_allocation_header_t* allocation_head = NULL;
+ if (iree_atomic_task_allocation_slist_flush(
+ &pool->allocations_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &allocation_head,
+ /*tail=*/NULL)) {
+ do {
+ iree_task_allocation_header_t* next =
+ iree_atomic_task_allocation_slist_get_next(allocation_head);
+ iree_allocator_free(pool->allocator, allocation_head);
+ allocation_head = next;
+ } while (allocation_head != NULL);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_task_pool_acquire(iree_task_pool_t* pool,
+ iree_task_t** out_task) {
+ if (!pool) return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED);
+
+ // Attempt to acquire a task from the available list.
+ iree_task_t* task = iree_atomic_task_slist_pop(&pool->available_slist);
+ if (task) {
+ *out_task = task;
+ return iree_ok_status();
+ }
+
+ // No tasks were available when we tried; force growth now.
+ // Note that due to races it's possible that there are now tasks that have
+ // been released back into the pool, but the fact that we failed once means
+ // we are sitting right at the current limit of the pool and growing will
+ // help ensure we go down the fast path more frequently in the future.
+ return iree_task_pool_grow(pool, IREE_TASK_POOL_MIN_GROWTH_CAPACITY,
+ out_task);
+}
+
+iree_status_t iree_task_pool_acquire_many(iree_task_pool_t* pool,
+ iree_host_size_t count,
+ iree_task_list_t* out_list) {
+ if (!pool) return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED);
+
+ // If we acquire more than the requested count we need to give those leftovers
+ // back to the pool before we leave.
+ iree_task_list_t leftover_tasks;
+ iree_task_list_initialize(&leftover_tasks);
+ iree_task_list_initialize(out_list);
+
+ iree_status_t status = iree_ok_status();
+ while (count) {
+ // Flush the entire available list so we can start operating on it.
+ // This is where the potential race comes in: if another thread goes to
+ // acquire a task while we have the list local here it'll grow the list so
+ // it can meet its demand. That's still correct behavior but will result in
+ // potentially more wasted memory than if the other thread would have
+ // waited. Thankfully we save memory in so many other places that in the
+ // rare case there are multiple concurrent schedulers acquiring tasks it's
+ // not the end of the world.
+ iree_task_list_t acquired_tasks;
+ iree_task_list_initialize(&acquired_tasks);
+ if (iree_atomic_task_slist_flush(
+ &pool->available_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+ &acquired_tasks.head,
+ /*tail=*/NULL)) {
+ // Had some items in the pool; eat up to the requested count.
+ // Note that we may run out and need to allocate more or have gotten
+ // too many during the flush and need to track those leftovers.
+ //
+ // Instead of having the slist flush walk the list and give us a tail we
+ // do that here: we need to walk the list anyway to partition it.
+      iree_task_t* p = acquired_tasks.head;
+      acquired_tasks.tail = p;
+      --count;  // the head of the flushed list is the first acquired task
+      while (count > 0) {
+        iree_task_t* next = iree_atomic_task_slist_get_next(p);
+        if (!next) break;
+        p = next;
+        acquired_tasks.tail = p;
+        --count;
+      }
+
+ // If we got everything we need then we have to put all of the flushed
+ // tasks we didn't use into the leftover list.
+      if (count == 0) {
+        iree_task_list_t acquire_leftovers;
+        iree_task_list_initialize(&acquire_leftovers);
+        acquire_leftovers.head =
+            iree_atomic_task_slist_get_next(acquired_tasks.tail);
+        iree_atomic_task_slist_set_next(acquired_tasks.tail, NULL);
+        if (acquire_leftovers.head) {
+          // Walk to the end of the leftovers so we can concat them back into
+          // the pool in a single operation below.
+          p = acquire_leftovers.head;
+          iree_task_t* next;
+          while ((next = iree_atomic_task_slist_get_next(p))) p = next;
+          acquire_leftovers.tail = p;
+          iree_task_list_append(&leftover_tasks, &acquire_leftovers);
+        }
+      }
+
+ // Add the tasks we did acquire to our result list.
+ // NOTE: this is unmeasured but the intuition is that we want to put the
+ // tasks we just acquired at the head of the list so that they are warm
+ // upon return to the caller who will then be touching the head of the
+ // list immediately.
+ iree_task_list_prepend(out_list, &acquired_tasks);
+ }
+
+ // If we still need more tasks but ran out of ones in the flush list then we
+ // need to grow some more.
+ if (count > 0) {
+ status = iree_task_pool_grow(pool, count, /*out_task=*/NULL);
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
+ }
+ }
+
+ // Return leftovers that we acquired but didn't need to the pool.
+ iree_atomic_task_slist_concat(&pool->available_slist, leftover_tasks.head,
+ leftover_tasks.tail);
+
+ // Upon failure return any tasks we may have already acquired from the pool.
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+ iree_atomic_task_slist_concat(&pool->available_slist, out_list->head,
+ out_list->tail);
+ }
+
+ return status;
+}
+
+void iree_task_pool_release(iree_task_pool_t* pool, iree_task_t* task) {
+ if (!pool) return;
+ assert(task->pool == pool);
+ iree_atomic_task_slist_push(&pool->available_slist, task);
+}
diff --git a/iree/task/pool.h b/iree/task/pool.h
new file mode 100644
index 0000000..7610ff7
--- /dev/null
+++ b/iree/task/pool.h
@@ -0,0 +1,122 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_POOL_H_
+#define IREE_TASK_POOL_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// An allocation of tasks in a task pool containing multiple tasks.
+// This struct is at the head of all task allocations made from the allocator.
+// It is used to form a linked list of all allocations made so that they can be
+// easily freed during pool teardown.
+typedef struct {
+ // Next allocation in the linked list of allocations.
+ iree_atomic_slist_intrusive_ptr_t* next;
+} iree_task_allocation_header_t;
+
+// An atomic approximately LIFO singly-linked list.
+IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_atomic_task_allocation,
+ iree_task_allocation_header_t,
+ offsetof(iree_task_allocation_header_t, next));
+
+// Shared thread-safe pool of iree_task_t structures of a particular size.
+// This can be used to quickly allocate blocks of tasks to be initialized by
+// task producers, enqueued, and then eventually recycled back to the pool.
+//
+// The lifetime of all tasks must be less than the pool they were acquired
+// from. Tasks acquired from one pool must not be released to another pool or
+// via any other mechanism.
+//
+// Pools can either be fixed-size with a maximum number of available tasks that
+// can be outstanding at any time or growable to allow the pool to be grown
+// unbounded after initialization.
+typedef struct iree_task_pool_s {
+ // Allocator used for allocating/freeing each allocation block.
+ iree_allocator_t allocator;
+
+ // Task size, in bytes.
+ iree_host_size_t task_size;
+
+ // NOTE: we don't track current usage count as that would introduce additional
+ // contention as tasks are acquired/released. If we end up finding a lot of
+ // memory idling here we can add a threshold over which we reclaim it, but the
+ // easiest (and most efficient) solution is to force the user to synchronize
+ // with the executor on a low memory event and use iree_task_pool_trim.
+
+ // Head of a linked list of all allocations made by the pool.
+ iree_atomic_task_allocation_slist_t allocations_slist;
+
+ // Linked list of free tasks used as a stack (LIFO).
+ // This is not a great structure for this as over time the tasks will get out
+ // of order and walking the linked list will incur cache misses. We offset
+ // that cost a bit by knowing that the time between walking the list to
+ // acquire tasks and when we initialize the tasks is short and that we would
+ // have triggered a cache miss anyway. In the future we can explore other
+ // approaches (such as small chunked linear lists) that better exploit spatial
+ // locality, if needed.
+ iree_atomic_task_slist_t available_slist;
+} iree_task_pool_t;
+
+// Initializes a task pool and optionally performs an initial task allocation.
+iree_status_t iree_task_pool_initialize(iree_allocator_t allocator,
+ iree_host_size_t task_size,
+ iree_host_size_t initial_capacity,
+ iree_task_pool_t* out_pool);
+
+// Deinitializes a task pool and releases all task allocations back to the
+// allocator specified during initialization. All tasks must have already been
+// released back to the pool.
+void iree_task_pool_deinitialize(iree_task_pool_t* pool);
+
+// Attempts to trim unused allocations from the task pool.
+// Must not be called while any tasks that were acquired from this pool are
+// still live; callers must synchronize with the executor and ensure they aren't
+// pushing any more work during the trim operation.
+void iree_task_pool_trim(iree_task_pool_t* pool);
+
+// Acquires a task from the task pool. The returned task will have undefined
+// contents and must be initialized by the caller.
+iree_status_t iree_task_pool_acquire(iree_task_pool_t* pool,
+ iree_task_t** out_task);
+
+// Acquires a set of tasks from the task pool. The returned tasks will have
+// undefined contents besides their intrusive next pointers and must be
+// initialized by the caller.
+//
+// WARNING: this may cause growth during races if multiple threads are trying to
+// acquire at the same time. Our usage patterns here are such that this is never
+// the case, though, as all acquisition from the internal executor pools happens
+// with the coordination lock held.
+iree_status_t iree_task_pool_acquire_many(iree_task_pool_t* pool,
+ iree_host_size_t count,
+ iree_task_list_t* out_list);
+
+// Releases a task to the task pool.
+// Callers must ensure the task is no longer in use.
+void iree_task_pool_release(iree_task_pool_t* pool, iree_task_t* task);
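+
+// Example usage (an illustrative sketch; the pooled task type and capacity
+// below are arbitrary choices, not requirements of this API):
+//   iree_task_pool_t pool;
+//   IREE_RETURN_IF_ERROR(iree_task_pool_initialize(
+//       iree_allocator_system(), sizeof(iree_task_call_t),
+//       /*initial_capacity=*/128, &pool));
+//   iree_task_t* task = NULL;
+//   IREE_RETURN_IF_ERROR(iree_task_pool_acquire(&pool, &task));
+//   // ... initialize |task| as an iree_task_call_t and submit it ...
+//   iree_task_pool_release(&pool, task);  // usually done as the task retires
+//   iree_task_pool_deinitialize(&pool);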
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_POOL_H_
diff --git a/iree/task/pool_test.cc b/iree/task/pool_test.cc
new file mode 100644
index 0000000..641292f
--- /dev/null
+++ b/iree/task/pool_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/pool.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(PoolTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/post_batch.c b/iree/task/post_batch.c
new file mode 100644
index 0000000..111c39c
--- /dev/null
+++ b/iree/task/post_batch.c
@@ -0,0 +1,189 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/post_batch.h"
+
+#include "iree/base/math.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/threading.h"
+#include "iree/base/tracing.h"
+#include "iree/task/executor_impl.h"
+
+void iree_task_post_batch_initialize(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ iree_task_post_batch_t* out_post_batch) {
+ out_post_batch->executor = executor;
+ out_post_batch->current_worker = current_worker;
+ out_post_batch->worker_pending_mask = 0;
+ memset(&out_post_batch->worker_pending_lifos, 0,
+ executor->worker_count * sizeof(iree_task_list_t));
+}
+
+iree_host_size_t iree_task_post_batch_worker_count(
+ const iree_task_post_batch_t* post_batch) {
+ return post_batch->executor->worker_count;
+}
+
+static iree_host_size_t iree_task_post_batch_select_random_worker(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set) {
+ iree_task_affinity_set_t worker_live_mask =
+ iree_atomic_task_affinity_set_load(
+ &post_batch->executor->worker_live_mask, iree_memory_order_relaxed);
+ iree_task_affinity_set_t valid_worker_mask = affinity_set & worker_live_mask;
+ if (!valid_worker_mask) {
+ // No valid workers as desired; for now just bail to worker 0.
+ return 0;
+ }
+
+  // TODO(benvanik): rotate through workers here instead of always taking the
+  // first valid worker in the mask; a round-robin (or truly random) pick would
+  // spread load better when many workers are allowed.
+ return iree_task_affinity_set_count_trailing_zeros(valid_worker_mask);
+}
+
+iree_host_size_t iree_task_post_batch_select_worker(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set) {
+ if (post_batch->current_worker) {
+ // Posting from a worker - prefer sending right back to this worker.
+ if (affinity_set & post_batch->current_worker->worker_bit) {
+ return iree_task_affinity_set_count_trailing_zeros(
+ post_batch->current_worker->worker_bit);
+ }
+ }
+
+  // Prefer workers that are idle: even though they'll need to wake up it is
+  // guaranteed that they aren't working on something else, and the latency of
+  // waking should (hopefully) be less than the latency of waiting for a busy
+  // worker's queue to drain.
+ iree_task_affinity_set_t worker_idle_mask =
+ iree_atomic_task_affinity_set_load(
+ &post_batch->executor->worker_idle_mask, iree_memory_order_relaxed);
+ iree_task_affinity_set_t idle_affinity_set = affinity_set & worker_idle_mask;
+ if (idle_affinity_set) {
+ return iree_task_post_batch_select_random_worker(post_batch,
+ idle_affinity_set);
+ }
+
+ // No more workers are idle; farm out at random. In the worst case work
+ // stealing will help balance things out on the backend.
+ return iree_task_post_batch_select_random_worker(post_batch, affinity_set);
+}
+
+void iree_task_post_batch_enqueue(iree_task_post_batch_t* post_batch,
+ iree_host_size_t worker_index,
+ iree_task_t* task) {
+ iree_task_list_push_front(&post_batch->worker_pending_lifos[worker_index],
+ task);
+ post_batch->worker_pending_mask |=
+ iree_task_affinity_for_worker(worker_index);
+}
+
+// Wakes each worker indicated in the |wake_mask|, if needed.
+static void iree_task_post_batch_wake_workers(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t wake_mask) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, iree_math_count_ones_u64(wake_mask));
+
+ iree_task_executor_t* executor = post_batch->executor;
+
+ // Wake workers that may be suspended. We fetch the set of workers we need to
+ // wake (hopefully none in the common case) and mark that we've woken them so
+ // that we don't double-resume.
+ iree_task_affinity_set_t resume_mask =
+ iree_atomic_task_affinity_set_fetch_and(&executor->worker_suspend_mask,
+ ~wake_mask,
+ iree_memory_order_acquire);
+ resume_mask &= wake_mask;
+ if (IREE_UNLIKELY(resume_mask)) {
+ int resume_count = iree_task_affinity_set_count_ones(resume_mask);
+ int worker_index = 0;
+ for (int i = 0; i < resume_count; ++i) {
+      int offset = iree_task_affinity_set_count_trailing_zeros(resume_mask);
+ int resume_index = worker_index + offset;
+ worker_index += offset + 1;
+ resume_mask = resume_mask >> (offset + 1);
+ iree_thread_resume(executor->workers[resume_index].thread);
+ }
+ }
+
+ // TODO(#4016): use a FUTEX_WAKE_BITSET here to wake all of the workers that
+ // have pending work in a single syscall (vs. popcnt(worker_pending_mask)
+ // syscalls). This will reduce wake latency for workers later in the set;
+ // for example today worker[31] will wait until workers[0-30] have had their
+ // syscalls performed before it's even requested to wake. This also loses
+ // information the kernel could use to avoid core migration as it knows when N
+ // threads will be needed simultaneously and can hopefully perform any needed
+ // migrations prior to beginning execution.
+ int wake_count = iree_task_affinity_set_count_ones(wake_mask);
+ int worker_index = 0;
+ for (int i = 0; i < wake_count; ++i) {
+ int offset = iree_task_affinity_set_count_trailing_zeros(wake_mask);
+ int wake_index = worker_index + offset;
+ worker_index += offset + 1;
+ wake_mask = wake_mask >> (offset + 1);
+
+    // Wake workers if they are waiting - workers are the only thing that can
+    // wait on this notification, so in the common case where a particular
+    // worker isn't waiting this is nearly free (an atomic load); when it is
+    // waiting we genuinely need the wake and can't avoid the cost.
+ iree_task_worker_t* worker = &executor->workers[wake_index];
+ iree_notification_post(&worker->wake_notification, 1);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+bool iree_task_post_batch_submit(iree_task_post_batch_t* post_batch) {
+ if (!post_batch->worker_pending_mask) return false;
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Run through each worker that has a bit set in the pending mask and post
+ // the pending tasks.
+ iree_task_affinity_set_t worker_mask = post_batch->worker_pending_mask;
+ post_batch->worker_pending_mask = 0;
+ int worker_index = 0;
+ int post_count = iree_task_affinity_set_count_ones(worker_mask);
+ iree_task_affinity_set_t worker_wake_mask = 0;
+ for (int i = 0; i < post_count; ++i) {
+ int offset = iree_task_affinity_set_count_trailing_zeros(worker_mask);
+ int target_index = worker_index + offset;
+ worker_index += offset + 1;
+ worker_mask = worker_mask >> (offset + 1);
+
+ iree_task_worker_t* worker = &post_batch->executor->workers[target_index];
+ iree_task_list_t* target_pending_lifo =
+ &post_batch->worker_pending_lifos[target_index];
+ if (worker == post_batch->current_worker) {
+ // Fast-path for posting to self; this happens when a worker plays the
+ // role of coordinator and we want to ensure we aren't doing a fully
+ // block-and-flush loop when we could just be popping the next new task
+ // off the list.
+ iree_task_queue_append_from_lifo_list_unsafe(&worker->local_task_queue,
+ target_pending_lifo);
+ } else {
+ iree_task_worker_post_tasks(worker, target_pending_lifo);
+ worker_wake_mask |= iree_task_affinity_for_worker(target_index);
+ }
+ }
+
+ // Wake all workers that now have pending work. If a worker is not already
+ // waiting this will be cheap (no syscall).
+ if (worker_wake_mask != 0) {
+ iree_task_post_batch_wake_workers(post_batch, worker_wake_mask);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return post_count != 0;
+}
diff --git a/iree/task/post_batch.h b/iree/task/post_batch.h
new file mode 100644
index 0000000..944fdd7
--- /dev/null
+++ b/iree/task/post_batch.h
@@ -0,0 +1,77 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_POST_BATCH_H_
+#define IREE_TASK_POST_BATCH_H_
+
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+typedef struct iree_task_worker_s iree_task_worker_t;
+
+// Transient/stack-allocated structure for batching up tasks for posting to
+// worker mailboxes in single operations. This avoids the need to repeatedly
+// thrash caches during coordination as only during submission are the worker
+// mailboxes touched and only once per worker.
+typedef struct {
+ iree_task_executor_t* executor;
+
+ // Local worker constructing the post batch.
+ // This is used to know when lighter-weight queuing can occur (no need to
+ // post across a mailbox channel to yourself!).
+ // May be NULL if not being posted from a worker (such as a submission).
+ iree_task_worker_t* current_worker;
+
+ // A bitmask of workers indicating which have pending tasks in their lists.
+ // Used to quickly scan the lists and perform the posts only when required.
+ iree_task_affinity_set_t worker_pending_mask;
+
+ // A per-worker LIFO task list waiting to be posted.
+ iree_task_list_t worker_pending_lifos[0];
+} iree_task_post_batch_t;
+
+void iree_task_post_batch_initialize(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ iree_task_post_batch_t* out_post_batch);
+
+// Returns the total number of workers that the post batch is targeting.
+iree_host_size_t iree_task_post_batch_worker_count(
+ const iree_task_post_batch_t* post_batch);
+
+// Selects a random worker from the given affinity set.
+iree_host_size_t iree_task_post_batch_select_worker(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set);
+
+// Enqueues a task to the given worker. Note that the pending work list for
+// each worker is kept in LIFO order so that we can easily concatenate it with
+// the worker's mailbox slist, which is also in LIFO order.
+void iree_task_post_batch_enqueue(iree_task_post_batch_t* post_batch,
+ iree_host_size_t worker_index,
+ iree_task_t* task);
+
+// Submits all pending tasks to their worker mailboxes and resets state.
+// Returns true if any tasks were posted to workers.
+bool iree_task_post_batch_submit(iree_task_post_batch_t* post_batch);
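+
+// Example coordination flow (an illustrative sketch; kWorkerCount and the
+// stack storage below are placeholders, as in practice the executor sizes and
+// allocates this structure itself during coordination):
+//   uint8_t storage[sizeof(iree_task_post_batch_t) +
+//                   kWorkerCount * sizeof(iree_task_list_t)];
+//   iree_task_post_batch_t* post_batch = (iree_task_post_batch_t*)storage;
+//   iree_task_post_batch_initialize(executor, /*current_worker=*/NULL,
+//                                   post_batch);
+//   iree_host_size_t worker_index =
+//       iree_task_post_batch_select_worker(post_batch, task->affinity_set);
+//   iree_task_post_batch_enqueue(post_batch, worker_index, task);
+//   iree_task_post_batch_submit(post_batch);  // posts and wakes workers once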
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_POST_BATCH_H_
diff --git a/iree/task/queue.c b/iree/task/queue.c
new file mode 100644
index 0000000..d546b0d
--- /dev/null
+++ b/iree/task/queue.c
@@ -0,0 +1,97 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/queue.h"
+
+#include <assert.h>
+
+void iree_task_queue_initialize(iree_task_queue_t* out_queue) {
+ memset(out_queue, 0, sizeof(*out_queue));
+ iree_slim_mutex_initialize(&out_queue->mutex);
+ iree_task_list_initialize(&out_queue->list);
+}
+
+void iree_task_queue_deinitialize(iree_task_queue_t* queue) {
+ iree_task_list_discard(&queue->list);
+ iree_slim_mutex_deinitialize(&queue->mutex);
+}
+
+bool iree_task_queue_is_empty(iree_task_queue_t* queue) {
+ iree_slim_mutex_lock(&queue->mutex);
+ bool is_empty = iree_task_list_is_empty(&queue->list);
+ iree_slim_mutex_unlock(&queue->mutex);
+ return is_empty;
+}
+
+void iree_task_queue_push_front(iree_task_queue_t* queue, iree_task_t* task) {
+ iree_slim_mutex_lock(&queue->mutex);
+ iree_task_list_push_front(&queue->list, task);
+ iree_slim_mutex_unlock(&queue->mutex);
+}
+
+void iree_task_queue_append_from_lifo_list_unsafe(iree_task_queue_t* queue,
+ iree_task_list_t* list) {
+ // NOTE: reversing the list outside of the lock.
+ iree_task_list_reverse(list);
+ iree_slim_mutex_lock(&queue->mutex);
+ iree_task_list_append(&queue->list, list);
+ iree_slim_mutex_unlock(&queue->mutex);
+}
+
+iree_task_t* iree_task_queue_append_from_lifo_slist(
+ iree_task_queue_t* queue, iree_atomic_task_slist_t* source_slist) {
+ // Perform the flush and swap outside of the lock; acquiring the list is
+ // atomic and then we own it exclusively.
+ iree_task_list_t suffix;
+ iree_task_list_initialize(&suffix);
+ bool did_flush = iree_atomic_task_slist_flush(
+ source_slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO,
+ &suffix.head, &suffix.tail);
+
+ // Append the tasks and pop off the front for return.
+ iree_slim_mutex_lock(&queue->mutex);
+ if (did_flush) iree_task_list_append(&queue->list, &suffix);
+ iree_task_t* next_task = iree_task_list_pop_front(&queue->list);
+ iree_slim_mutex_unlock(&queue->mutex);
+
+ return next_task;
+}
+
+iree_task_t* iree_task_queue_pop_front(iree_task_queue_t* queue) {
+ iree_slim_mutex_lock(&queue->mutex);
+ iree_task_t* next_task = iree_task_list_pop_front(&queue->list);
+ iree_slim_mutex_unlock(&queue->mutex);
+ return next_task;
+}
+
+iree_task_t* iree_task_queue_try_steal(iree_task_queue_t* source_queue,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks) {
+ // First attempt to steal up to max_tasks from the source queue.
+ iree_task_list_t stolen_tasks;
+ iree_task_list_initialize(&stolen_tasks);
+ iree_slim_mutex_lock(&source_queue->mutex);
+ iree_task_list_split(&source_queue->list, max_tasks, &stolen_tasks);
+ iree_slim_mutex_unlock(&source_queue->mutex);
+
+ // Add any stolen tasks to the target queue and pop off the head for return.
+ iree_task_t* next_task = NULL;
+ if (!iree_task_list_is_empty(&stolen_tasks)) {
+ iree_slim_mutex_lock(&target_queue->mutex);
+ iree_task_list_append(&target_queue->list, &stolen_tasks);
+ next_task = iree_task_list_pop_front(&target_queue->list);
+ iree_slim_mutex_unlock(&target_queue->mutex);
+ }
+ return next_task;
+}
diff --git a/iree/task/queue.h b/iree/task/queue.h
new file mode 100644
index 0000000..6248551
--- /dev/null
+++ b/iree/task/queue.h
@@ -0,0 +1,172 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_QUEUE_H_
+#define IREE_TASK_QUEUE_H_
+
+#include "iree/base/api.h"
+#include "iree/base/synchronization.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// A simple mutex-guarded work-stealing queue loosely modeled on a Chase-Lev
+// concurrent deque. Workers use this to maintain their thread-local working
+// lists: the tasks they will process are kept in FIFO order, the list is
+// allowed to drain, and it is then refreshed with more tasks from the incoming
+// worker mailbox.
+// The performance bias here is toward the owning worker, which performs >90%
+// of the accesses; the only other accesses come from thieves, and we'd rather
+// improve our work distribution than introduce a slowdown on the common path.
+//
+// A futex is used to synchronize access; because the common case is that of
+// only the worker that owns the queue touching it for pushing and popping items
+// this puts us into the sweet-spot of uncontended lightweight exclusive locks.
+// Since futexes are effectively just single machine words managed with atomic
+// ops we can avoid a lot of the traditional atomic tomfoolery found in systems
+// like these that originated prior to the introduction of futexes, while also
+// keeping close to the tiny overhead of the pure atomic solutions.
+//
+// We can also take advantage of the futex providing an actual exclusive region
+// such that our data structure can be whatever we want as opposed to needing to
+// be something that someone had figured out how to make atomic. For example,
+// common implementations of work-stealing queues are all bounded as unbounded
+// atomic deques are an unsolved problem in CS.
+//
+// Very rarely when another worker runs out of work it'll try to steal tasks
+// from nearby workers and use this queue type to do it: the assumption is that
+// it's better to take the last task the victim worker will get to so that in a
+// long list of tasks it remains chugging through the head of the list with good
+// cache locality. If we end up with a lot of theft, though, it's possible for
+// the cache pessimism inflicted on all of the thieves to outweigh the cache
+// benefits that stealing from the back gives the owning worker. Let's hope we
+// can schedule deterministic-enough tiles such that theft is rare!
+//
+// Our queue variant here is tuned for the use case we have: we exclusively
+// push in multiple tasks at a time (flushed from the mailbox) and exclusively
+// pop a single task at a time (what to work on next). The stealing part is batched
+// so that when a remote worker has to perform a theft it takes a good chunk of
+// tasks in one go (hopefully roughly half) to reduce the total overhead when
+// there is high imbalance in workloads.
+//
+// Flushing from the mailbox slist (LIFO) to our list (FIFO) requires a full
+// walk of the incoming task linked list. This is generally fine as the number
+// of tasks in any given flush is low(ish) and by walking in reverse order to
+// then process forward the cache should be hot as the worker starts making its
+// way back through the tasks. As we walk forward we'll be using the task fields
+// for execution and retiring of tasks (notifying dependencies/etc) and the
+// intrusive next pointer sitting next to those should be in-cache when we need
+// to access it. This, combined with slab allocation of tasks in command buffers
+// to begin with gives us the (probabilistically) same characteristics of a flat
+// array walked with an index as is common in other work queues but with the
+// flexibility to reorder tasks as we see fit (theft, redistribution/rotation,
+// reprioritization, etc).
+//
+// Similar concepts, though implemented with atomics:
+// "Dynamic Circular Work-Stealing Deque":
+// http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.170.1097&rep=rep1&type=pdf
+// "Correct and Efficient Work-Stealing for Weak Memory Models":
+// https://fzn.fr/readings/ppopp13.pdf
+// Motivating article:
+// https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
+//
+// Useful diagram from https://github.com/injinj/WSQ
+// Much of this implementation is inspired by that, though significant
+// reworking was required for our FIFO->LIFO->FIFO sandwich.
+// +--------+ <- tasks[0]
+// | top | <- stealers consume here: task = tasks[top++]
+// | |
+// | || |
+// | |
+// | vv |
+// | bottom | <- owner pushes here: tasks[bottom++] = task
+// | | owner consumes here: task = tasks[--bottom]
+// | |
+// +--------+ <- tasks[IREE_TASK_QUEUE_CAPACITY-1]
+//
+// Unlike that implementation our task list is unbounded because we use a
+// linked list. To keep our options open, though, the API of this implementation
+// is kept compatible with classic atomic work-stealing queues. Hopefully it
+// won't need to be revisited for a while!
+//
+// Future improvement idea: have the owner of the queue maintain a theft point
+// skip list that makes it possible for thieves to quickly come in and slice
+// off batches of tasks at the tail of the queue. Since we are a singly-linked
+// list we can't easily just walk backward and we don't want to be introducing
+// cache line contention as thieves start touching the same tasks as the worker
+// is while processing.
+typedef struct {
+ // Must be held when manipulating the queue. >90% accesses are by the owner.
+ iree_slim_mutex_t mutex;
+
+ // FIFO task list.
+ iree_task_list_t list IREE_GUARDED_BY(mutex);
+} iree_task_queue_t;
+
+// Initializes a work-stealing task queue in-place.
+void iree_task_queue_initialize(iree_task_queue_t* out_queue);
+
+// Deinitializes a task queue and clears all references.
+// Must not be called while any other worker may be attempting to steal tasks.
+void iree_task_queue_deinitialize(iree_task_queue_t* queue);
+
+// Returns true if the queue is empty.
+// Note that due to races this may return both false-positives and -negatives.
+bool iree_task_queue_is_empty(iree_task_queue_t* queue);
+
+// Pushes a task to the front of the queue.
+// Always prefer the multi-push variants (prepend/append) when adding more than
+// one task to the queue. This is mostly useful for exceptional cases such as
+// when a task may yield and need to be reprocessed after the worker resumes.
+//
+// Must only be called from the owning worker's thread.
+void iree_task_queue_push_front(iree_task_queue_t* queue, iree_task_t* task);
+
+// Appends a LIFO |list| of tasks to the queue.
+//
+// Must only be called from the owning worker's thread.
+void iree_task_queue_append_from_lifo_list_unsafe(iree_task_queue_t* queue,
+ iree_task_list_t* list);
+
+// Flushes the |source_slist| LIFO mailbox into the task queue in FIFO order.
+// Returns the first task in the queue upon success; the task may be
+// pre-existing or from the newly flushed tasks.
+//
+// Must only be called from the owning worker's thread.
+iree_task_t* iree_task_queue_append_from_lifo_slist(
+ iree_task_queue_t* queue, iree_atomic_task_slist_t* source_slist);
+
+// Pops a task from the front of the queue if any are available.
+//
+// Must only be called from the owning worker's thread.
+iree_task_t* iree_task_queue_pop_front(iree_task_queue_t* queue);
+
+// Tries to steal up to |max_tasks| from the back of the queue.
+// Returns NULL if no tasks are available and otherwise up to |max_tasks| tasks
+// that were at the tail of the |source_queue| will be moved to the
+// |target_queue| and the first of the stolen tasks is returned.
+//
+// It's expected this is not called from the queue's owning worker, though it's
+// valid to do so.
+iree_task_t* iree_task_queue_try_steal(iree_task_queue_t* source_queue,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks);
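+
+// Example (an illustrative sketch; |task_a|, |task_b|, and |thief_queue| are
+// assumed to have been acquired/initialized elsewhere, and the push/pop calls
+// run on the owning worker's thread):
+//   iree_task_queue_t queue;
+//   iree_task_queue_initialize(&queue);
+//   iree_task_queue_push_front(&queue, task_a);
+//   iree_task_queue_push_front(&queue, task_b);
+//   iree_task_t* next = iree_task_queue_pop_front(&queue);  // -> task_b
+//   // A starving worker steals a batch from the back into its own queue:
+//   iree_task_t* stolen =
+//       iree_task_queue_try_steal(&queue, &thief_queue, /*max_tasks=*/8);
+//   iree_task_queue_deinitialize(&queue);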
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_QUEUE_H_
diff --git a/iree/task/queue_test.cc b/iree/task/queue_test.cc
new file mode 100644
index 0000000..9b8b448
--- /dev/null
+++ b/iree/task/queue_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/queue.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(QueueTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/scope.c b/iree/task/scope.c
new file mode 100644
index 0000000..f35a687
--- /dev/null
+++ b/iree/task/scope.c
@@ -0,0 +1,128 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/scope.h"
+
+void iree_task_scope_initialize(iree_string_view_t name,
+ iree_task_scope_t* out_scope) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ memset(out_scope, 0, sizeof(*out_scope));
+
+ iree_host_size_t name_length =
+ iree_min(name.size, IREE_ARRAYSIZE(out_scope->name) - 1);
+ memcpy(out_scope->name, name.data, name_length);
+  out_scope->name[name_length] = 0;
+
+ // TODO(benvanik): pick trace colors based on name hash.
+ IREE_TRACE(out_scope->task_trace_color = 0xFFFF0000u);
+
+ iree_notification_initialize(&out_scope->idle_notification);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_scope_deinitialize(iree_task_scope_t* scope) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ assert(iree_task_scope_is_idle(scope) &&
+ "pending submissions must be aborted prior to deinitializing their "
+ "scope");
+
+ // Makes it easier to see if we were incorrectly using the name even after the
+ // scope is deinitialized. Since scopes may be stack allocated we don't want
+ // to have anyone trying to access them (like tracy).
+ memset(scope->name, 0xCD, sizeof(scope->name));
+
+ // In most cases the status will have been consumed by the scope owner.
+ iree_status_t status = (iree_status_t)iree_atomic_exchange_ptr(
+ &scope->permanent_status, (uintptr_t)NULL, iree_memory_order_acquire);
+ IREE_IGNORE_ERROR(status);
+
+ iree_notification_deinitialize(&scope->idle_notification);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+static void iree_task_scope_try_set_status(iree_task_scope_t* scope,
+ iree_status_t new_status) {
+ if (IREE_UNLIKELY(iree_status_is_ok(new_status))) return;
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_TEXT(z0, "failed: ");
+ IREE_TRACE_ZONE_APPEND_TEXT(
+ z0, iree_status_code_string(iree_status_code(new_status)));
+
+ iree_status_t old_status = iree_ok_status();
+ if (!iree_atomic_compare_exchange_strong_ptr(
+ &scope->permanent_status, (uintptr_t*)&old_status,
+ (uintptr_t)new_status, iree_memory_order_seq_cst,
+ iree_memory_order_seq_cst)) {
+ // Previous status was not OK; drop our new status.
+ IREE_IGNORE_ERROR(new_status);
+ }
+
+ // TODO(#4026): poke to wake idle waiters.
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_scope_abort(iree_task_scope_t* scope) {
+ iree_status_t status =
+ iree_make_status(IREE_STATUS_ABORTED, "entire scope aborted by user");
+ iree_task_scope_try_set_status(scope, status);
+}
+
+void iree_task_scope_fail(iree_task_scope_t* scope, iree_task_t* task,
+ iree_status_t status) {
+ // TODO(benvanik): logging/tracing based on task.
+ iree_task_scope_try_set_status(scope, status);
+}
+
+iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope) {
+ iree_status_t old_status = iree_ok_status();
+ iree_status_t new_status = iree_ok_status();
+ while (!iree_atomic_compare_exchange_strong_ptr(
+ &scope->permanent_status, (uintptr_t*)&old_status, (uintptr_t)new_status,
+ iree_memory_order_seq_cst, iree_memory_order_seq_cst)) {
+    // Previous status was not OK; |old_status| now holds the real failure.
+    // Leave behind a code-only copy of it so the scope stays failed while we
+    // transfer ownership of the full status to the caller, then try again.
+    new_status = iree_status_from_code(iree_status_code(old_status));
+ }
+ return old_status;
+}
+
+iree_task_dispatch_statistics_t iree_task_scope_consume_statistics(
+ iree_task_scope_t* scope) {
+ iree_task_dispatch_statistics_t result = scope->dispatch_statistics;
+ memset(&scope->dispatch_statistics, 0, sizeof(scope->dispatch_statistics));
+ return result;
+}
+
+bool iree_task_scope_is_idle(iree_task_scope_t* scope) {
+ return iree_atomic_load_int32(&scope->pending_submissions,
+ iree_memory_order_relaxed) == 0;
+}
+
+iree_status_t iree_task_scope_wait_idle(iree_task_scope_t* scope,
+ iree_time_t deadline_ns) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Wait for the scope to enter the idle state.
+ // NOTE: we are currently ignoring |deadline_ns|.
+ iree_notification_await(&scope->idle_notification,
+ (iree_condition_fn_t)iree_task_scope_is_idle, scope);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
diff --git a/iree/task/scope.h b/iree/task/scope.h
new file mode 100644
index 0000000..ec10a34
--- /dev/null
+++ b/iree/task/scope.h
@@ -0,0 +1,136 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_SCOPE_H_
+#define IREE_TASK_SCOPE_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomics.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/tracing.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// A loose way of grouping tasks within the task system.
+// Each scope represents a unique collection of tasks that have some related
+// properties - most often their producer - that need to carry along some
+// tracking information to act on all related tasks at once. They do not
+// indicate any particular ordering of tasks or how the tasks are to be treated
+// by executors.
+//
+// Scopes can be used to signal, propagate, and retrieve failure statuses. As
+// the executor processes tasks in an unordered fashion this is the only way to
+// perform cross-task operations such as "abort all of the tasks from this
+// producer" or "wait until all tasks from this producer finish." In addition
+// there are statistics that can be aggregated across all tasks attributed to
+// the scope that allows for an efficient roll-up of activity over specific
+// durations.
+//
+// Task producers can decide whether to create new scopes for each batch of
+// tasks they submit or reuse scopes for the lifetime of their subprocess. Scope
+// overhead is low and the only advantage of reusing them is that lifetime can
+// become easier to manage by tying them 1:1 with producers.
+//
+// Thread-safe; once created scopes are modified exclusively via atomic
+// operations.
+typedef struct iree_task_scope_s {
+ // Name used for logging and tracing.
+ char name[16];
+
+ // Base color used for tasks in this scope.
+ // The color will be modulated based on task type.
+ IREE_TRACE(uint32_t task_trace_color;)
+
+ // A permanent status code set when a task within the scope fails. All pending
+ // tasks will be cancelled, though any in-flight tasks may continue executing
+ // to completion.
+ iree_atomic_ptr_t permanent_status;
+
+ // Dispatch statistics aggregated from all dispatches in this scope. Updated
+ // relatively infrequently and must not be used for task control as values
+ // are undefined in the case of failure and may tear.
+ iree_task_dispatch_statistics_t dispatch_statistics;
+
+ // A count of pending submissions within this scope. 0 indicates idle.
+ // Each submission has a fence that references this value and decrements it
+ // as it is reached indicating that all memory used by all tasks within that
+ // submission is available for reuse.
+ iree_atomic_int32_t pending_submissions;
+
+ // A notification signaled when the scope transitions to having no pending
+ // tasks or completes all pending tasks after a failure.
+ iree_notification_t idle_notification;
+} iree_task_scope_t;
+
+// Initializes a caller-allocated scope.
+// Callers must ensure the scope remains live for as long as there are any
+// tasks that may reference it.
+void iree_task_scope_initialize(iree_string_view_t name,
+ iree_task_scope_t* out_scope);
+
+// Deinitializes a task scope.
+// No tasks may be pending and the scope must be idle.
+void iree_task_scope_deinitialize(iree_task_scope_t* scope);
+
+// Marks the scope as having been aborted by the user with IREE_STATUS_ABORTED.
+// All pending tasks will be dropped though in-flight tasks may complete
+// execution. Callers must use iree_task_scope_wait_idle to ensure the scope
+// state synchronizes prior to deinitializing. If the scope has already been
+// aborted or failed with a permanent error then the operation is ignored and
+// the previous error status is preserved.
+void iree_task_scope_abort(iree_task_scope_t* scope);
+
+// Marks the scope as having encountered an error while processing |task|.
+// The scope will be moved into a permanent failure state and all pending tasks
+// will be aborted. In-flight tasks may continue executing prior to
+// iree_task_scope_wait_idle returning true. If the scope has already been
+// marked as failing then the status is ignored.
+void iree_task_scope_fail(iree_task_scope_t* scope, iree_task_t* task,
+ iree_status_t status);
+
+// Returns the permanent scope failure status to the caller (transferring
+// ownership). The scope will remain in a failed state with the status code.
+iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope);
+
+// Returns and resets the statistics for the scope.
+// Statistics may experience tearing (non-atomic update across fields) if this
+// is performed while tasks are in-flight.
+iree_task_dispatch_statistics_t iree_task_scope_consume_statistics(
+ iree_task_scope_t* scope);
+
+// Returns true if the scope has no pending or in-flight tasks.
+//
+// May race with other threads enqueuing work and be out of date immediately
+// upon return; callers are expected to use this only when it is safe.
+bool iree_task_scope_is_idle(iree_task_scope_t* scope);
+
+// Waits for the scope to become idle indicating that all pending and in-flight
+// tasks have completed. If the scope is aborted or marked for permanent failure
+// then the wait will only return after it is guaranteed no more tasks will ever
+// be issued by the task system.
+//
+// May race with other threads enqueuing work and be out of date immediately
+// upon return; callers must ensure this is used for command and control
+// decisions only when no other threads may be enqueuing more work.
+iree_status_t iree_task_scope_wait_idle(iree_task_scope_t* scope,
+ iree_time_t deadline_ns);
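+
+// Example lifetime (an illustrative sketch; how the consumed status is handled
+// is up to the caller):
+//   iree_task_scope_t scope;
+//   iree_task_scope_initialize(iree_make_cstring_view("my_scope"), &scope);
+//   // ... submit work attributed to &scope to an executor ...
+//   IREE_RETURN_IF_ERROR(
+//       iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE));
+//   iree_status_t status = iree_task_scope_consume_status(&scope);
+//   iree_task_scope_deinitialize(&scope);
+//   return status;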
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_SCOPE_H_
diff --git a/iree/task/scope_test.cc b/iree/task/scope_test.cc
new file mode 100644
index 0000000..9f42dd4
--- /dev/null
+++ b/iree/task/scope_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/scope.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(ScopeTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/submission.c b/iree/task/submission.c
new file mode 100644
index 0000000..6a1aefe
--- /dev/null
+++ b/iree/task/submission.c
@@ -0,0 +1,65 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/submission.h"
+
+#include "iree/base/debugging.h"
+
+void iree_task_submission_initialize(iree_task_submission_t* out_submission) {
+ iree_task_list_initialize(&out_submission->ready_list);
+ iree_task_list_initialize(&out_submission->waiting_list);
+}
+
+void iree_task_submission_initialize_from_lifo_slist(
+ iree_atomic_task_slist_t* ready_slist,
+ iree_task_submission_t* out_submission) {
+ // Flush from the LIFO ready list to the LIFO submission queue.
+ // We have to walk everything here to get the tail pointer, which could be
+ // improved by sourcing from something other than an slist.
+ iree_task_submission_initialize(out_submission);
+ iree_atomic_task_slist_flush(
+ ready_slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+ &out_submission->ready_list.head, &out_submission->ready_list.tail);
+}
+
+void iree_task_submission_reset(iree_task_submission_t* submission) {
+ memset(&submission->ready_list, 0, sizeof(submission->ready_list));
+ memset(&submission->waiting_list, 0, sizeof(submission->waiting_list));
+}
+
+void iree_task_submission_discard(iree_task_submission_t* submission) {
+ iree_task_list_discard(&submission->ready_list);
+ iree_task_list_discard(&submission->waiting_list);
+}
+
+bool iree_task_submission_is_empty(iree_task_submission_t* submission) {
+ return iree_task_list_is_empty(&submission->ready_list) &&
+ iree_task_list_is_empty(&submission->waiting_list);
+}
+
+void iree_task_submission_enqueue(iree_task_submission_t* submission,
+ iree_task_t* task) {
+ IREE_ASSERT_TRUE(iree_task_is_ready(task),
+ "must be a root task to be enqueued on a submission");
+ if (task->type == IREE_TASK_TYPE_WAIT &&
+ (task->flags & IREE_TASK_FLAG_WAIT_COMPLETED) == 0) {
+ // A wait that we know is unresolved and can immediately route to the
+ // waiting list. This avoids the need to try to schedule the wait when it's
+ // almost certain that the wait would not be satisfied.
+ iree_task_list_push_front(&submission->waiting_list, task);
+ } else {
+ // Task is ready to execute immediately.
+ iree_task_list_push_front(&submission->ready_list, task);
+ }
+}
diff --git a/iree/task/submission.h b/iree/task/submission.h
new file mode 100644
index 0000000..0a1e84e
--- /dev/null
+++ b/iree/task/submission.h
@@ -0,0 +1,102 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_SUBMISSION_H_
+#define IREE_TASK_SUBMISSION_H_
+
+#include "iree/base/api.h"
+#include "iree/task/list.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// A pending submission to a task queue made up of a DAG of tasks.
+// Tasks are executed when ready in the order they were enqueued while observing
+// all dependencies. This means that two tasks that have no dependencies may
+// execute out of order/overlap.
+//
+// By keeping track of which tasks are ready for execution (ready_list) upon
+// submission to a queue we avoid the need to walk the task list again and
+// instead only touch the waiting tasks during construction and as they are made
+// ready, avoiding needless work and cache thrashing.
+//
+// Waiting tasks (waiting_list) are those waiting on external dependencies such
+// as file descriptor wait handles. Because we track all of these the executor
+// can perform an efficient multi-wait across queues without needing to block
+// (or even check) every waiting task individually.
+//
+// Because we only track roots of the DAG to release all tasks in a submission
+// early (due to failure or shutdown) the DAG must be walked. Releasing just the
+// lists will only handle the roots and leave all the rest of the tasks
+// dangling.
+//
+// Thread-compatible; designed to be used from a single thread producing the
+// submission.
+typedef struct {
+ // List of tasks that are ready for execution immediately. Upon submission to
+ // a queue the tasks will be passed on to the executor with no delay.
+ //
+ // Tasks are stored in LIFO order; this allows us to quickly concat them with
+ // incoming/mailbox slists that are naturally in LIFO order and that may
+ // contain tasks from prior submissions. Note that we are representing a
+ // ready list - meaning that all tasks are able to start simultaneously (in
+ // the best case where tasks <= workers); this means that the ordering
+ // requirements here are purely for performance and ease of debugging. In
+ // cases where tasks >> workers we could also see some benefits from the
+ // eventual FIFO order matching how the tasks were allocated.
+ iree_task_list_t ready_list;
+
+ // List of tasks that are waiting for execution on external dependencies.
+ // These are root tasks that have no internal task dependencies.
+ // Order is not important here; the assumption is that all waiting tasks are
+ // more of a set than an ordered list and that they can all be waited on as a
+ // multi-wait-any.
+ iree_task_list_t waiting_list;
+} iree_task_submission_t;
+
+// Initializes a task submission.
+void iree_task_submission_initialize(iree_task_submission_t* out_submission);
+
+// Flushes the given |ready_slist| and initializes |out_submission| with all of
+// its tasks in LIFO order. All tasks in |ready_slist| are assumed to be ready
+// for execution immediately.
+void iree_task_submission_initialize_from_lifo_slist(
+ iree_atomic_task_slist_t* ready_slist,
+ iree_task_submission_t* out_submission);
+
+// Resets the submission by dropping the list references.
+void iree_task_submission_reset(iree_task_submission_t* submission);
+
+// Discards all pending tasks in the submission. This is only safe to call if
+// the submission has not yet been submitted to a queue for execution and should
+// be used for failure cleanup during submission construction.
+void iree_task_submission_discard(iree_task_submission_t* submission);
+
+// Returns true if the submission has no tasks.
+bool iree_task_submission_is_empty(iree_task_submission_t* submission);
+
+// Enqueues |task| to the pending |submission|.
+// The task will be checked to see whether it is immediately ready to execute
+// and placed in an appropriate list; all dependencies must be declared prior to
+// calling this method. After returning new tasks that depend on this task may
+// still be defined. The submission takes ownership of the |task|.
+void iree_task_submission_enqueue(iree_task_submission_t* submission,
+ iree_task_t* task);
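+
+// Example (an illustrative sketch; |scope| is a previously initialized
+// iree_task_scope_t and the final hand-off goes through the executor submit
+// API declared in executor.h rather than anything in this header):
+//   iree_task_submission_t submission;
+//   iree_task_submission_initialize(&submission);
+//   iree_task_nop_t nop;
+//   iree_task_nop_initialize(scope, &nop);
+//   iree_task_submission_enqueue(&submission, &nop.header);
+//   // ... enqueue any other root tasks, then submit to the executor ...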
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_SUBMISSION_H_
diff --git a/iree/task/task.c b/iree/task/task.c
new file mode 100644
index 0000000..38c708b
--- /dev/null
+++ b/iree/task/task.c
@@ -0,0 +1,793 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/task.h"
+
+#include <stdio.h>
+
+#include "iree/task/task_impl.h"
+
+//==============================================================================
+// Task bookkeeping
+//==============================================================================
+
+void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
+ iree_task_t* out_task) {
+ // NOTE: only clears the header, not the task body.
+ memset(out_task, 0, sizeof(*out_task));
+ out_task->scope = scope;
+ out_task->affinity_set = iree_task_affinity_for_any_worker();
+ out_task->type = type;
+}
+
+void iree_task_set_completion_task(iree_task_t* task,
+ iree_task_t* completion_task) {
+ assert(!task->completion_task);
+ task->completion_task = completion_task;
+ iree_atomic_fetch_add_int32(&completion_task->pending_dependency_count, 1,
+ iree_memory_order_seq_cst);
+}
+
+bool iree_task_is_ready(iree_task_t* task) {
+ if (iree_atomic_load_int32(&task->pending_dependency_count,
+ iree_memory_order_relaxed) > 0) {
+ // At least one dependency is still pending.
+ return false;
+ }
+ return true;
+}
+
+void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist) {
+ // NOTE: we always try adding to the head of the discard_worklist so that
+ // we hopefully get some locality benefits. This models a DFS discard in
+ // our non-recursive approach.
+
+ // Almost all tasks will have a completion task; some may have additional
+ // dependent tasks (like barriers) that will be handled below.
+ if (task->completion_task) {
+ iree_task_list_push_front(discard_worklist, task->completion_task);
+ }
+
+ switch (task->type) {
+ default:
+ case IREE_TASK_TYPE_NOP:
+ case IREE_TASK_TYPE_CALL:
+ break;
+ case IREE_TASK_TYPE_BARRIER: {
+ iree_task_barrier_t* barrier_task = (iree_task_barrier_t*)task;
+ for (uint32_t i = 0; i < barrier_task->dependent_task_count; ++i) {
+ iree_task_list_push_front(discard_worklist,
+ barrier_task->dependent_tasks[i]);
+ }
+ break;
+ }
+ case IREE_TASK_TYPE_FENCE: {
+ // TODO(benvanik): signal as error.
+ // iree_task_fence_t* fence_task = (iree_task_fence_t*)task;
+ iree_atomic_fetch_sub_int32(&task->scope->pending_submissions, 1,
+ iree_memory_order_relaxed);
+ break;
+ }
+ case IREE_TASK_TYPE_WAIT:
+ case IREE_TASK_TYPE_DISPATCH:
+ case IREE_TASK_TYPE_DISPATCH_SLICE:
+ break;
+ }
+
+ // Release the task back to the pool it was allocated from, if any.
+ // Some tasks are allocated from arenas and may not be able to be freed
+ // individually.
+ if (task->pool) {
+ iree_task_pool_release(task->pool, task);
+ }
+
+ // NOTE: task is invalidated here and cannot be used!
+}
+
+static void iree_task_retire(iree_task_t* task,
+ iree_task_submission_t* pending_submission) {
+ // Decrement the pending count on the completion task, if any.
+ iree_task_t* completion_task = task->completion_task;
+ if (completion_task &&
+ iree_atomic_fetch_sub_int32(&completion_task->pending_dependency_count, 1,
+ iree_memory_order_acq_rel) == 1) {
+ // This was the last pending dependency; the completion task is now ready.
+ iree_task_submission_enqueue(pending_submission, completion_task);
+ }
+ task->completion_task = NULL;
+
+ // Return the task to the pool it was allocated from.
+ // Some tasks are allocated as part of arenas/ringbuffers and won't have a
+ // pool as they'll be cleaned up as part of a larger operation.
+ if (task->pool) {
+ iree_task_pool_release(task->pool, task);
+ }
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+void iree_task_nop_initialize(iree_task_scope_t* scope,
+ iree_task_nop_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_NOP, scope, &out_task->header);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+void iree_task_call_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ iree_task_call_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_CALL, scope, &out_task->header);
+ out_task->closure = closure;
+}
+
+iree_status_t iree_task_call_execute(
+ iree_task_call_t* task, iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_status_t status =
+ task->closure.fn(task->closure.user_context, /*task_context=*/0);
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+void iree_task_barrier_initialize(iree_task_scope_t* scope,
+ iree_host_size_t dependent_task_count,
+ iree_task_t* const* dependent_tasks,
+ iree_task_barrier_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_BARRIER, scope, &out_task->header);
+ out_task->dependent_task_count = dependent_task_count;
+ out_task->dependent_tasks = dependent_tasks;
+ for (iree_host_size_t i = 0; i < out_task->dependent_task_count; ++i) {
+ iree_task_t* dependent_task = out_task->dependent_tasks[i];
+ iree_atomic_fetch_add_int32(&dependent_task->pending_dependency_count, 1,
+ iree_memory_order_relaxed);
+ }
+}
+
+void iree_task_barrier_retire(iree_task_barrier_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // NOTE: we walk in reverse so that we enqueue in LIFO order.
+ for (iree_host_size_t i = 0; i < task->dependent_task_count; ++i) {
+ iree_task_t* dependent_task =
+ task->dependent_tasks[task->dependent_task_count - i - 1];
+ if (iree_atomic_fetch_sub_int32(&dependent_task->pending_dependency_count,
+ 1, iree_memory_order_acq_rel) == 1) {
+ // This was the last pending dependency; the dependent task is now ready.
+ iree_task_submission_enqueue(pending_submission, dependent_task);
+ }
+ }
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+void iree_task_fence_initialize(iree_task_scope_t* scope,
+ iree_task_fence_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_FENCE, scope, &out_task->header);
+ iree_atomic_fetch_add_int32(&scope->pending_submissions, 1,
+ iree_memory_order_relaxed);
+}
+
+void iree_task_fence_retire(iree_task_fence_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_scope_t* scope = task->header.scope;
+ if (iree_atomic_fetch_sub_int32(&scope->pending_submissions, 1,
+ iree_memory_order_acq_rel) == 1) {
+ // All submissions have completed in this scope - notify any waiters.
+ iree_notification_post(&scope->idle_notification, IREE_ALL_WAITERS);
+ }
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+void iree_task_wait_initialize(iree_task_scope_t* scope,
+ iree_wait_handle_t wait_handle,
+ iree_task_wait_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_WAIT, scope, &out_task->header);
+ out_task->wait_handle = wait_handle;
+}
+
+bool iree_task_wait_check_condition(iree_task_wait_t* task) {
+ // TODO(benvanik): conditions.
+ task->header.flags |= IREE_TASK_FLAG_WAIT_COMPLETED;
+ return true;
+}
+
+void iree_task_wait_retire(iree_task_wait_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ // TODO(benvanik): allow deinit'ing the wait handle (if transient).
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_* utilities
+//==============================================================================
+
+// Returns an XXBBGGRR color (red in the lowest bits).
+// Must not be 0 (tracy will ignore).
+static uint32_t iree_task_tile_to_color(
+ const iree_task_tile_context_t* tile_context);
+
+#if defined(IREE_TASK_TRACING_PER_TILE_COLORS)
+
+// TODO(#4017): optimize this to compute entire slices at once and fold in the
+// work grid location code.
+static uint32_t iree_math_hsv_to_xrgb(const uint8_t h, const uint8_t s,
+ const uint8_t v) {
+ // NOTE: this matches tracy's TracyColor.cpp implementation so that our
+ // colors fit nicely in the UI.
+ const uint8_t reg = h / 43;
+ const uint8_t rem = (h - (reg * 43)) * 6;
+ const uint8_t p = (v * (255 - s)) >> 8;
+ const uint8_t q = (v * (255 - ((s * rem) >> 8))) >> 8;
+ const uint8_t t = (v * (255 - ((s * (255 - rem)) >> 8))) >> 8;
+
+ // clang-format off
+ uint8_t r, g, b;
+ switch (reg) {
+ case 0: r = v; g = t; b = p; break;
+ case 1: r = q; g = v; b = p; break;
+ case 2: r = p; g = v; b = t; break;
+ case 3: r = p; g = q; b = v; break;
+ case 4: r = t; g = p; b = v; break;
+ default: r = v; g = p; b = q; break;
+ }
+ // clang-format on
+
+ uint32_t xrgb = (r << 16) | (g << 8) | b;
+ xrgb |= (xrgb ? 0 : 1); // ensure never zero
+ return xrgb;
+}
+
+static uint32_t iree_task_tile_to_color(
+ const iree_task_tile_context_t* tile_context) {
+ // TODO(#4017): optimize such that it's always on when tracing is
+ // enabled by amortizing the cost across the entire slice.
+
+ // Picked to try to make it easy to see gradients from tiles along the same x,
+ // y, and z (in that order). x is the fastest changing dimension and as such
+ // should all have the same hue, while z is the slowest changing dimension and
+ // should have different hues.
+ uint8_t h = (tile_context->workgroup_xyz[1] /
+ (float)(tile_context->workgroup_count[1])) *
+ 255;
+ h = (h * 11400714819323198485ull) & 0xFF;
+ uint8_t s = 100 - (tile_context->workgroup_xyz[2] /
+ (float)(tile_context->workgroup_count[2])) *
+ 100;
+ uint8_t v = (tile_context->workgroup_xyz[0] /
+ (float)(tile_context->workgroup_count[0])) *
+ 50 +
+ 50;
+ return iree_math_hsv_to_xrgb(h, s, v);
+}
+
+#else
+
+static uint32_t iree_task_tile_to_color(
+ const iree_task_tile_context_t* tile_context) {
+ return 0; // use default tracy colors
+}
+
+#endif // IREE_TASK_TRACING_PER_TILE_COLORS
+
+void iree_task_dispatch_statistics_merge(
+ const iree_task_dispatch_statistics_t* source,
+ iree_task_dispatch_statistics_t* target) {
+ // TODO(benvanik): statistics.
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+static void iree_task_dispatch_initialize_base(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ iree_task_dispatch_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_DISPATCH, scope, &out_task->header);
+ out_task->closure = closure;
+ memcpy(out_task->workgroup_size, workgroup_size,
+ sizeof(out_task->workgroup_size));
+ out_task->shared_memory_size = 0;
+ memset(&out_task->statistics, 0, sizeof(out_task->statistics));
+}
+
+void iree_task_dispatch_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_t* out_task) {
+ iree_task_dispatch_initialize_base(scope, closure, workgroup_size, out_task);
+ memcpy(out_task->workgroup_count.value, workgroup_count,
+ sizeof(out_task->workgroup_count.value));
+}
+
+void iree_task_dispatch_initialize_indirect(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t* workgroup_count_ptr,
+ iree_task_dispatch_t* out_task) {
+ iree_task_dispatch_initialize_base(scope, closure, workgroup_size, out_task);
+ out_task->header.flags |= IREE_TASK_FLAG_DISPATCH_INDIRECT;
+ out_task->workgroup_count.ptr = workgroup_count_ptr;
+}
+
+void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
+ iree_task_pool_t* slice_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Mark the dispatch as having been issued; the next time it retires it'll be
+ // because all work has completed.
+ dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
+
+ // Fetch the workgroup count (directly or indirectly).
+ // Since the task is ready to execute we know that any dependencies on the
+ // indirection buffer have been satisfied and it's safe to read.
+ uint32_t workgroup_count[3];
+ if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
+ memcpy(workgroup_count, dispatch_task->workgroup_count.ptr,
+ sizeof(workgroup_count));
+ } else {
+ memcpy(workgroup_count, dispatch_task->workgroup_count.value,
+ sizeof(workgroup_count));
+ }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+ char xyz_string[32];
+ int xyz_string_length =
+ snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
+ workgroup_count[0], workgroup_count[1], workgroup_count[2]);
+ IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+ // Divide up all tiles into slices, our finest-granularity scheduling task.
+ const uint32_t tiles_per_slice_x = IREE_TASK_DISPATCH_TILES_PER_SLICE_X;
+ const uint32_t tiles_per_slice_y = IREE_TASK_DISPATCH_TILES_PER_SLICE_Y;
+ const uint32_t tiles_per_slice_z = IREE_TASK_DISPATCH_TILES_PER_SLICE_Z;
+ uint32_t slice_count_x = iree_max(1, workgroup_count[0] / tiles_per_slice_x);
+ uint32_t slice_count_y = iree_max(1, workgroup_count[1] / tiles_per_slice_y);
+ uint32_t slice_count_z = iree_max(1, workgroup_count[2] / tiles_per_slice_z);
+
+ // Compute how many slices each worker will process.
+ uint32_t slice_count = slice_count_x * slice_count_y * slice_count_z;
+ iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
+ uint32_t slices_per_worker = iree_max(1, slice_count / worker_count);
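+ // E.g. (illustrative numbers only): a 128x8x1 grid with 16x2x1 tiles per
+ // slice yields 8x4x1 = 32 slices, and with 8 workers each worker is posted
+ // 32/8 = 4 slices.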
+
+ // Randomize starting worker.
+ iree_host_size_t worker_offset = iree_task_post_batch_select_worker(
+ post_batch, dispatch_task->header.affinity_set);
+ iree_host_size_t worker_index = worker_offset;
+
+ // TODO(benvanik): rework this with some science. For now we just iteratively
+ // divide up the space from outer->inner scheduling dimension, but ideally
+ // we'd use some fun cray-style torus scheduling or hilbert curve magic to
+ // try to ensure better locality using worker constructive sharing masks.
+ // TODO(benvanik): observe affinity_set here when dividing ranges.
+ iree_host_size_t worker_slice_count = 0;
+ for (uint32_t slice_z = 0; slice_z < slice_count_z; ++slice_z) {
+ for (uint32_t slice_y = 0; slice_y < slice_count_y; ++slice_y) {
+ for (uint32_t slice_x = 0; slice_x < slice_count_x; ++slice_x) {
+ uint32_t workgroup_base[3];
+ workgroup_base[0] = slice_x * tiles_per_slice_x;
+ workgroup_base[1] = slice_y * tiles_per_slice_y;
+ workgroup_base[2] = slice_z * tiles_per_slice_z;
+ uint32_t workgroup_range[3];
+ workgroup_range[0] = iree_min(
+ workgroup_count[0], workgroup_base[0] + tiles_per_slice_x) - 1;
+ workgroup_range[1] = iree_min(
+ workgroup_count[1], workgroup_base[1] + tiles_per_slice_y) - 1;
+ workgroup_range[2] = iree_min(
+ workgroup_count[2], workgroup_base[2] + tiles_per_slice_z) - 1;
+
+ // Allocate and initialize the slice.
+ iree_task_dispatch_slice_t* slice_task =
+ iree_task_dispatch_slice_allocate(dispatch_task, workgroup_base,
+ workgroup_range, workgroup_count,
+ slice_task_pool);
+
+ // Enqueue on the worker selected for the task.
+ iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
+ &slice_task->header);
+ if (++worker_slice_count >= slices_per_worker) {
+ ++worker_index;
+ worker_slice_count = 0;
+ }
+ }
+ }
+ }
+
+ // NOTE: the dispatch is not retired until all slices complete. Upon the last
+ // slice completing the lucky worker will retire the task inline and
+ // potentially queue up more ready tasks that follow.
+ //
+ // The gotcha here is that it's possible for there to be zero slices within
+ // a dispatch (if, for example, an indirect dispatch had its workgroup counts
+ // set to zero to prevent it from running). We check for that here.
+ if (slice_count == 0) {
+ iree_task_dispatch_retire(dispatch_task, pending_submission);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_dispatch_issue_sharded(
+ iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Mark the dispatch as having been issued; the next time it retires it'll be
+ // because all work has completed.
+ dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
+
+ iree_task_dispatch_shard_state_t* shared_state =
+ &dispatch_task->shared.shard_state;
+ shared_state->dispatch_task = dispatch_task;
+
+ // Fetch the workgroup count (directly or indirectly).
+ // Since the task is ready to execute we know that any dependencies on the
+ // indirection buffer have been satisfied and it's safe to read.
+ if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
+ memcpy(shared_state->workgroup_count, dispatch_task->workgroup_count.ptr,
+ sizeof(shared_state->workgroup_count));
+ } else {
+ memcpy(shared_state->workgroup_count, dispatch_task->workgroup_count.value,
+ sizeof(shared_state->workgroup_count));
+ }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+ char xyz_string[32];
+ int xyz_string_length = snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string),
+ "%ux%ux%u", shared_state->workgroup_count[0],
+ shared_state->workgroup_count[1],
+ shared_state->workgroup_count[2]);
+ IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+ // TODO(benvanik): shared memory; likely pulled from a ringbuffer.
+ // We'll have to ensure we have the memory available prior to scheduling the
+ // dispatch and probably just pass it in as an argument in here.
+ shared_state->shared_memory = iree_make_byte_span(NULL, 0);
+
+ // Setup the iteration space for shards to pull work from the complete grid.
+ iree_atomic_store_int32(&shared_state->tile_index, 0,
+ iree_memory_order_relaxed);
+ shared_state->tile_count = shared_state->workgroup_count[0] *
+ shared_state->workgroup_count[1] *
+ shared_state->workgroup_count[2];
+
+ // Compute shard count - almost always worker_count unless we are a very small
+ // dispatch (1x1x1, etc).
+ iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
+ iree_host_size_t shard_count =
+ iree_min(shared_state->tile_count, worker_count);
+
+ // Compute how many tiles we want each shard to reserve at a time from the
+ // larger grid. A higher number reduces overhead and improves locality while
+ // a lower number reduces maximum worst-case latency (finer-grained stealing).
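+ // E.g. (illustrative numbers only): with 8 workers and a max reservation of
+ // 8 tiles, a 4096-tile grid reserves 8 tiles at a time while a 32-tile grid
+ // (under the 8*8=64 threshold) reserves 1 so that all workers participate.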
+ if (shared_state->tile_count <
+ worker_count * IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION) {
+ // Grid is small - allow it to be eagerly sliced up.
+ shared_state->tiles_per_reservation = 1;
+ } else {
+ shared_state->tiles_per_reservation =
+ IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION;
+ }
+
+ // Randomize starting worker.
+ iree_host_size_t worker_offset = iree_task_post_batch_select_worker(
+ post_batch, dispatch_task->header.affinity_set);
+ iree_host_size_t worker_index = worker_offset;
+
+ for (iree_host_size_t i = 0; i < shard_count; ++i) {
+ // Allocate and initialize the shard.
+ iree_task_dispatch_shard_t* shard_task = iree_task_dispatch_shard_allocate(
+ dispatch_task, shared_state, shard_task_pool);
+
+ // Enqueue on the worker selected for the task.
+ iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
+ &shard_task->header);
+ ++worker_index;
+ }
+
+ // NOTE: the dispatch is not retired until all shards complete. Upon the last
+ // shard completing the lucky worker will retire the task inline and
+ // potentially queue up more ready tasks that follow.
+ //
+ // The gotcha here is that it's possible for there to be zero shards within
+ // a dispatch (if, for example, an indirect dispatch had its workgroup counts
+ // set to zero to prevent it from running). We check for that here.
+ if (shard_count == 0) {
+ iree_task_dispatch_retire(dispatch_task, pending_submission);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // TODO(benvanik): attach statistics to the tracy zone.
+
+ // Merge the statistics from the dispatch into the scope so we can track all
+ // of the work without tracking all the dispatches at a global level.
+ iree_task_dispatch_statistics_merge(
+ &dispatch_task->statistics,
+ &dispatch_task->header.scope->dispatch_statistics);
+
+ iree_task_retire(&dispatch_task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SLICE
+//==============================================================================
+
+void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
+ const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_slice_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SLICE,
+ dispatch_task->header.scope, &out_task->header);
+ iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
+ out_task->closure = dispatch_task->closure;
+
+ memcpy(out_task->workgroup_base, workgroup_base,
+ sizeof(out_task->workgroup_base));
+ memcpy(out_task->workgroup_range, workgroup_range,
+ sizeof(out_task->workgroup_range));
+ memcpy(out_task->workgroup_size, dispatch_task->workgroup_size,
+ sizeof(out_task->workgroup_size));
+ memcpy(out_task->workgroup_count, workgroup_count,
+ sizeof(out_task->workgroup_count));
+
+ // TODO(benvanik): shared memory; likely pulled from a ringbuffer.
+ // We'll have to ensure we have the memory available prior to scheduling the
+ // dispatch and probably just pass it in as an argument in here.
+ out_task->shared_memory = iree_make_byte_span(NULL, 0);
+
+ // Wire up dispatch statistics; we'll track on the slice while we run and
+ // then the per-slice statistics will roll up into the dispatch statistics.
+ out_task->dispatch_statistics = &dispatch_task->statistics;
+ memset(&out_task->slice_statistics, 0, sizeof(out_task->slice_statistics));
+}
+
+iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
+ iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
+ iree_task_pool_t* slice_task_pool) {
+ iree_task_dispatch_slice_t* slice_task = NULL;
+ iree_status_t status =
+ iree_task_pool_acquire(slice_task_pool, (iree_task_t**)&slice_task);
+ if (!iree_status_is_ok(status)) {
+ iree_status_ignore(status);
+ return NULL;
+ }
+ iree_task_dispatch_slice_initialize(dispatch_task, workgroup_base,
+ workgroup_range, workgroup_count,
+ slice_task);
+ slice_task->header.pool = slice_task_pool;
+ return slice_task;
+}
+
+iree_status_t iree_task_dispatch_slice_execute(
+ iree_task_dispatch_slice_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // TODO(benvanik): coroutine support. Ideally this function can be called
+ // multiple times for the same slice, and we'll have a way to ready up the
+ // slices on the same workers (some per-worker suspended list?).
+
+ // Prepare context shared for all tiles in the slice.
+ iree_task_tile_context_t tile_context;
+ memcpy(&tile_context.workgroup_size, task->workgroup_size,
+ sizeof(tile_context.workgroup_size));
+ memcpy(&tile_context.workgroup_count, task->workgroup_count,
+ sizeof(tile_context.workgroup_count));
+ tile_context.shared_memory = task->shared_memory;
+ tile_context.statistics = &task->slice_statistics;
+
+ const uint32_t base_x = task->workgroup_base[0];
+ const uint32_t base_y = task->workgroup_base[1];
+ const uint32_t base_z = task->workgroup_base[2];
+ const uint32_t range_x = task->workgroup_range[0];
+ const uint32_t range_y = task->workgroup_range[1];
+ const uint32_t range_z = task->workgroup_range[2];
+ for (uint32_t z = base_z; z <= range_z; ++z) {
+ tile_context.workgroup_xyz[2] = z;
+ for (uint32_t y = base_y; y <= range_y; ++y) {
+ tile_context.workgroup_xyz[1] = y;
+ for (uint32_t x = base_x; x <= range_x; ++x) {
+ tile_context.workgroup_xyz[0] = x;
+ IREE_TRACE_ZONE_BEGIN_NAMED(z_tile,
+ "iree_task_dispatch_slice_execute_tile");
+ IREE_TRACE_ZONE_SET_COLOR(z_tile,
+ iree_task_tile_to_color(&tile_context));
+
+ // NOTE: these are useful for debugging but dramatically increase our
+ // cost here; only enable if needed for tracking work distribution:
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, x);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, y);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, z);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
+
+ iree_status_t status = task->closure.fn(task->closure.user_context,
+ (uintptr_t)&tile_context);
+
+ IREE_TRACE_ZONE_END(z_tile);
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+ // NOTE: we don't bother to update statistics here on failure as the
+ // partial results won't really help much.
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+ }
+ }
+ }
+ }
+
+ // Push aggregate statistics up to the dispatch.
+ if (task->dispatch_statistics) {
+ iree_task_dispatch_statistics_merge(&task->slice_statistics,
+ task->dispatch_statistics);
+ }
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+void iree_task_dispatch_shard_initialize(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_dispatch_shard_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SHARD,
+ dispatch_task->header.scope, &out_task->header);
+ iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
+ out_task->shared_state = shared_state;
+}
+
+iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_pool_t* shard_task_pool) {
+ iree_task_dispatch_shard_t* shard_task = NULL;
+ iree_status_t status =
+ iree_task_pool_acquire(shard_task_pool, (iree_task_t**)&shard_task);
+ if (!iree_status_is_ok(status)) {
+ iree_status_ignore(status);
+ return NULL;
+ }
+ iree_task_dispatch_shard_initialize(dispatch_task, shared_state, shard_task);
+ shard_task->header.pool = shard_task_pool;
+ return shard_task;
+}
+
+iree_status_t iree_task_dispatch_shard_execute(
+ iree_task_dispatch_shard_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_dispatch_shard_state_t* shared_state = task->shared_state;
+ iree_task_dispatch_t* dispatch_task = shared_state->dispatch_task;
+
+ // Prepare context shared for all tiles in the shard.
+ iree_task_tile_context_t tile_context;
+ memcpy(&tile_context.workgroup_size, dispatch_task->workgroup_size,
+ sizeof(tile_context.workgroup_size));
+ memcpy(&tile_context.workgroup_count, task->shared_state->workgroup_count,
+ sizeof(tile_context.workgroup_count));
+ tile_context.shared_memory = shared_state->shared_memory;
+ uint32_t workgroup_count_x = tile_context.workgroup_count[0];
+ uint32_t workgroup_count_y = tile_context.workgroup_count[1];
+
+ // We perform all our shard statistics work locally here and only push back to
+ // the dispatch at the end; this avoids contention from each shard trying to
+ // update the statistics together.
+ iree_task_dispatch_statistics_t shard_statistics;
+ memset(&shard_statistics, 0, sizeof(shard_statistics));
+ tile_context.statistics = &shard_statistics;
+
+ // Loop over all tiles until they are all processed.
+ const uint32_t tile_count = shared_state->tile_count;
+ const uint32_t tiles_per_reservation = shared_state->tiles_per_reservation;
+ uint32_t tile_base = iree_atomic_fetch_add_int32(&shared_state->tile_index,
+ tiles_per_reservation,
+ iree_memory_order_relaxed);
+ while (tile_base < tile_count) {
+ const uint32_t next_tile_base = iree_atomic_fetch_add_int32(
+ &shared_state->tile_index, tiles_per_reservation,
+ iree_memory_order_relaxed);
+
+ const uint32_t tile_range =
+ iree_min(tile_base + tiles_per_reservation, tile_count);
+ for (uint32_t tile_index = tile_base; tile_index < tile_range;
+ ++tile_index) {
+ // TODO(benvanik): faster math here, especially knowing we pull off N
+ // sequential indices per reservation.
+ uint32_t tile_i = tile_index;
+ tile_context.workgroup_xyz[0] = tile_i % workgroup_count_x;
+ tile_i /= workgroup_count_x;
+ tile_context.workgroup_xyz[1] = tile_i % workgroup_count_y;
+ tile_i /= workgroup_count_y;
+ tile_context.workgroup_xyz[2] = tile_i;
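+ // E.g. in a 4x2x2 grid, tile_index 11 maps to workgroup_xyz (3, 0, 1):
+ // 11 % 4 = 3, 11 / 4 = 2, 2 % 2 = 0, 2 / 2 = 1.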
+
+ IREE_TRACE_ZONE_BEGIN_NAMED(z_tile,
+ "iree_task_dispatch_shard_execute_tile");
+ IREE_TRACE_ZONE_SET_COLOR(z_tile, iree_task_tile_to_color(&tile_context));
+
+ // NOTE: these are useful for debugging but dramatically increase our
+ // cost here; only enable if needed for tracking work distribution:
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[0]);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[1]);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[2]);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
+
+ iree_status_t status = dispatch_task->closure.fn(
+ dispatch_task->closure.user_context, (uintptr_t)&tile_context);
+
+ IREE_TRACE_ZONE_END(z_tile);
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+ // NOTE: we don't bother to update statistics here on failure as the
+ // partial results won't really help much.
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+ }
+ }
+
+ tile_base = next_tile_base;
+ }
+
+ // Push aggregate statistics up to the dispatch.
+ iree_task_dispatch_statistics_merge(&shard_statistics,
+ &dispatch_task->statistics);
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
diff --git a/iree/task/task.h b/iree/task/task.h
new file mode 100644
index 0000000..35a632f
--- /dev/null
+++ b/iree/task/task.h
@@ -0,0 +1,620 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TASK_H_
+#define IREE_TASK_TASK_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomic_slist.h"
+#include "iree/base/atomics.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/wait_handle.h"
+#include "iree/task/affinity_set.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+typedef struct iree_task_list_s iree_task_list_t;
+typedef struct iree_task_pool_s iree_task_pool_t;
+typedef struct iree_task_scope_s iree_task_scope_t;
+
+//==============================================================================
+// Function closures
+//==============================================================================
+
+typedef iree_status_t(IREE_API_PTR* iree_task_closure_fn_t)(
+ uintptr_t user_context, uintptr_t task_context);
+
+// A function closure representing the function to call and its arguments.
+typedef struct {
+ // Function called per tile invocation.
+ iree_task_closure_fn_t fn;
+
+ // User-defined argument passed to task functions during invocation.
+ // An opaque pointer-sized value that may point to a user data structure or
+ // contain an embedded value. No lifetime management is performed by the task
+ // system and it is required that users ensure that the memory referenced is
+ // live until after the task has completed.
+ uintptr_t user_context;
+
+ // TODO(benvanik): cleanup function? right now assume arg is never freed.
+} iree_task_closure_t;
+
+// Binds a function pointer and the arguments it should be called with.
+// If the arguments represent pointers they must remain live until the task
+// has completed execution.
+static inline iree_task_closure_t iree_task_make_closure(
+ iree_task_closure_fn_t fn, uintptr_t user_context) {
+ iree_task_closure_t closure = {fn, user_context};
+ return closure;
+}
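+
+// Example (sketch; |my_task_fn| and |my_state_t| are user-defined and not part
+// of the task system):
+//
+//  typedef struct { int counter; } my_state_t;
+//  static iree_status_t my_task_fn(uintptr_t user_context,
+//                                  uintptr_t task_context) {
+//    my_state_t* state = (my_state_t*)user_context;
+//    ++state->counter;
+//    return iree_ok_status();
+//  }
+//  ...
+//  static my_state_t state = {0};
+//  iree_task_closure_t closure =
+//      iree_task_make_closure(my_task_fn, (uintptr_t)&state);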
+
+//==============================================================================
+// Task header for internal tracking
+//==============================================================================
+
+// Specifies the type of a task and how executors handle it.
+enum iree_task_type_e {
+ // Task is a no-op (performs no work) and exists for flexibility.
+ IREE_TASK_TYPE_NOP = 0u,
+
+ // Task will synchronously call a function before continuing.
+ IREE_TASK_TYPE_CALL = 1u,
+
+ // Task exists only as a barrier to join/fork tasks and has no executable
+ // payload.
+ IREE_TASK_TYPE_BARRIER = 2u,
+
+ // Task is a fence indicating that a certain point in the task graph has been
+ // reached. All tasks prior to this fence (by way of happens-before
+ // dependencies) are guaranteed to have retired.
+ IREE_TASK_TYPE_FENCE = 3u,
+
+ // Task is a wait on an external wait handle (fd, HANDLE, etc).
+ // Executors will wait on the handle until it is signaled and meets the
+ // specified condition prior to readying the dependent tasks.
+ IREE_TASK_TYPE_WAIT = 4u,
+
+ // Task is a 3D grid dispatch of zero or more tiles.
+ // Dispatches are issued when ready by either being split into zero or more
+ // slices with one or more tiles each based on the workgroup count or one
+ // shard per worker that should process the dispatch.
+ //
+ // If IREE_TASK_FLAG_DISPATCH_INDIRECT is set then the dispatch reads the
+ // workgroup count from a buffer immediately prior to fan-out instead of using
+ // the values embedded in the task structure.
+ //
+ // After a dispatch has been issued the IREE_TASK_FLAG_DISPATCH_RETIRE flag is
+ // set to indicate that when the dispatch becomes ready again it will be after
+ // all slices have completed.
+ IREE_TASK_TYPE_DISPATCH = 5u,
+
+ // Task is a slice of a larger contiguous dispatch tile range. The full
+ // dispatch will be sliced into zero or more slices and each slice will be
+ // posted to a particular worker for execution. If work progresses unevenly
+ // then entire slices will be stolen across workers to balance out the timing.
+ // Slices retire once they have completed the tiles assigned to them.
+ IREE_TASK_TYPE_DISPATCH_SLICE = 6u,
+
+ // Task is one of potentially many shards processing a larger dispatch grid.
+ // Each shard may have a preference as to which parts of the grid it will
+ // focus on but is able to otherwise steal any available region directly from the
+ // shared dispatch coordination state. Shards retire once there are no more
+ // tiles remaining in the dispatch grid.
+ IREE_TASK_TYPE_DISPATCH_SHARD = 7u,
+};
+typedef uint8_t iree_task_type_t;
+
+enum iree_task_flags_e {
+ // The wait handle the task is specified to wait on has resolved and the task
+ // can now be considered complete.
+ IREE_TASK_FLAG_WAIT_COMPLETED = 1u << 0,
+
+ // The workgroup count for the dispatch is provided by way of a pointer to a
+ // list of 3 uint32_t values that will be sampled immediately prior to
+ // issuing of the dispatch. The contents of the pointer can be safely modified
+ // up until the last dependency has completed and the dispatch is about to be
+ // issued.
+ IREE_TASK_FLAG_DISPATCH_INDIRECT = 1u << 1,
+
+ // The dispatch should be sliced across workers via the low-contention
+ // IREE_TASK_TYPE_DISPATCH_SLICE mechanism. This moves the dispatch overhead
+ // to the time when the grid is sliced in exchange for savings when the grid
+ // is executed.
+ IREE_TASK_FLAG_DISPATCH_SLICED = 1u << 2,
+
+ // The dispatch has been issued and the task is waiting for one or more
+ // slices to complete. After they complete the dispatch will be readied and
+ // can be retired.
+ //
+ // Though added by the executor after issuing a dispatch users can also set
+ // this to indicate that all dispatch slices for a particular dispatch have
+ // been statically scheduled. Executors will then skip issuing the dispatch
+ // and instead wait until all slices complete, enabling IREE_TASK_TYPE_BARRIER
+ // behavior but without an additional task as dispatches are still required
+ // to store information for slices.
+ IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 3,
+};
+typedef uint16_t iree_task_flags_t;
+
+typedef struct iree_task_s iree_task_t;
+
+// A task within the task system that runs on an executor.
+// Tasks have an iree_task_type_t that defines which parameters are valid and
+// how the executor is to treat the task. Dependency edges can be defined that
+// determine the execution order of tasks within the executors.
+struct iree_alignas(iree_max_align_t) iree_task_s {
+ // Intrusive pointer used to store tasks within iree_task_list_t and
+ // iree_atomic_task_list_t singly-linked lists. This must come first in the
+ // structure so that it is at the appropriate alignment.
+ iree_task_t* next_task;
+
+ // The scope this task is attributed to. Errors with the task will be
+ // propagated to the scope and errors in the scope will cause pending tasks to
+ // be skipped.
+ iree_task_scope_t* scope;
+
+ // Optional task that will be notified when the task completes.
+ // The task will have its pending_dependency_count decremented and will be
+ // readied for execution when the count reaches 0.
+ iree_task_t* completion_task;
+
+ // Specifies which workers will be used to execute this task.
+ // Forked tasks will inherit their parent task affinity (possibly with some
+ // task-dependent rules) to partition workloads across workers with knowledge
+ // of the specific work being performed. For example, some dispatches can be
+ // limited to run on certain microarchitectures that workers have affinity
+ // with at the OS scheduler level (such as big.LITTLE topologies).
+ iree_task_affinity_set_t affinity_set;
+
+ // Total number of dependent tasks still outstanding. Decremented each time
+ // a dependent task completes. The task is considered ready to execute when
+ // this value reaches 0.
+ iree_atomic_int32_t pending_dependency_count;
+
+ // Optional pool the task should be returned to after it has resolved. If the
+ // task was allocated as part of a larger data structure (embedded within
+ // an arena for example) then this can be NULL to prevent the task system
+ // from interfering.
+ iree_task_pool_t* pool;
+
+ // Specifies the type of the task and how the executor handles it.
+ iree_task_type_t type;
+
+ // Task-specific flag bits.
+ iree_task_flags_t flags;
+};
+static_assert(offsetof(iree_task_t, next_task) == 0,
+ "next_task intrusive pointer must be at offset 0");
+
+// Initializes a task header with the given type.
+// Must be called on all tasks to ensure proper dependency tracking and list
+// state prior to enqueuing. Only the task header structure is initialized and
+// any additional data as part of the wrapping task type must be initialized by
+// the caller.
+void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
+ iree_task_t* out_task);
+
+// Sets up a dependency edge from |task| to |completion_task| such that when
+// |task| completes |completion_task| will be notified and have its
+// pending_dependency_count decremented.
+void iree_task_set_completion_task(iree_task_t* task,
+ iree_task_t* completion_task);
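+
+// Example (sketch; assumes |scope| and the closures are provided by the
+// caller):
+//
+//  iree_task_call_t task_a;
+//  iree_task_call_initialize(scope, closure_a, &task_a);
+//  iree_task_call_t task_b;
+//  iree_task_call_initialize(scope, closure_b, &task_b);
+//  // task_b will only be readied once task_a has completed.
+//  iree_task_set_completion_task(&task_a.header, &task_b.header);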
+
+// Returns true if the |task| is ready to execute immediately.
+// Though this is safe to call from any thread the test may have false-negatives
+// (ready tasks are not returned as ready) due to cross-thread synchronization
+// latency. Note that tasks may yield themselves during execution and switch
+// from ready to waiting (such as when an indirect dispatch needs to wait for
+// all tiles to complete).
+bool iree_task_is_ready(iree_task_t* task);
+
+// Discards the task and any dependent tasks.
+// Any dependent tasks that need to be discarded will be added to
+// |discard_worklist| for the caller to continue discarding.
+void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist);
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+// Task is a no-op (performs no work) and exists for flexibility.
+// NOP tasks can be used to link together task lists from multiple threads
+// where it may otherwise not be ideal to have heavy-weight concurrency
+// structures. NOP tasks can also be useful for neutering another task type
+// after it has already been recorded into a list such as when cancellations
+// occur.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+} iree_task_nop_t;
+
+void iree_task_nop_initialize(iree_task_scope_t* scope,
+ iree_task_nop_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+// A task that will synchronously call a function from the executor and wait
+// for it to complete before continuing.
+//
+// Memory referenced by closure arguments must be kept valid until the function
+// executes (in general with the same lifetime as the task itself).
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Function closure to call when the task is executed.
+ iree_task_closure_t closure;
+} iree_task_call_t;
+
+void iree_task_call_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ iree_task_call_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+// A join point for fork/join-style scheduling.
+// References a set of dependent tasks that will be notified and possibly
+// readied when the barrier is reached.
+//
+// This allows for modeling one-to-many and many-to-many relationships. The base
+// task dependency system only models one-to-one and should be used if possible
+// to avoid the additional overhead of a barrier task both in memory and task
+// indirection/queuing.
+//
+// Example:
+// * [A] -> Barrier -> [C, D]
+// - A executes
+// - Barrier is processed after A completes
+// - C and D execute concurrently (in any order)
+//
+// * [A, B] -> Barrier -> [C, D]
+// - A and B execute concurrently (in any order)
+// - Barrier is processed after both A and B complete
+// - C and D execute concurrently
+//
+// * [A] -> Barrier -> [B]
+// - Don't do this and use the base task dependency instead; it'll work, but
+// it's much better to avoid the additional barrier indirection when
+// possible.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Number of valid tasks in the dependent_tasks list.
+ iree_host_size_t dependent_task_count;
+ // [0-dependent_task_count] tasks that will be notified when the barrier is
+ // reached. Each task will have its pending_dependency_count decremented and
+ // when the count reaches 0 be added to the ready list.
+ iree_task_t* const* dependent_tasks;
+} iree_task_barrier_t;
+
+void iree_task_barrier_initialize(iree_task_scope_t* scope,
+ iree_host_size_t dependent_task_count,
+ iree_task_t* const* dependent_tasks,
+ iree_task_barrier_t* out_task);
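+
+// Example (sketch) of the [A, B] -> Barrier -> [C, D] graph above; the call
+// tasks are assumed to have been initialized by the caller and the
+// |dependent_tasks| array must remain live until the barrier retires:
+//
+//  iree_task_t* dependent_tasks[2] = {&task_c.header, &task_d.header};
+//  iree_task_barrier_t barrier_task;
+//  iree_task_barrier_initialize(scope, IREE_ARRAYSIZE(dependent_tasks),
+//                               dependent_tasks, &barrier_task);
+//  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+//  iree_task_set_completion_task(&task_b.header, &barrier_task.header);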
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+// A fence indicating that a certain point in the task graph has been reached.
+// All tasks prior to this fence (by way of happens-before dependencies) are
+// guaranteed to have retired.
+//
+// When all of the dependencies of a fence have retired the fence will notify
+// the parent scope of the task by decrementing the pending_submissions count
+// and publishing an idle_notification if it was the last in-flight submission.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // TODO(benvanik): user-defined fence data for semaphore signaling. Optional
+ // wait_handle to signal?
+} iree_task_fence_t;
+
+void iree_task_fence_initialize(iree_task_scope_t* scope,
+ iree_task_fence_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+typedef struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // The external wait handle that the task is waiting on.
+ iree_wait_handle_t wait_handle;
+
+ // TODO(benvanik): deadline_ns.
+ // TODO(benvanik): condition (possibly a closure to evaluate) ala condvar.
+} iree_task_wait_t;
+
+void iree_task_wait_initialize(iree_task_scope_t* scope,
+ iree_wait_handle_t wait_handle,
+ iree_task_wait_t* out_task);
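+
+// Example (sketch; |wait_handle| is assumed to have been created elsewhere,
+// e.g. wrapping an OS event, and must remain valid until the task retires):
+//
+//  iree_task_wait_t wait_task;
+//  iree_task_wait_initialize(scope, wait_handle, &wait_task);
+//  // next_task will only be readied once the handle has been signaled.
+//  iree_task_set_completion_task(&wait_task.header, &next_task.header);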
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_* structures
+//==============================================================================
+
+// Statistics tracked across an entire dispatch operation.
+// Each tile contributes to these statistics as they execute to provide an
+// aggregate set of statistics that can be reported to tracing/user queries.
+//
+// We want to keep this structure relatively compact as it does add overhead.
+// If statistics are used purely for interactive tracing then they can be
+// piped directly to the tracing tool using IREE_TRACE_* macros. If the
+// statistics are programmatically queried for benchmarks or reporting then
+// they belong here where we can efficiently move them around.
+//
+// If we find ourselves with a lot of hardware-specific counters (vs more
+// generic ones like 'l2 cache misses' or 'ipc') then we can sprinkle in some
+// #ifdefs.
+typedef struct {
+ // TODO(benvanik): statistics counters.
+ iree_atomic_int32_t reserved;
+} iree_task_dispatch_statistics_t;
+
+// Merges statistics from |source| to |target| atomically per-field.
+// As each field is updated independently and in a relaxed memory order it's
+// possible for statistics consumers to see a tear.
+void iree_task_dispatch_statistics_merge(
+ const iree_task_dispatch_statistics_t* source,
+ iree_task_dispatch_statistics_t* target);
+
+typedef struct {
+ // TODO(benvanik): coroutine storage.
+ // Ideally we'll be able to have a fixed coroutine storage size per dispatch
+ // (via @llvm.coro.size) such that we can preallocate all of the storage for
+ // a dispatch in one shot. If we need to do dynamic allocation we will need a
+ // ringbuffer or other kind of pool to allocate from on-demand.
+ uint32_t reserved;
+} iree_task_tile_storage_t;
+
+// Per-tile context provided to each dispatch function invocation in the grid.
+// This information is unique to the tile being dispatched and may contain
+// specific state about the calling thread/fiber/etc.
+//
+// If tile execution is suspended by hitting a coroutine suspend point then the
+// coroutine state will be stored within the tile context until the tile is
+// resumed.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Workgroup ID for the current invocation.
+ uint32_t workgroup_xyz[3];
+ // Workgroup size for each invocation.
+ uint32_t workgroup_size[3];
+ // Total workgroup count for the task. Can be used in conjunction with the
+ // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
+ uint32_t workgroup_count[3];
+ // TODO(benvanik): workgroup index to amortize calculating linear offsets.
+ // (like gl_GlobalInvocationID)
+
+ // Incoherent memory shared across all invocations of the task.
+ // Aligned to at least the natural pointer size of the machine. Functions must
+ // use atomic operations to ensure proper memory ordering.
+ iree_byte_span_t shared_memory;
+
+ // Shared statistics counters for the dispatch slice.
+ iree_task_dispatch_statistics_t* statistics;
+
+ // TODO(benvanik): cpuid uarch.
+ // TODO(benvanik): per-tile coroutine storage.
+} iree_task_tile_context_t;
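+
+// Example (sketch) of a tile closure consuming this context; the linear index
+// math is illustrative and not provided by the task system:
+//
+//  static iree_status_t my_tile_fn(uintptr_t user_context,
+//                                  uintptr_t task_context) {
+//    const iree_task_tile_context_t* tile_context =
+//        (const iree_task_tile_context_t*)task_context;
+//    uint32_t linear_index =
+//        (tile_context->workgroup_xyz[2] * tile_context->workgroup_count[1] +
+//         tile_context->workgroup_xyz[1]) * tile_context->workgroup_count[0] +
+//        tile_context->workgroup_xyz[0];
+//    // ... process the tile identified by linear_index ...
+//    return iree_ok_status();
+//  }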
+
+typedef struct iree_task_dispatch_s iree_task_dispatch_t;
+
+// Shared state for all shards processing a dispatch.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Direct reference to the parent dispatch that all shards are processing.
+ iree_task_dispatch_t* dispatch_task;
+
+ // The tail tile index; the next reservation will start from here.
+ iree_atomic_int32_t tile_index;
+
+ // The total number of tiles in the dispatch bounding tile_index.
+ uint32_t tile_count;
+
+ // Maximum number of tiles to fetch per tile reservation from the grid.
+ // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
+ // reasonable number chosen based on the tile and shard counts.
+ uint32_t tiles_per_reservation;
+
+ // Total workgroup count for the task. Can be used in conjunction with the
+ // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
+ uint32_t workgroup_count[3];
+
+ // Incoherent memory shared across all invocations of the task.
+ // Aligned to at least the natural pointer size of the machine. Functions must
+ // use atomic operations to ensure proper memory ordering.
+ iree_byte_span_t shared_memory;
+} iree_task_dispatch_shard_state_t;
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+// An execution request across a tiled grid.
+// Dispatches are fork points where zero or more dispatch slice tasks are
+// spawned and processed prior to joining again on the dispatch completion task.
+//
+// The total workgroup count indicates the [x,y,z] extents of the dispatch grid.
+// The count may either be embedded directly into the dispatch or provided as a
+// pointer to the workgroup_count[3] that will be read immediately prior to
+// forking. If any dimension of the workgroup count is zero then the dispatch is
+// skipped and the completion task will be readied immediately.
+//
+// Example:
+// dispatch([5, 1, 1])
+// forked into slices based on affinity/scheduling parameters:
+// -> dispatch_slice([0-1, 1, 1])
+// -> dispatch_slice([2-3, 1, 1])
+// -> dispatch_slice([4, 1, 1])
+// completion_task run after all slices complete
+typedef iree_alignas(iree_max_align_t) struct iree_task_dispatch_s {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Function closure to call per tile.
+ iree_task_closure_t closure;
+
+ // Workgroup size for each invocation. Passed on to tiles without
+ // modification and not used for scheduling.
+ uint32_t workgroup_size[3];
+
+ // 3D workgroup count used to tile the dispatch.
+ // [1,1,1] specifies a single invocation of the function. A value of 0 in
+ // any dimension will skip execution of the function.
+ union {
+ // Embedded immutable 3D workgroup count value.
+ uint32_t value[3];
+ // Pointer to the uint32_t[3] containing the 3D workgroup count.
+ // Sampled immediately prior to execution.
+ const uint32_t* ptr;
+ } workgroup_count;
+
+ // Optional transient shared memory size to allocate and pass into the
+ // iree_task_tile_context_t::shared_memory of each invocation of the task
+ // closure.
+ iree_host_size_t shared_memory_size;
+
+ // Statistics storage used for aggregating counters across all slices.
+ iree_task_dispatch_statistics_t statistics;
+
+ // Shared state across all slices/shards/etc.
+ // Stored once per dispatch and then referenced by all subtasks.
+ union {
+ iree_task_dispatch_shard_state_t shard_state;
+ } shared;
+} iree_task_dispatch_t;
+
+void iree_task_dispatch_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_t* out_task);
+
+void iree_task_dispatch_initialize_indirect(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t* workgroup_count_ptr,
+ iree_task_dispatch_t* out_task);
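+
+// Example (sketch; the sizes are arbitrary and |my_tile_fn|/|user_context| are
+// assumed to be defined by the caller):
+//
+//  const uint32_t workgroup_size[3] = {64, 1, 1};
+//  const uint32_t workgroup_count[3] = {16, 4, 1};
+//  iree_task_dispatch_t dispatch_task;
+//  iree_task_dispatch_initialize(
+//      scope, iree_task_make_closure(my_tile_fn, user_context),
+//      workgroup_size, workgroup_count, &dispatch_task);
+//  // my_tile_fn will be invoked once per workgroup (16*4*1 = 64 invocations),
+//  // potentially concurrently across workers. The _indirect variant instead
+//  // reads the workgroup count from a pointer immediately prior to issuing.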
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SLICE
+//==============================================================================
+
+// TODO(benvanik): per-region dependencies (allow slices to execute directly
+// across dispatches).
+
+// A slice of tiles within a larger dispatch grid.
+// These tasks are designed to be synthesized by the task system when processing
+// a dispatch task. Based on the workgroup count, affinity settings, and
+// available executor threads zero or more slices are enqueued, executed, and
+// retired as part of the complete dispatch task. The dispatch is only
+// considered completed and subsequent tasks readied once all slices are
+// complete.
+//
+// Slices aggregate statistics from all tiles within them and then upon
+// completion merge those into the shared dispatch statistics. As slices may
+// suspend and resume several times the dispatch-level statistics should only be
+// read once all slices have completed fully.
+//
+// In general slices represent a contiguous range of tiles along the most
+// rapidly changing dimension (x, then y, then z). This ensures that we at least
+// give the opportunity for cache locality to the tiles as they are processed.
+// If work stealing is enabled then slices may shed their trailing tiles to
+// other threads that have completed all of their work (at a cost of power vs.
+// potential latency savings).
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // NOTE: the following fields are mostly replicated from iree_task_dispatch_t.
+ // This removes the need for touching the dispatch struct when beginning a
+ // tile which would likely be a cache miss as we fan out to other cores.
+
+ // Function closure to call per tile (same as the closure in the dispatch).
+ iree_task_closure_t closure;
+
+ // Base workgroup ID for the slice range.
+ uint32_t workgroup_base[3];
+ // Inclusive ending workgroup ID of the slice range.
+ uint32_t workgroup_range[3];
+
+ // Workgroup size for each invocation.
+ uint32_t workgroup_size[3];
+ // Total workgroup count for the task. Can be used in conjunction with the
+ // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
+ uint32_t workgroup_count[3];
+
+ // Incoherent memory shared across all invocations of the task.
+ // Aligned to at least the natural pointer size of the machine. Functions must
+ // use atomic operations to ensure proper memory ordering.
+ iree_byte_span_t shared_memory;
+
+ // Shared statistics counters for the entire dispatch. References the storage
+ // held in the parent iree_task_dispatch_t.
+ iree_task_dispatch_statistics_t* dispatch_statistics;
+ // Statistics just for this single slice. The statistics will be added to the
+ // dispatch_statistics after the slice completes to prevent excessive
+ // contention on the shared dispatch statistics across multiple threads.
+ iree_task_dispatch_statistics_t slice_statistics;
+
+ // Per-tile initialized coroutine storage for all tiles in the range
+ // initialized as each tile begins execution.
+ // TODO(benvanik): coroutine storage as iree_task_tile_storage_t.
+} iree_task_dispatch_slice_t;
+
+// TODO(benvanik): document initialize() for slice pre-planning/embedding.
+// This would be useful to reduce latency when the number of slices is small
+// (~<5) as the dispatch wouldn't need to be issued. This can also be used to
+// implement per-region dependencies as direct slice->slice deps vs. fork/join
+// dispatch->dispatch deps. Show how IREE_TASK_FLAG_DISPATCH_RETIRE is set.
+void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
+ const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_slice_t* out_task);
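+
+// Example (sketch) of statically pre-planning a single slice as described by
+// IREE_TASK_FLAG_DISPATCH_RETIRE above; treat this as a rough illustration
+// until the TODO above is resolved:
+//
+//  const uint32_t base[3] = {0, 0, 0};
+//  const uint32_t range[3] = {7, 0, 0};  // inclusive end workgroup IDs
+//  const uint32_t count[3] = {8, 1, 1};
+//  iree_task_dispatch_slice_t slice_task;
+//  iree_task_dispatch_slice_initialize(&dispatch_task, base, range, count,
+//                                      &slice_task);
+//  // Prevent the executor from re-issuing the dispatch; it will instead wait
+//  // for the slice(s) to complete and then retire.
+//  dispatch_task.header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;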
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Active dispatch progress shared across all shards.
+ // Each shard will be read/modify/writing this and there's likely to be
+ // contention.
+ iree_task_dispatch_shard_state_t* shared_state;
+} iree_task_dispatch_shard_t;
+
+void iree_task_dispatch_shard_initialize(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_dispatch_shard_t* out_task);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TASK_H_
diff --git a/iree/task/task_impl.h b/iree/task/task_impl.h
new file mode 100644
index 0000000..6320998
--- /dev/null
+++ b/iree/task/task_impl.h
@@ -0,0 +1,158 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TASK_IMPL_H_
+#define IREE_TASK_TASK_IMPL_H_
+
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+// Executes and retires a user call.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+// Returns the status of the user call.
+iree_status_t iree_task_call_execute(
+ iree_task_call_t* task, iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+// Retires a barrier task by notifying all dependent tasks.
+// May add zero or more tasks to the |pending_submission| if they are ready.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_barrier_retire(iree_task_barrier_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+// Retires a fence task by updating the scope state.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_fence_retire(iree_task_fence_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+// Returns true if the user-specified condition on the task is true.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+bool iree_task_wait_check_condition(iree_task_wait_t* task);
+
+// Retires a wait when it has completed waiting (successfully or not).
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_wait_retire(iree_task_wait_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+// Schedules a dispatch by forking out to zero or more slices that will be
+// executed on workers. The slices are allocated from an executor-owned pool
+// and are generally not user-visible - they'll just see their dispatch begin
+// execution prior to the slices and end execution after the last slice
+// finishes.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
+ iree_task_pool_t* slice_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch);
+
+// Schedules a dispatch by forking out to zero or more shards that will be
+// executed on workers. The shards are allocated from an executor-owned pool
+// and are generally not user-visible - they'll just see their dispatch begin
+// execution prior to the shards and end execution after the last shard
+// finishes.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_issue_sharded(
+ iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch);
+
+// Retires a dispatch when all issued slices have completed executing.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SLICE
+//==============================================================================
+
+// Allocates a dispatch slice task from the shared executor task pool.
+// The slice will be released back to the pool when it has completed execution.
+iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
+ iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
+ iree_task_pool_t* slice_task_pool);
+
+// Executes and retires a dispatch slice task.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+// Returns ok if all tiles were successfully executed and otherwise returns
+// an unspecified status (probably the first non-ok status hit).
+iree_status_t iree_task_dispatch_slice_execute(
+ iree_task_dispatch_slice_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+// Allocates a dispatch shard task from the shared executor task pool.
+// The shard will be released back to the pool when it has completed execution.
+iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_pool_t* shard_task_pool);
+
+// Executes and retires a dispatch shard task.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+// Returns ok if all tiles processed in the shard successfully executed and
+// otherwise returns an unspecified status (probably the first non-ok status
+// hit).
+iree_status_t iree_task_dispatch_shard_execute(
+ iree_task_dispatch_shard_t* task,
+ iree_task_submission_t* pending_submission);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TASK_IMPL_H_
diff --git a/iree/task/testing/BUILD b/iree/task/testing/BUILD
new file mode 100644
index 0000000..5c32559
--- /dev/null
+++ b/iree/task/testing/BUILD
@@ -0,0 +1,29 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+ licenses = ["notice"], # Apache 2.0
+)
+
+cc_library(
+ name = "test_util",
+ testonly = 1,
+ hdrs = ["test_util.h"],
+ deps = [
+ "//iree/task",
+ "//iree/testing:gtest",
+ ],
+)
diff --git a/iree/task/testing/CMakeLists.txt b/iree/task/testing/CMakeLists.txt
new file mode 100644
index 0000000..5ba9ce9
--- /dev/null
+++ b/iree/task/testing/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+ NAME
+ test_util
+ HDRS
+ "test_util.h"
+ DEPS
+ iree::task
+ iree::testing::gtest
+ TESTONLY
+ PUBLIC
+)
diff --git a/iree/task/testing/test_util.h b/iree/task/testing/test_util.h
new file mode 100644
index 0000000..c7ef9a4
--- /dev/null
+++ b/iree/task/testing/test_util.h
@@ -0,0 +1,85 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// NOTE: the best kind of synchronization is no synchronization; always try to
+// design your algorithm so that you don't need anything from this file :)
+// See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html
+
+#ifndef IREE_TASK_TESTING_TEST_UTIL_H_
+#define IREE_TASK_TESTING_TEST_UTIL_H_
+
+#include <memory>
+
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/scope.h"
+#include "iree/testing/status_matchers.h"
+
+using TaskPoolPtr =
+ std::unique_ptr<iree_task_pool_t, void (*)(iree_task_pool_t*)>;
+static inline TaskPoolPtr AllocateNopPool() {
+ iree_task_pool_t* pool = new iree_task_pool_t();
+ IREE_CHECK_OK(iree_task_pool_initialize(iree_allocator_system(),
+ sizeof(iree_task_nop_t), 1024, pool));
+ return {pool, [](iree_task_pool_t* pool) {
+ iree_task_pool_deinitialize(pool);
+ delete pool;
+ }};
+}
+
+using TaskScopePtr =
+ std::unique_ptr<iree_task_scope_t, void (*)(iree_task_scope_t*)>;
+static inline TaskScopePtr AllocateScope(const char* name) {
+ iree_task_scope_t* scope = new iree_task_scope_t();
+ iree_task_scope_initialize(iree_make_cstring_view(name), scope);
+ return {scope, [](iree_task_scope_t* scope) {
+ iree_task_scope_deinitialize(scope);
+ delete scope;
+ }};
+}
+
+static inline iree_task_t* AcquireNopTask(TaskPoolPtr& pool,
+ TaskScopePtr& scope, uint16_t value) {
+ iree_task_t* task = NULL;
+ IREE_CHECK_OK(iree_task_pool_acquire(pool.get(), &task));
+ iree_task_initialize(IREE_TASK_TYPE_NOP, scope.get(), task);
+ task->flags = value;
+ return task;
+}
+
+static inline bool CheckListOrderFIFO(iree_task_list_t* list) {
+  iree_task_t* p = list->head;
+  if (!p) return true;
+  uint16_t value = p->flags;
+  p = p->next_task;
+  while (p) {
+    // FIFO order: flags must be strictly increasing from head to tail, so
+    // compare each task against its predecessor rather than just the head.
+    if (p->flags <= value) return false;
+    value = p->flags;
+    p = p->next_task;
+  }
+  return true;
+}
+
+static inline bool CheckListOrderLIFO(iree_task_list_t* list) {
+  iree_task_t* p = list->head;
+  if (!p) return true;
+  uint16_t value = p->flags;
+  p = p->next_task;
+  while (p) {
+    // LIFO order: flags must be strictly decreasing from head to tail, so
+    // compare each task against its predecessor rather than just the head.
+    if (p->flags >= value) return false;
+    value = p->flags;
+    p = p->next_task;
+  }
+  return true;
+}
+
+#endif // IREE_TASK_TESTING_TEST_UTIL_H_
diff --git a/iree/task/topology.c b/iree/task/topology.c
new file mode 100644
index 0000000..ee0061f
--- /dev/null
+++ b/iree/task/topology.c
@@ -0,0 +1,387 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/topology.h"
+
+#include <cpuinfo.h>
+#include <stdio.h>
+
+#include "iree/base/math.h"
+#include "iree/base/tracing.h"
+
+struct iree_task_topology_s {
+ iree_allocator_t allocator;
+ iree_host_size_t group_capacity;
+ iree_host_size_t group_count;
+ iree_task_topology_group_t groups[0];
+};
+
+iree_status_t iree_task_topology_allocate(iree_host_size_t group_capacity,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_host_size_t topology_size =
+ sizeof(iree_task_topology_t) +
+ group_capacity * sizeof(iree_task_topology_group_t);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_IF_ERROR(
+ iree_allocator_malloc(allocator, topology_size, (void**)&topology));
+ topology->allocator = allocator;
+ topology->group_capacity = group_capacity;
+ topology->group_count = 0;
+
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+void iree_task_topology_free(iree_task_topology_t* topology) {
+ if (!topology) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_allocator_free(topology->allocator, topology);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_task_topology_parse(iree_string_view_t value,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ // TODO(benvanik): define a format that is generally useful alongside cpuinfo.
+ // Maybe colon-separated group-id values from thread affinities? Like:
+ // 0.0:0.2:0.4:0.8 to indicate cores 0,2,4,8 on group 0
+ // 0.0:0.1:1.0:1.1 to indicate cores 0,1 of both groups 0,1
+ // etc
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED);
+}
+
+bool iree_task_topology_format(const iree_task_topology_t* topology,
+ iree_host_size_t buffer_capacity, char* buffer,
+ iree_host_size_t* out_buffer_length) {
+ // TODO(benvanik): formatting to match parsing.
+ return false;
+}
+
+iree_host_size_t iree_task_topology_group_count(
+ const iree_task_topology_t* topology) {
+ return topology->group_count;
+}
+
+const iree_task_topology_group_t* iree_task_topology_get_group(
+ const iree_task_topology_t* topology, iree_host_size_t group_index) {
+ if (group_index >= topology->group_count) return NULL;
+ return &topology->groups[group_index];
+}
+
+iree_status_t iree_task_topology_push_group(
+ iree_task_topology_t* topology, const iree_task_topology_group_t* group) {
+ if (topology->group_count + 1 > topology->group_capacity) {
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "group capacity exceeded");
+ }
+ iree_task_topology_group_t* dst_group =
+ &topology->groups[topology->group_count];
+ memcpy(dst_group, group, sizeof(*group));
+ dst_group->group_index = topology->group_count++;
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_from_group_count(
+ iree_host_size_t group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_allocate(group_count, allocator, &topology));
+
+ for (iree_host_size_t i = 0; i < group_count; ++i) {
+ iree_task_topology_group_t* group = &topology->groups[i];
+ group->group_index = i;
+ snprintf(group->name, IREE_ARRAYSIZE(group->name), "worker[%d]", (int)i);
+ iree_thread_affinity_set_any(&group->ideal_thread_affinity);
+ group->constructive_sharing_mask = IREE_TASK_TOPOLOGY_GROUP_MASK_ALL;
+ }
+ topology->group_count = group_count;
+
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+// Runs the cpuinfo initializer which caches its result on the first call.
+// Returns a failure if cpuinfo does not support the CPU/platform.
+static iree_status_t iree_task_topology_ensure_cpuinfo_available(void) {
+ if (!cpuinfo_initialize()) {
+ return iree_make_status(IREE_STATUS_UNAVAILABLE,
+ "cpuinfo failed to initialize");
+ }
+ return iree_ok_status();
+}
+
+// Returns the core of the calling thread or NULL if not supported.
+// We wrap this here because cpuinfo only returns non-NULL on Linux.
+static const struct cpuinfo_core* iree_task_topology_get_current_core() {
+ const struct cpuinfo_core* current_core = cpuinfo_get_current_core();
+#if defined(IREE_PLATFORM_WINDOWS)
+ // TODO(benvanik): upstream into cpuinfo.
+ if (current_core == NULL) {
+ PROCESSOR_NUMBER processor_number;
+ GetCurrentProcessorNumberEx(&processor_number);
+ uint32_t processor_id =
+ cpuinfo_get_package(processor_number.Group)->processor_start +
+ processor_number.Number;
+ current_core = cpuinfo_get_processor(processor_id)->core;
+ }
+#endif // IREE_PLATFORM_WINDOWS
+ return current_core;
+}
+
+// Returns |core_id| rotated by the calling base core ID.
+// On many systems the kernel will have already assigned a randomized starting
+// core for thread distribution and we can just reuse that.
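+// For example (illustrative): if the calling thread is running on core 3 of an
+// 8-core machine then core_id 0 maps to physical core 4, core_id 1 to core 5,
+// and so on, wrapping back around so that core_id 7 maps to core 3.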
+static uint32_t iree_task_topology_rotate_from_base_core(uint32_t core_id) {
+ const struct cpuinfo_core* current_core =
+ iree_task_topology_get_current_core();
+ if (!current_core) {
+ return core_id; // don't modify if we don't know
+ }
+ uint32_t next_core_id =
+ (current_core->core_id + 1) % cpuinfo_get_cores_count();
+ return (next_core_id + core_id) % cpuinfo_get_cores_count();
+}
+
+// Sets a platform-specific iree_thread_affinity_t based on the cpuinfo
+// processor.
+static void iree_task_topology_set_affinity_from_processor(
+ const struct cpuinfo_processor* processor,
+ iree_thread_affinity_t* out_affinity) {
+ memset(out_affinity, 0, sizeof(*out_affinity));
+ out_affinity->specified = 1;
+
+ // Special bit to indicate that (if required) we want the entire core.
+ if (processor->core->processor_count > 1) {
+ out_affinity->smt = 1;
+ }
+
+ // cpuinfo #ifdefs the fields we need to extract the right platform IDs.
+ // We purposefully use the same exact macros they do there so that we don't
+ // have to worry about skew.
+
+#if defined(__MACH__) && defined(__APPLE__)
+ // TODO(benvanik): run on darwin to see how the l2 caches map. We ideally want
+ // a unique affinity ID per L2 cache.
+ // For now, we just use some random pointer bytes. It's just a tag used by
+ // the kernel to distribute the threads so the exact bits don't matter as long
+ // as they are unique per group we want isolated.
+ out_affinity->id = (uint32_t)(uintptr_t)processor;
+#elif defined(__linux__)
+ out_affinity->id = processor->linux_id;
+#elif defined(_WIN32) || defined(__CYGWIN__)
+ out_affinity->group = processor->windows_group_id;
+ out_affinity->id = processor->windows_processor_id;
+#else
+  // WASM? Unused today.
+ out_affinity->specified = 0;
+#endif // cpuinfo-like platform field
+}
+
+// Returns a bitset with all *processors* that share the same |cache|.
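+// For example, an L2 cache with processor_start=4 and processor_count=4
+// (processors 4-7) produces the mask 0b11110000.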
+static uint64_t iree_task_topology_calculate_cache_bits(
+ const struct cpuinfo_cache* cache) {
+ if (!cache) return 0;
+ uint64_t mask = 0;
+ for (uint32_t processor_i = 0; processor_i < cache->processor_count;
+ ++processor_i) {
+ mask |= 1ull << (cache->processor_start + processor_i);
+ }
+ return mask;
+}
+
+// Constructs a constructive sharing mask for all *processors* that share the
+// same cache as the specified |processor|.
+static uint64_t iree_task_topology_calculate_constructive_sharing_mask(
+ const struct cpuinfo_processor* processor) {
+ uint64_t mask = 0;
+ mask |= iree_task_topology_calculate_cache_bits(processor->cache.l1i);
+ mask |= iree_task_topology_calculate_cache_bits(processor->cache.l1d);
+ mask |= iree_task_topology_calculate_cache_bits(processor->cache.l2);
+ // TODO(benvanik): include L3 here too (for systems that have it)? Or use L3
+ // info purely for distribution and focus the group mask on lower-latency
+ // caches?
+ return mask;
+}
+
+// Populates |out_group| with the information from |core|.
+static void iree_task_topology_group_initialize_from_core(
+ uint32_t group_index, const struct cpuinfo_core* core,
+ iree_task_topology_group_t* out_group) {
+ memset(out_group, 0, sizeof(*out_group));
+ out_group->group_index = group_index;
+ snprintf(out_group->name, IREE_ARRAYSIZE(out_group->name), "worker[%u]",
+ group_index);
+
+ // Guess: always pick the first processor in a core.
+ // When pinning to threads we'll take into account whether the core is SMT
+ // and use all threads anyway so this alignment is just helpful for debugging.
+ uint32_t processor_i = core->processor_start;
+ out_group->processor_index = processor_i;
+
+ const struct cpuinfo_processor* processor =
+ cpuinfo_get_processor(processor_i);
+ iree_task_topology_set_affinity_from_processor(
+ processor, &out_group->ideal_thread_affinity);
+}
+
+// Fixes constructive_sharing_mask values such that they represent other chosen
+// topology groups instead of processor indices. We do this so that code using
+// the topology groups doesn't need to know anything about which physical
+// processor IDs a particular group is mapped to.
+static void iree_task_topology_fixup_constructive_sharing_masks(
+ iree_task_topology_t* topology) {
+ // O(n^2), but n is always <= 64 (and often <= 8).
+ for (iree_host_size_t i = 0; i < topology->group_count; ++i) {
+ iree_task_topology_group_t* group = &topology->groups[i];
+
+ // Compute the processors that we can constructively share with.
+ uint64_t constructive_sharing_mask =
+ iree_task_topology_calculate_constructive_sharing_mask(
+ cpuinfo_get_processor(group->processor_index));
+
+ iree_task_topology_group_mask_t group_mask = 0;
+ for (iree_host_size_t j = 0; j < topology->group_count; ++j) {
+ if (i == j) continue;
+ const iree_task_topology_group_t* other_group = &topology->groups[j];
+ uint64_t group_processor_bits = 1ull << other_group->processor_index;
+ if (constructive_sharing_mask & group_processor_bits) {
+ group_mask |= 1ull << other_group->group_index;
+ }
+ }
+
+ group->constructive_sharing_mask = group_mask;
+ }
+}
+
+// Matches all cores.
+static bool iree_task_topology_core_filter_all(const struct cpuinfo_core* core,
+ uintptr_t user_data) {
+ return true;
+}
+
+iree_status_t iree_task_topology_from_physical_cores(
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ return iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_all, 0, max_core_count, allocator,
+ out_topology);
+}
+
+// Matches only cores with the uarch as specified in |user_data|.
+static bool iree_task_topology_core_filter_uarch(
+ const struct cpuinfo_core* core, uintptr_t user_data) {
+ return core->uarch == user_data;
+}
+
+iree_status_t iree_task_topology_from_physical_cores_with_uarch(
+ uint32_t cpuinfo_uarch, iree_host_size_t max_core_count,
+ iree_allocator_t allocator, iree_task_topology_t** out_topology) {
+ return iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_uarch, cpuinfo_uarch, max_core_count,
+ allocator, out_topology);
+}
+
+iree_status_t iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_t filter_fn, uintptr_t filter_fn_data,
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_ensure_cpuinfo_available());
+
+ // Count cores that match the filter.
+ iree_host_size_t core_count = 0;
+ for (uint32_t i = 0; i < cpuinfo_get_cores_count(); i++) {
+ const struct cpuinfo_core* core = cpuinfo_get_core(i);
+ if (filter_fn(core, filter_fn_data)) ++core_count;
+ }
+ core_count = iree_min(core_count, max_core_count);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_allocate(core_count, allocator, &topology));
+
+ // Build each core up to the max allowed.
+ // TODO(benvanik): if our group_count <= core_count/2 then distribute better;
+ // for now we just do a straight-line through (cores 0-N) when instead we may
+ // want to take advantage of L3 cache info (half of groups on one L3 cache,
+ // half of groups on another, etc).
+ topology->group_count = core_count;
+ for (uint32_t core_i = 0, group_i = 0; group_i < topology->group_count;
+ ++core_i) {
+ // Rotate the core ID so that we avoid setting the affinity to the calling
+ // thread which we assume is something the user has plans for and doesn't
+ // want to have our workers stealing their time.
+ const struct cpuinfo_core* core =
+ cpuinfo_get_core(iree_task_topology_rotate_from_base_core(core_i));
+ if (filter_fn(core, filter_fn_data)) {
+ iree_task_topology_group_initialize_from_core(group_i, core,
+ &topology->groups[group_i]);
+ ++group_i;
+ }
+ }
+
+ iree_task_topology_fixup_constructive_sharing_masks(topology);
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_from_unique_l2_cache_groups(
+ iree_host_size_t max_group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_ensure_cpuinfo_available());
+
+ iree_host_size_t cache_count = cpuinfo_get_l2_caches_count();
+ cache_count = iree_min(cache_count, max_group_count);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_allocate(cache_count, allocator, &topology));
+
+ // TODO(benvanik): iree_task_topology_rotate_from_base_core to offset all of
+ // the selection here (while still preserving the cache groups). May need to
+ // rework this to instead walk the core list and skip until a new cache is
+ // found instead of starting with the cache list.
+
+ // TODO(benvanik): if our group_count <= cache_count/2 then distribute better;
+ // we could use l3 cache in addition to ensure we are selecting cores that do
+ // (or do not) share.
+ topology->group_count = cache_count;
+ for (uint32_t cache_i = 0, group_i = 0; group_i < topology->group_count;
+ ++cache_i) {
+ const struct cpuinfo_cache* cache = cpuinfo_get_l2_cache(cache_i);
+ const struct cpuinfo_core* core =
+ cpuinfo_get_processor(cache->processor_start)->core;
+ iree_task_topology_group_initialize_from_core(group_i, core,
+ &topology->groups[group_i]);
+ ++group_i;
+ }
+
+ iree_task_topology_fixup_constructive_sharing_masks(topology);
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
diff --git a/iree/task/topology.h b/iree/task/topology.h
new file mode 100644
index 0000000..c07d68f
--- /dev/null
+++ b/iree/task/topology.h
@@ -0,0 +1,162 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TOPOLOGY_H_
+#define IREE_TASK_TOPOLOGY_H_
+
+#include <limits.h>
+
+#include "iree/base/api.h"
+#include "iree/base/threading.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct cpuinfo_core;
+
+// A bitmask indicating which other groups from 0 to N may constructively share
+// caches. For example, a value of 0b1100 indicates that groups 2 and 3 share.
+typedef uint64_t iree_task_topology_group_mask_t;
+
+#define IREE_TASK_TOPOLOGY_GROUP_MASK_ALL UINT64_MAX
+
+// Information about a particular group within the topology.
+// Groups may be of varying levels of granularity even within the same topology
+// based on how the topology is defined.
+typedef struct {
+ // Group index within the topology matching a particular bit in
+ // iree_task_topology_group_mask_t.
+ uint8_t group_index;
+
+ // A name assigned to executor workers used for logging/tracing.
+ char name[16];
+
+ // Processor index in the cpuinfo set.
+ uint32_t processor_index;
+
+ // Ideal thread affinity for threads within this group.
+ // All threads within the group share the same affinity and this is what
+ // allows us to model Simultaneous Multi-Threading (SMT) (aka hyperthreading).
+ iree_thread_affinity_t ideal_thread_affinity;
+
+ // A bitmask of other group indices that share some level of the cache
+ // hierarchy. Workers of this group are more likely to constructively share
+ // some cache levels higher up with these other groups. For example, if the
+ // workers in a group all share an L2 cache then the groups indicated here may
+ // all share the same L3 cache.
+ iree_task_topology_group_mask_t constructive_sharing_mask;
+} iree_task_topology_group_t;
+
+// Task system topology information used to define the workers within an
+// executor.
+//
+// Topologies are used to statically configure task executors by defining the
+// total number of workers in the worker pool and how those workers map to
+// hardware compute resources.
+//
+// Users can allocate topologies, populate them with zero or more groups, and
+// then pass them to the executor to construct the desired configuration. To
+// ease testing and debugging topologies can be formatted as string values and
+// round-tripped through flags, though the values of such encodings are
+// obviously machine-dependent.
+//
+// Several helper constructors are available that query the machine topology
+// and attempt to derive some (hopefully) useful task system topology from it.
+// We can add the more common heuristics over time to the core and leave the
+// edge cases for applications to construct.
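+//
+// Usage sketch (illustrative only; error handling is elided and the executor
+// construction call that would consume the topology lives in executor.h and is
+// not shown here):
+//   iree_task_topology_t* topology = NULL;
+//   IREE_RETURN_IF_ERROR(iree_task_topology_from_physical_cores(
+//       /*max_core_count=*/8, iree_allocator_system(), &topology));
+//   // ... pass |topology| to the executor factory, then ...
+//   iree_task_topology_free(topology);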
+typedef struct iree_task_topology_s iree_task_topology_t;
+
+// Allocates a task topology with at least |group_capacity|.
+iree_status_t iree_task_topology_allocate(iree_host_size_t group_capacity,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Frees a topology structure.
+void iree_task_topology_free(iree_task_topology_t* topology);
+
+// Parses a serialized topology in string form.
+iree_status_t iree_task_topology_parse(iree_string_view_t value,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Formats the topology as a string value that can be parsed with
+// iree_task_topology_parse.
+bool iree_task_topology_format(const iree_task_topology_t* topology,
+ iree_host_size_t buffer_capacity, char* buffer,
+ iree_host_size_t* out_buffer_length);
+
+// Returns the total group count defined by the topology.
+iree_host_size_t iree_task_topology_group_count(
+ const iree_task_topology_t* topology);
+
+// Returns the group information for the given group index.
+const iree_task_topology_group_t* iree_task_topology_get_group(
+ const iree_task_topology_t* topology, iree_host_size_t group_index);
+
+// Pushes a new group onto the topology set.
+// The provided group data will be copied into the topology structure.
+iree_status_t iree_task_topology_push_group(
+ iree_task_topology_t* topology, const iree_task_topology_group_t* group);
+
+// Allocates a topology with the specified number of groups.
+// 0 is a valid value, indicating that only donated threads will be used to
+// perform work. Groups will have no specific affinity and rely on the OS
+// scheduler to ensure they are distributed in a meaningful way; this generally
+// works out as threads created within a process are usually rotated across
+// preferred processors by default.
+iree_status_t iree_task_topology_from_group_count(
+ iree_host_size_t group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Allocates a topology with one group for each physical core in the machine.
+// If detailed cache information is not available this is a decent
+// approximation that can be used as a fallback.
+iree_status_t iree_task_topology_from_physical_cores(
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Allocates a topology with one group for each physical core in the machine
+// with the given microarchitecture specified as a cpuinfo_uarch value.
+iree_status_t iree_task_topology_from_physical_cores_with_uarch(
+ uint32_t cpuinfo_uarch, iree_host_size_t max_core_count,
+ iree_allocator_t allocator, iree_task_topology_t** out_topology);
+
+// Returns true if the given |core| passes the filter and should be included.
+// |user_data| is the value passed alongside the filter function.
+typedef bool (*iree_task_topology_core_filter_t)(
+ const struct cpuinfo_core* core, uintptr_t user_data);
+
+// Allocates a topology with one group for each core that matches |filter_fn|.
+iree_status_t iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_t filter_fn, uintptr_t filter_fn_data,
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Allocates a topology with one group for each unique L2 cache group across
+// all available cores. This optimizes for temporal and spatial cache locality
+// but may suffer from oversubscription if there are other processes trying to
+// use the same cores.
+iree_status_t iree_task_topology_from_unique_l2_cache_groups(
+ iree_host_size_t max_group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// TODO(benvanik): more? or just make users implement as desired? Ideas:
+// - _from_unique_l2_cache_groups but with a min/max count (N% utilization)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TOPOLOGY_H_
diff --git a/iree/task/topology_test.cc b/iree/task/topology_test.cc
new file mode 100644
index 0000000..2b7a665
--- /dev/null
+++ b/iree/task/topology_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/topology.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(TopologyTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/tuning.h b/iree/task/tuning.h
new file mode 100644
index 0000000..57fb434
--- /dev/null
+++ b/iree/task/tuning.h
@@ -0,0 +1,130 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TUNING_H_
+#define IREE_TASK_TUNING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Maximum number of workers that an executor can manage.
+// A 64 worker hard limit is based on us using uint64_t as a bitmask to select
+// workers. It's easy to go smaller (just use fewer bits) if it's known that
+// only <64 will ever be used (such as for devices with 2 cores).
+#define IREE_TASK_EXECUTOR_MAX_WORKER_COUNT (64)
+
+// Initial number of slice tasks that are allocated in the executor pool.
+// Increasing this number will decrease initial allocation storms in cases of
+// extremely wide fan-out (many dispatches with many thousands of slices) at the
+// cost of a higher minimum memory consumption.
+//
+// Set to zero if sliced dispatches will not be used or will be allocated by the
+// caller to avoid fixed overhead associated with the internal executor pool.
+#define IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER (0)
+
+// Initial number of shard tasks that are allocated in the executor pool.
+// Increasing this number will decrease initial allocation storms in cases of
+// extremely wide concurrency regions (many dispatches running at the same time)
+// at the cost of a higher minimum memory consumption.
+#define IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER (4)
+
+// Maximum number of simultaneous waits an executor may perform as part of a
+// wait-any operation. A larger value may enable better wake coalescing by the
+// kernel. This limit only applies to wait tasks that have been scheduled and
+// promoted to the root executor waiting list. There may be any number of
+// waits deeper in the pipeline so long as they don't all become ready
+// simultaneously.
+//
+// Realistically, though, if we have more than 64 outstanding **root** waits
+// it's hard to reason about if/when the executor queue could make forward
+// progress, and that indicates a possible error in task assignment.
+//
+// Also, the underlying iree_wait_set_t may not support more than 64 handles on
+// certain platforms without emulation. Trying to keep us on the fast-path
+// with a reasonable number seems fine for now until we have a need for more.
+//
+// NOTE: we reserve 1 wait handle for our own internal use. This allows us to
+// wake the coordination worker when new work is submitted from external
+// sources.
+#define IREE_TASK_EXECUTOR_MAX_OUTSTANDING_WAITS (64 - 1)
+
+// Divides the total number of attempts that a worker will make to steal tasks
+// from other workers. By default all other workers will be attempted; setting
+// this to 2, for example, will try only half of the available workers.
+#define IREE_TASK_EXECUTOR_MAX_THEFT_ATTEMPTS_DIVISOR (1)
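+// For example (illustrative): each worker computes its max theft attempts as
+// worker_count / divisor, so with 8 workers a divisor of 2 permits up to 4
+// steal attempts per idle pass.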
+
+// Maximum number of tasks that will be stolen in one go from another worker.
+//
+// Too few tasks will cause additional overhead as the worker repeatedly sips
+// away tasks, and when it does get tasks it may suffer spatial cache locality
+// issues as it is effectively walking backwards in memory to both touch the
+// tasks and - a much larger impact - run tasks that themselves are walking
+// orders of magnitude more memory backwards.
+//
+// Too many tasks will cause additional latency on workers that may interfere
+// with higher level scheduling; for example, if a worker runs out of tasks and
+// immediately steals 8000 of them from another worker it's going to take until
+// those 8000 complete before any work that arrives specifically for the worker
+// is able to start processing.
+//
+// In real-time systems erring toward fewer tasks is better (slightly more work
+// for much lower variance in execution) while in batch-mode systems erring
+// toward more tasks is better (as latencies don't matter so long as throughput
+// is maximized).
+#define IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT \
+ IREE_TASK_EXECUTOR_MAX_WORKER_COUNT
+
+// Number of tiles that will be batched into a single slice along each XYZ dim.
+//
+// Larger numbers reduce overhead and ensure that more tiles are executed
+// locally on the same worker (== shared caches) while also increasing potential
+// latency as work-stealing (always per-slice) is less effective.
+//
+// Numbers >1 on Y and Z can be used to cluster tiles together within the same
+// slice when spatial coherence outside of the X dimension is useful.
+//
+// The current usage of this is provisional; we may do all of this from the
+// compiler and want this behavior to be relatively fixed so that we can predict
+// it better. The only thing we want to be introducing here is flexibility for
+// when worker topology differs at runtime from what is knowable during offline
+// compilation.
+#define IREE_TASK_DISPATCH_TILES_PER_SLICE_X (8)
+#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Y (1)
+#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Z (1)
+
+// Number of tiles that will be batched into a single reservation from the grid.
+// This is a maximum; if there are fewer tiles that would otherwise allow for
+// maximum parallelism then this may be ignored.
+//
+// The more tiles reserved at a time the higher the chance for latency to
+// increase as many reserved tiles are held up on one worker while another may
+// have otherwise been able to steal them and help finish them sooner.
+//
+// The fewer tiles reserved at a time the higher the chance for cache-locality
+// destroying behavior where multiple workers all stomp on the same cache lines
+// (as say worker 0 and worker 1 both fight over sequential tiles adjacent in
+// memory).
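+//
+// For example (illustrative): a 1024-tile dispatch with the default
+// reservation size of 8 below is consumed in roughly 128 reservations spread
+// across however many workers are actively processing the dispatch.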
+#define IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION (8)
+
+// Whether to enable per-tile colors for each tile tracing zone based on the
+// tile grid xyz. Not cheap and can be disabled to reduce tracing overhead.
+// TODO(#4017): make per-tile color tracing fast enough to always have on.
+#define IREE_TASK_TRACING_PER_TILE_COLORS 1
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TUNING_H_
diff --git a/iree/task/worker.c b/iree/task/worker.c
new file mode 100644
index 0000000..a2eecb2
--- /dev/null
+++ b/iree/task/worker.c
@@ -0,0 +1,359 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/worker.h"
+
+#include "iree/base/debugging.h"
+#include "iree/base/math.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/task_impl.h"
+
+static int iree_task_worker_main(iree_task_worker_t* worker);
+
+iree_status_t iree_task_worker_initialize(
+ iree_task_executor_t* executor, iree_host_size_t worker_index,
+ const iree_task_topology_group_t* topology_group,
+ iree_prng_splitmix64_state_t* seed_prng, iree_task_worker_t* out_worker) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ out_worker->executor = executor;
+ out_worker->worker_bit = iree_task_affinity_for_worker(worker_index);
+ out_worker->ideal_thread_affinity = topology_group->ideal_thread_affinity;
+ out_worker->constructive_sharing_mask =
+ topology_group->constructive_sharing_mask;
+ out_worker->max_theft_attempts =
+ executor->worker_count / IREE_TASK_EXECUTOR_MAX_THEFT_ATTEMPTS_DIVISOR;
+ iree_prng_minilcg128_initialize(iree_prng_splitmix64_next(seed_prng),
+ &out_worker->theft_prng);
+
+ iree_task_worker_state_t initial_state = IREE_TASK_WORKER_STATE_RUNNING;
+ if (executor->scheduling_mode &
+ IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP) {
+ // User is favoring startup latency vs. initial scheduling latency. Our
+ // thread will be created suspended and not first scheduled until work
+ // arrives for it, (almost) ensuring no context switches and 10x+ lower
+ // blocking startup time.
+ initial_state = IREE_TASK_WORKER_STATE_SUSPENDED;
+ }
+ iree_atomic_store_int32(&out_worker->state, initial_state,
+ iree_memory_order_relaxed);
+
+ iree_notification_initialize(&out_worker->wake_notification);
+ iree_notification_initialize(&out_worker->state_notification);
+ iree_atomic_task_slist_initialize(&out_worker->mailbox_slist);
+ iree_task_queue_initialize(&out_worker->local_task_queue);
+
+ iree_thread_create_params_t thread_params;
+ memset(&thread_params, 0, sizeof(thread_params));
+ thread_params.name = iree_make_cstring_view(topology_group->name);
+ thread_params.create_suspended =
+ initial_state == IREE_TASK_WORKER_STATE_SUSPENDED;
+ thread_params.priority_class = IREE_THREAD_PRIORITY_CLASS_NORMAL;
+ thread_params.initial_affinity = out_worker->ideal_thread_affinity;
+
+ // NOTE: if the thread creation fails we'll bail here and let the caller
+ // cleanup by calling deinitialize (which is safe because we zero init
+ // everything).
+ iree_status_t status = iree_thread_create(
+ (iree_thread_entry_t)iree_task_worker_main, out_worker, thread_params,
+ executor->allocator, &out_worker->thread);
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Returns true if the worker is in the zombie state (exited and awaiting
+// teardown).
+static bool iree_task_worker_is_zombie(iree_task_worker_t* worker) {
+ return iree_atomic_load_int32(&worker->state, iree_memory_order_seq_cst) ==
+ IREE_TASK_WORKER_STATE_ZOMBIE;
+}
+
+void iree_task_worker_deinitialize(iree_task_worker_t* worker) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Wait for the thread to enter the zombie state indicating it has exited our
+ // main function - it may still be live in the OS, but it'll not be touching
+ // any of our data structures again so it's fine to blast away.
+ if (worker->thread) {
+ iree_notification_await(&worker->state_notification,
+ (iree_condition_fn_t)iree_task_worker_is_zombie,
+ worker);
+ }
+ iree_thread_release(worker->thread);
+
+ // Release unfinished tasks by flushing the mailbox (which if we're here can't
+ // get anything more posted to it) and then discarding everything we still
+ // have a reference to.
+ iree_atomic_task_slist_discard(&worker->mailbox_slist);
+ iree_task_list_discard(&worker->local_task_queue.list);
+
+ iree_notification_deinitialize(&worker->wake_notification);
+ iree_notification_deinitialize(&worker->state_notification);
+ iree_atomic_task_slist_deinitialize(&worker->mailbox_slist);
+ iree_task_queue_deinitialize(&worker->local_task_queue);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_worker_request_exit(iree_task_worker_t* worker) {
+ if (!worker->thread) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // If the thread is already in the exiting/zombie state we don't need to do
+ // anything.
+ iree_task_worker_state_t prev_state =
+ (iree_task_worker_state_t)iree_atomic_exchange_int32(
+ &worker->state, IREE_TASK_WORKER_STATE_EXITING,
+ iree_memory_order_acq_rel);
+ switch (prev_state) {
+ case IREE_TASK_WORKER_STATE_SUSPENDED:
+ // Worker was suspended; resume it so that it can exit itself.
+ iree_thread_resume(worker->thread);
+ break;
+ case IREE_TASK_WORKER_STATE_ZOMBIE:
+ // Worker already exited; reset state to ZOMBIE.
+ iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_ZOMBIE,
+ iree_memory_order_relaxed);
+ break;
+ default:
+ // Worker now set to EXITING and should exit soon.
+ break;
+ }
+
+ // Kick the worker in case it is waiting for work.
+ iree_notification_post(&worker->wake_notification, 1);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_worker_post_tasks(iree_task_worker_t* worker,
+ iree_task_list_t* list) {
+ // Move the list into the mailbox. Note that the mailbox is LIFO and this list
+ // is concatenated with its current order preserved (which should be LIFO).
+ iree_atomic_task_slist_concat(&worker->mailbox_slist, list->head, list->tail);
+ memset(list, 0, sizeof(*list));
+}
+
+iree_task_t* iree_task_worker_try_steal_task(iree_task_worker_t* worker,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks) {
+ // Try to grab tasks from the worker; if more than one task is stolen then the
+ // first will be returned and the remaining will be added to the target queue.
+  iree_task_t* task = iree_task_queue_try_steal(&worker->local_task_queue,
+                                                target_queue, max_tasks);
+ if (task) return task;
+
+ // If we still didn't steal any tasks then let's try the slist instead.
+ task = iree_atomic_task_slist_pop(&worker->mailbox_slist);
+ if (task) return task;
+
+ return NULL;
+}
+
+// Executes a task on a worker.
+// Only task types that are scheduled to workers are handled; all others must be
+// handled by the coordinator during scheduling.
+static iree_status_t iree_task_worker_execute(iree_task_worker_t* worker,
+ iree_task_t* task) {
+ // Execute the task and resolve the task and gather any tasks that are now
+ // ready for submission to the executor. They'll be scheduled the next time
+ // the coordinator runs.
+ //
+ // TODO(benvanik): think a bit more about this timing; this ensures we have
+ // BFS behavior at the cost of the additional merge overhead - it's probably
+ // worth it?
+ // TODO(benvanik): handle partial tasks and re-queuing.
+ iree_task_submission_t pending_submission;
+ iree_task_submission_initialize(&pending_submission);
+ switch (task->type) {
+ case IREE_TASK_TYPE_CALL: {
+ IREE_RETURN_IF_ERROR(
+ iree_task_call_execute((iree_task_call_t*)task, &pending_submission));
+ break;
+ }
+ case IREE_TASK_TYPE_DISPATCH_SLICE: {
+ IREE_RETURN_IF_ERROR(iree_task_dispatch_slice_execute(
+ (iree_task_dispatch_slice_t*)task, &pending_submission));
+ break;
+ }
+ case IREE_TASK_TYPE_DISPATCH_SHARD: {
+ IREE_RETURN_IF_ERROR(iree_task_dispatch_shard_execute(
+ (iree_task_dispatch_shard_t*)task, &pending_submission));
+ break;
+ }
+ default:
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "incorrect task type for worker execution");
+ }
+
+ // NOTE: task is invalidated here!
+ task = NULL;
+
+ if (!iree_task_submission_is_empty(&pending_submission)) {
+ iree_task_executor_merge_submission(worker->executor, &pending_submission);
+ }
+ return iree_ok_status();
+}
+
+// Pumps the worker thread once, processing a single task.
+// Returns true if pumping should continue as there are more tasks remaining or
+// false if the caller should wait for more tasks to be posted.
+static bool iree_task_worker_pump_once(iree_task_worker_t* worker) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Check the local work queue for any work we know we should start
+ // processing immediately. Other workers may try to steal some of this work
+ // if we take too long.
+ iree_task_t* task = iree_task_queue_pop_front(&worker->local_task_queue);
+
+ // Check the mailbox to see if we have incoming work that has been posted.
+ // We try to greedily move it to our local work list so that we can work
+ // with the full thread-local pending task list.
+ if (!task) {
+ // NOTE: there's a potential for theft pessimization if the queue runs too
+ // low and there's nothing there when a thief goes to grab some tasks. A
+ // standout there would indicate that we weren't scheduling very well in the
+ // first place (large uneven workloads for various workers, bad distribution
+    // in the face of heterogeneous multi-core architectures where some workers
+ // complete tasks faster than others, etc).
+ task = iree_task_queue_append_from_lifo_slist(&worker->local_task_queue,
+ &worker->mailbox_slist);
+ }
+
+ // If we ran out of work assigned to this specific worker try to steal some
+ // from other workers that we hopefully share some of the cache hierarchy
+ // with. Their tasks will be moved from their local queue into ours and the
+  // first task in the queue is popped off and returned.
+ if (!task) {
+ task = iree_task_executor_try_steal_task(
+ worker->executor, worker->constructive_sharing_mask,
+ worker->max_theft_attempts, &worker->theft_prng,
+ &worker->local_task_queue);
+ }
+
+ // No tasks to run; let the caller know we want to wait for more.
+ if (!task) {
+ IREE_TRACE_ZONE_END(z0);
+ return false;
+ }
+
+ // Execute the task (may call out to arbitrary user code and may submit more
+ // tasks for execution).
+ iree_status_t status = iree_task_worker_execute(worker, task);
+
+ // TODO(#4026): propagate failure to task scope.
+ // We currently drop the error on the floor here; that's because the error
+ // should have already been propagated to the scope and everyone should be
+ // checking that before running things anyway.
+ //
+ // Since we can host work from multiple scopes and want to ensure an error
+ // in one doesn't bring down the whole system we pretend we executed
+ // something here by falling through.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+
+ IREE_TRACE_ZONE_END(z0);
+ return true; // try again
+}
+
+// Alternates between pumping ready tasks in the worker queue and waiting
+// for more tasks to arrive. Only returns when the worker has been asked by
+// the executor to exit.
+static void iree_task_worker_pump_until_exit(iree_task_worker_t* worker) {
+ // Pump the thread loop to process more tasks.
+ while (true) {
+ // Check state to see if we've been asked to exit.
+ if (iree_atomic_load_int32(&worker->state, iree_memory_order_relaxed) ==
+ IREE_TASK_WORKER_STATE_EXITING) {
+ // Thread exit requested - cancel pumping.
+ // TODO(benvanik): complete tasks before exiting?
+ break;
+ }
+
+ // If we fail to find any work to do we'll wait at the end of this loop.
+    // In order not to miss any work that is enqueued after we've already
+    // checked a particular source we use an interruptible wait token that
+ // will prevent the wait from happening if anyone touches the data
+ // structures we use.
+ iree_wait_token_t wait_token =
+ iree_notification_prepare_wait(&worker->wake_notification);
+ iree_atomic_task_affinity_set_fetch_and(&worker->executor->worker_idle_mask,
+ ~worker->worker_bit,
+ iree_memory_order_relaxed);
+
+ while (iree_task_worker_pump_once(worker)) {
+ // All work done ^, which will return false when the worker should wait.
+ }
+
+ // When we encounter a complete lack of work we can self-nominate to check
+ // the global work queue and distribute work to other threads. Only one
+ // coordinator can be running at a time so we also ensure that if another
+ // is doing its work we gracefully wait for it. It's fine to block in here
+ // as the next thing we'd have done is go idle anyway.
+
+ // First self-nominate; this *may* do something or just be ignored (if
+ // another worker is already coordinating).
+ iree_task_executor_coordinate(worker->executor, worker,
+ /*speculative=*/true);
+
+ // If nothing has been enqueued since we started this loop (so even
+ // coordination didn't find anything) we go idle. Otherwise we fall
+ // through and try the loop again.
+ if (!iree_task_queue_is_empty(&worker->local_task_queue)) {
+ // Have more work to do; loop around to try another pump.
+ iree_notification_cancel_wait(&worker->wake_notification);
+ } else {
+ IREE_TRACE_ZONE_BEGIN_NAMED(z_wait,
+ "iree_task_worker_main_pump_wake_wait");
+ iree_atomic_task_affinity_set_fetch_or(
+ &worker->executor->worker_idle_mask, worker->worker_bit,
+ iree_memory_order_relaxed);
+ iree_notification_commit_wait(&worker->wake_notification, wait_token);
+ IREE_TRACE_ZONE_END(z_wait);
+ }
+
+ // Wait completed.
+ // Jump back up and try pumping any tasks that arrived.
+ continue;
+ }
+}
+
+// Thread entry point for each worker.
+static int iree_task_worker_main(iree_task_worker_t* worker) {
+ IREE_TRACE_ZONE_BEGIN(thread_zone);
+
+ // Reset affinity (as it can change over time).
+ // TODO(benvanik): call this after waking in case CPU hotplugging happens.
+ iree_thread_request_affinity(worker->thread, worker->ideal_thread_affinity);
+
+ // Enter the running state immediately. Note that we could have been requested
+ // to exit while suspended/still starting up, so check that here before we
+ // mess with any data structures.
+ bool should_run =
+ iree_atomic_exchange_int32(&worker->state, IREE_TASK_WORKER_STATE_RUNNING,
+ iree_memory_order_seq_cst) !=
+ IREE_TASK_WORKER_STATE_EXITING;
+ if (IREE_LIKELY(should_run)) {
+ // << work happens here >>
+ iree_task_worker_pump_until_exit(worker);
+ }
+
+ IREE_TRACE_ZONE_END(thread_zone);
+ iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_ZOMBIE,
+ iree_memory_order_release);
+ iree_notification_post(&worker->state_notification, IREE_ALL_WAITERS);
+ return 0;
+}
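
The ZOMBIE hand-off at the end of iree_task_worker_main is one half of a join protocol; the other half (owned by the executor via iree_task_worker_request_exit/iree_task_worker_deinitialize, declared in worker.h below) has to flip the state, wake the worker, and then wait for the ZOMBIE publish. The following is only a sketch of what that owner side could look like under those assumptions; it ignores the SUSPENDED startup case that the real code must also handle.

#include "iree/task/worker.h"

// Sketch only: an owner-side join built from the same state/notification
// protocol used by iree_task_worker_main above. The real logic lives in
// iree_task_worker_request_exit/deinitialize; this ignores the SUSPENDED
// case (a suspended worker must first be resumed so it can observe EXITING).
static void example_join_worker(iree_task_worker_t* worker) {
  // Request the exit and wake the worker in case it is idle in its wait.
  iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_EXITING,
                          iree_memory_order_release);
  iree_notification_post(&worker->wake_notification, IREE_ALL_WAITERS);

  // Wait for the worker to publish ZOMBIE on its state_notification.
  while (iree_atomic_load_int32(&worker->state, iree_memory_order_acquire) !=
         IREE_TASK_WORKER_STATE_ZOMBIE) {
    iree_wait_token_t wait_token =
        iree_notification_prepare_wait(&worker->state_notification);
    if (iree_atomic_load_int32(&worker->state, iree_memory_order_acquire) ==
        IREE_TASK_WORKER_STATE_ZOMBIE) {
      iree_notification_cancel_wait(&worker->state_notification);
      break;
    }
    iree_notification_commit_wait(&worker->state_notification, wait_token);
  }
  // The thread handle is still valid here and can now be released safely.
}
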
diff --git a/iree/task/worker.h b/iree/task/worker.h
new file mode 100644
index 0000000..1d45a5d
--- /dev/null
+++ b/iree/task/worker.h
@@ -0,0 +1,179 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_WORKER_H_
+#define IREE_TASK_WORKER_H_
+
+#include "iree/base/synchronization.h"
+#include "iree/base/threading.h"
+#include "iree/base/tracing.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/queue.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Indicates the current state of a worker or, in the case of EXITING, the state
+// the worker should transition to.
+//
+// Transition graph:
+// SUSPENDED -> RUNNING (IDLE<->PROCESSING) -> EXITING -> ZOMBIE
+//
+// NOTE: state values are ordered such that </> comparisons can be used; ensure
+// that for example all states after resuming are > SUSPENDED and all states
+// before exiting are < EXITING.
+enum iree_task_worker_state_e {
+ // Worker has been created in a suspended state and must be resumed to wake.
+ IREE_TASK_WORKER_STATE_SUSPENDED = 0u,
+ // Worker is idle or actively processing tasks (either its own or others).
+ IREE_TASK_WORKER_STATE_RUNNING = 1u,
+  // Worker should exit (or is exiting) and will soon enter the zombie state.
+  // Coordinators can request workers to exit by setting their state to this
+  // and then waking them.
+ IREE_TASK_WORKER_STATE_EXITING = 2u,
+ // Worker has exited and entered a 🧟 state (waiting for join).
+ // The thread handle is still valid and must be destroyed.
+ IREE_TASK_WORKER_STATE_ZOMBIE = 3u,
+};
+typedef int32_t iree_task_worker_state_t;
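
Because the values are deliberately ordered, callers can use range checks rather than enumerating individual states; for example a caller might write the following (hypothetical helpers, not part of the patch):

#include <stdbool.h>

#include "iree/task/worker.h"

// Hypothetical helpers relying on the ordering guaranteed by the NOTE above.
static inline bool example_worker_state_is_running(
    iree_task_worker_state_t state) {
  return state >= IREE_TASK_WORKER_STATE_RUNNING &&
         state < IREE_TASK_WORKER_STATE_EXITING;
}
static inline bool example_worker_state_is_shutting_down(
    iree_task_worker_state_t state) {
  return state >= IREE_TASK_WORKER_STATE_EXITING;
}
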
+
+// A worker within the executor pool.
+//
+// NOTE: fields in here are touched from multiple threads with lock-free
+// techniques. The alignment of the entire iree_task_worker_t as well as the
+// alignment and padding between particular fields is carefully (though perhaps
+// not yet correctly) selected; see the 'LAYOUT' comments below.
+typedef struct iree_task_worker_s {
+ // A LIFO mailbox used by coordinators to post tasks to this worker.
+ // As workers self-nominate to be coordinators and fan out dispatch slices
+ // they can directly emplace those slices into the workers that should execute
+ // them based on the work distribution policy. When workers go to look for
+ // more work after their local queue empties they will flush this list and
+ // move all of the tasks into their local queue and restart processing.
+ // LAYOUT: must be 64b away from local_task_queue.
+ iree_atomic_task_slist_t mailbox_slist;
+
+ // Current state of the worker (iree_task_worker_state_t).
+ // LAYOUT: frequent access; next to wake_notification as they are always
+ // accessed together.
+ iree_atomic_int32_t state;
+
+ // Notification signaled when the worker should wake (if it is idle).
+ // LAYOUT: next to state for similar access patterns; when posting other
+ // threads will touch mailbox_slist and then send a wake
+ // notification.
+ iree_notification_t wake_notification;
+
+ // Notification signaled when the worker changes any state.
+ iree_notification_t state_notification;
+
+ // Parent executor that can be used to access the global work queue or task
+ // pool. Executors always outlive the workers they own.
+ iree_task_executor_t* executor;
+
+ // Bit the worker represents in the various worker bitsets.
+ iree_task_affinity_set_t worker_bit;
+
+ // Ideal thread affinity for the worker thread.
+ iree_thread_affinity_t ideal_thread_affinity;
+
+ // A bitmask of other group indices that share some level of the cache
+ // hierarchy. Workers of this group are more likely to constructively share
+ // some cache levels higher up with these other groups. For example, if the
+ // workers in a group all share an L2 cache then the groups indicated here may
+ // all share the same L3 cache.
+ iree_task_affinity_set_t constructive_sharing_mask;
+
+ // Maximum number of attempts to make when trying to steal tasks from other
+ // workers. This could be 64 (try stealing from all workers) or just a handful
+ // (try stealing from these 3 other cores that share your L3 cache).
+ uint32_t max_theft_attempts;
+
+  // PRNG state used to select work-stealing victims (ensures we don't always
+  // favor one victim). Only ever touched by the worker thread as it steals work.
+ iree_prng_minilcg128_state_t theft_prng;
+
+ // Thread handle of the worker. If the thread has exited the handle will
+ // remain valid so that the executor can query its state.
+ iree_thread_t* thread;
+
+ // Destructive interference padding between the mailbox and local task queue
+ // to ensure that the worker - who is pounding on local_task_queue - doesn't
+ // contend with submissions or coordinators dropping new tasks in the mailbox.
+ //
+ // TODO(benvanik): test on 32-bit platforms; I'm pretty sure we'll always be
+ // past the iree_hardware_constructive_interference_size given the bulk of
+ // stuff above, but it'd be nice to guarantee it.
+ uint8_t _padding[8];
+
+ // Worker-local FIFO queue containing the slices that will be processed by the
+ // worker. This queue supports work-stealing by other workers if they run out
+ // of work of their own.
+ // LAYOUT: must be 64b away from mailbox_slist.
+ iree_task_queue_t local_task_queue;
+} iree_task_worker_t;
+static_assert(offsetof(iree_task_worker_t, mailbox_slist) +
+ sizeof(iree_atomic_task_slist_t) <
+ iree_hardware_constructive_interference_size,
+ "mailbox_slist must be in the first cache line");
+static_assert(offsetof(iree_task_worker_t, local_task_queue) >=
+ iree_hardware_constructive_interference_size,
+ "local_task_queue must be separated from mailbox_slist by "
+ "at least a cache line");
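
The LAYOUT rules and asserts above reduce to one constraint: a field hammered by other threads (mailbox_slist) must not share a cache line with the field the owning worker hammers (local_task_queue). Below is a reduced sketch of the same false-sharing guard in isolation, with 64 standing in for the cache-line size that the real code takes from the iree_hardware_* constants; it is an illustration only.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Sketch only: keep a producer-written field and a consumer-written field on
// different cache lines so writes to one never invalidate the other's line.
// 64 is an assumed line size; the patch uses the iree_hardware_* constants.
typedef struct example_padded_s {
  int32_t producer_hot;                    // written by other threads
  uint8_t _padding[64 - sizeof(int32_t)];  // pads out the first cache line
  int32_t consumer_hot;                    // written only by the owning thread
} example_padded_t;
static_assert(offsetof(example_padded_t, consumer_hot) >= 64,
              "consumer_hot must start on its own cache line");
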
+
+// Initializes a worker by creating its thread and configuring it for receiving
+// tasks. Where supported the worker will be created in a suspended state so
+// that we aren't creating a thundering herd on startup:
+// https://en.wikipedia.org/wiki/Thundering_herd_problem
+iree_status_t iree_task_worker_initialize(
+ iree_task_executor_t* executor, iree_host_size_t worker_index,
+ const iree_task_topology_group_t* topology_group,
+ iree_prng_splitmix64_state_t* seed_prng, iree_task_worker_t* out_worker);
+
+// Deinitializes a worker that has successfully exited. The worker must be in
+// the IREE_TASK_WORKER_STATE_ZOMBIE state.
+void iree_task_worker_deinitialize(iree_task_worker_t* worker);
+
+// Requests that the worker begin exiting (if it hasn't already).
+// If the worker is actively processing tasks it will wait until it has
+// completed all it can and is about to go idle prior to exiting.
+//
+// May be called from any thread (including the worker thread).
+void iree_task_worker_request_exit(iree_task_worker_t* worker);
+
+// Posts a FIFO list of tasks to the worker mailbox. The target worker takes
+// ownership of the tasks and will be woken if it is currently idle.
+//
+// May be called from any thread (including the worker thread).
+void iree_task_worker_post_tasks(iree_task_worker_t* worker,
+ iree_task_list_t* list);
+
+// Tries to steal up to |max_tasks| tasks from the back of the worker queue.
+// Returns NULL if no tasks are available. Otherwise up to |max_tasks| tasks
+// from the tail of the worker FIFO are moved to |target_queue| and the first
+// of the stolen tasks is returned. While tasks from the FIFO are preferred
+// this may also steal tasks from the mailbox.
+iree_task_t* iree_task_worker_try_steal_task(iree_task_worker_t* worker,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_WORKER_H_
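
Taken together the declarations above describe the lifecycle the executor drives for each worker: initialize (possibly suspended), post tasks, request exit, and deinitialize once the worker has reached ZOMBIE. The following is a hedged sketch of that call order; the executor/topology/PRNG arguments are assumed to come from the surrounding executor code, and how the caller waits for the ZOMBIE transition is elided. It is not the real executor implementation.

#include "iree/base/api.h"
#include "iree/task/worker.h"

// Sketch only: the call order an executor might use for a single worker.
// `executor`, `group`, `seed_prng`, and `ready_tasks` are assumed to come
// from the surrounding executor code; not part of the patch.
static iree_status_t example_worker_lifecycle(
    iree_task_executor_t* executor, const iree_task_topology_group_t* group,
    iree_prng_splitmix64_state_t* seed_prng, iree_task_list_t* ready_tasks) {
  iree_task_worker_t worker;
  IREE_RETURN_IF_ERROR(iree_task_worker_initialize(
      executor, /*worker_index=*/0, group, seed_prng, &worker));

  // Hand the worker a batch of ready tasks; it is woken if currently idle.
  iree_task_worker_post_tasks(&worker, ready_tasks);

  // ... the worker pumps, coordinates, and steals as needed ...

  // Begin shutdown and tear the worker down once it has exited.
  iree_task_worker_request_exit(&worker);
  // ... wait for IREE_TASK_WORKER_STATE_ZOMBIE, e.g. via state_notification
  //     (see the join sketch after worker.c above) ...
  iree_task_worker_deinitialize(&worker);
  return iree_ok_status();
}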