Adding an entire RTOS-scale task scheduling system.
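For context during review, a minimal usage sketch of the new API (not part of the diff): only the iree_task_executor_* entry points below appear in this change; the topology helpers are assumed from iree/task/topology.h and their exact signatures may differ.

// Hedged sketch: create an executor from the machine topology and release it.
#include "iree/task/executor.h"
#include "iree/task/topology.h"

static iree_status_t example_create_executor(iree_allocator_t allocator) {
  // Build a topology with one group per physical core (assumed helper).
  iree_task_topology_t topology;
  iree_task_topology_initialize_from_physical_cores(/*max_core_count=*/8,
                                                    &topology);

  // Workers are created suspended and only wake on first submission.
  iree_task_executor_t* executor = NULL;
  iree_status_t status = iree_task_executor_create(
      IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP, &topology, allocator,
      &executor);
  iree_task_topology_deinitialize(&topology);  // only needed during creation
  if (!iree_status_is_ok(status)) return status;

  // ... build an iree_task_submission_t, iree_task_executor_submit(), and
  // iree_task_executor_flush() as described in iree/task/executor.h ...

  iree_task_executor_release(executor);
  return iree_ok_status();
}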
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9f3888..98c1268 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -499,6 +499,7 @@
add_subdirectory(iree/hal)
add_subdirectory(iree/modules)
add_subdirectory(iree/schemas)
+add_subdirectory(iree/task)
add_subdirectory(iree/testing)
add_subdirectory(iree/test)
diff --git a/iree/base/debugging.h b/iree/base/debugging.h
index d6cb377..538ba08 100644
--- a/iree/base/debugging.h
+++ b/iree/base/debugging.h
@@ -12,10 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// NOTE: the best kind of synchronization is no synchronization; always try to
-// design your algorithm so that you don't need anything from this file :)
-// See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html
-
#ifndef IREE_BASE_DEBUGGING_H_
#define IREE_BASE_DEBUGGING_H_
diff --git a/iree/task/BUILD b/iree/task/BUILD
new file mode 100644
index 0000000..aa31386
--- /dev/null
+++ b/iree/task/BUILD
@@ -0,0 +1,133 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+ licenses = ["notice"], # Apache 2.0
+)
+
+cc_library(
+ name = "task",
+ srcs = [
+ "executor.c",
+ "executor_impl.h",
+ "list.c",
+ "pool.c",
+ "post_batch.c",
+ "post_batch.h",
+ "queue.c",
+ "scope.c",
+ "submission.c",
+ "task.c",
+ "task_impl.h",
+ "topology.c",
+ "worker.c",
+ "worker.h",
+ ],
+ hdrs = [
+ "affinity_set.h",
+ "executor.h",
+ "list.h",
+ "pool.h",
+ "queue.h",
+ "scope.h",
+ "submission.h",
+ "task.h",
+ "topology.h",
+ "tuning.h",
+ ],
+ deps = [
+ "//iree/base:api",
+ "//iree/base:atomic_slist",
+ "//iree/base:core_headers",
+ "//iree/base:synchronization",
+ "//iree/base:threading",
+ "//iree/base:tracing",
+ "//iree/base:wait_handle",
+ "@com_github_pytorch_cpuinfo//:cpuinfo",
+ ],
+)
+
+cc_test(
+ name = "executor_test",
+ srcs = ["executor_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/base:core_headers",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "list_test",
+ srcs = ["list_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "pool_test",
+ srcs = ["pool_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "queue_test",
+ srcs = ["queue_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "scope_test",
+ srcs = ["scope_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/task/testing:test_util",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
+
+cc_test(
+ name = "topology_test",
+ srcs = ["topology_test.cc"],
+ deps = [
+ ":task",
+ "//iree/base:api",
+ "//iree/testing:gtest",
+ "//iree/testing:gtest_main",
+ ],
+)
diff --git a/iree/task/CMakeLists.txt b/iree/task/CMakeLists.txt
new file mode 100644
index 0000000..f365ccf
--- /dev/null
+++ b/iree/task/CMakeLists.txt
@@ -0,0 +1,134 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+ NAME
+ task
+ HDRS
+ "affinity_set.h"
+ "executor.h"
+ "list.h"
+ "pool.h"
+ "queue.h"
+ "scope.h"
+ "submission.h"
+ "task.h"
+ "topology.h"
+ "tuning.h"
+ SRCS
+ "executor.c"
+ "executor_impl.h"
+ "list.c"
+ "pool.c"
+ "post_batch.c"
+ "post_batch.h"
+ "queue.c"
+ "scope.c"
+ "submission.c"
+ "task.c"
+ "task_impl.h"
+ "topology.c"
+ "worker.c"
+ "worker.h"
+ DEPS
+ cpuinfo
+ iree::base::api
+ iree::base::atomic_slist
+ iree::base::core_headers
+ iree::base::synchronization
+ iree::base::threading
+ iree::base::tracing
+ iree::base::wait_handle
+ PUBLIC
+)
+
+iree_cc_test(
+ NAME
+ executor_test
+ SRCS
+ "executor_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::base::core_headers
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ list_test
+ SRCS
+ "list_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ pool_test
+ SRCS
+ "pool_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ queue_test
+ SRCS
+ "queue_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ scope_test
+ SRCS
+ "scope_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::task::testing::test_util
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
+
+iree_cc_test(
+ NAME
+ topology_test
+ SRCS
+ "topology_test.cc"
+ DEPS
+ ::task
+ iree::base::api
+ iree::testing::gtest
+ iree::testing::gtest_main
+)
diff --git a/iree/task/affinity_set.h b/iree/task/affinity_set.h
new file mode 100644
index 0000000..5e8dee1
--- /dev/null
+++ b/iree/task/affinity_set.h
@@ -0,0 +1,93 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_AFFINITY_SET_H_
+#define IREE_TASK_AFFINITY_SET_H_
+
+#include "iree/base/atomics.h"
+#include "iree/base/math.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// TODO(benvanik): if IREE_TASK_EXECUTOR_MAX_WORKER_COUNT <= 32 then switch
+// these to using the 32-bit primitives. No real effect on larger 64-bit systems
+// but if we were on a smaller 32-bit system with 2 cores it's kind of silly to
+// be doing expensive 64-bit atomics on a 32-bit bus all for just 2 bits of
+// data :)
+
+//===----------------------------------------------------------------------===//
+// iree_task_affinity_set_t
+//===----------------------------------------------------------------------===//
+
+typedef uint64_t iree_task_affinity_set_t;
+
+// Allows for only a specific worker to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_worker(
+ uint8_t worker_index) {
+ return 1ull << worker_index;
+}
+
+// Allows for a range of workers to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_worker_range(
+ uint8_t worker_start, uint8_t worker_end) {
+ return ((1ull << (worker_start - 1)) - 1) ^ ((1ull << worker_end) - 1);
+}
+
+// Allows for any worker to be selected.
+static inline iree_task_affinity_set_t iree_task_affinity_for_any_worker() {
+ return UINT64_MAX;
+}
+
+#define iree_task_affinity_set_count_trailing_zeros \
+ iree_math_count_trailing_zeros_u64
+#define iree_task_affinity_set_count_ones iree_math_count_ones_u64
+#define iree_task_affinity_set_rotr iree_math_rotr_u64
+
+//===----------------------------------------------------------------------===//
+// iree_atomic_task_affinity_set_t
+//===----------------------------------------------------------------------===//
+
+typedef iree_atomic_int64_t iree_atomic_task_affinity_set_t;
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_load(
+ iree_atomic_task_affinity_set_t* set, iree_memory_order_t order) {
+ return iree_atomic_load_int64(set, order);
+}
+
+static inline void iree_atomic_task_affinity_set_store(
+ iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+ iree_memory_order_t order) {
+ iree_atomic_store_int64(set, value, order);
+}
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_fetch_and(
+ iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+ iree_memory_order_t order) {
+ return iree_atomic_fetch_and_int64(set, value, order);
+}
+
+static inline iree_task_affinity_set_t iree_atomic_task_affinity_set_fetch_or(
+ iree_atomic_task_affinity_set_t* set, iree_task_affinity_set_t value,
+ iree_memory_order_t order) {
+ return iree_atomic_fetch_or_int64(set, value, order);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_AFFINITY_SET_H_
diff --git a/iree/task/executor.c b/iree/task/executor.c
new file mode 100644
index 0000000..b4af62e
--- /dev/null
+++ b/iree/task/executor.c
@@ -0,0 +1,725 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/executor.h"
+
+#include "iree/base/debugging.h"
+#include "iree/base/math.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/task_impl.h"
+
+static void iree_task_executor_destroy(iree_task_executor_t* executor);
+
+iree_status_t iree_task_executor_create(
+ iree_task_scheduling_mode_t scheduling_mode,
+ const iree_task_topology_t* topology, iree_allocator_t allocator,
+ iree_task_executor_t** out_executor) {
+ iree_host_size_t worker_count = iree_task_topology_group_count(topology);
+ if (worker_count > IREE_TASK_EXECUTOR_MAX_WORKER_COUNT) {
+ return iree_make_status(
+ IREE_STATUS_RESOURCE_EXHAUSTED,
+ "requested %zu workers but a maximum of %d is allowed", worker_count,
+ IREE_TASK_EXECUTOR_MAX_WORKER_COUNT);
+ }
+
+ // TODO(benvanik): support a threadless mode where we have one dummy worker
+ // that just holds the lists but is pumped from donate_caller.
+ if (worker_count == 0) {
+ return iree_make_status(
+ IREE_STATUS_UNIMPLEMENTED,
+ "threadless donate-only executor mode not yet implemented");
+ }
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_ASSERT_ARGUMENT(out_executor);
+ *out_executor = NULL;
+
+ iree_host_size_t executor_size =
+ sizeof(iree_task_executor_t) + worker_count * sizeof(iree_task_worker_t);
+
+ iree_task_executor_t* executor = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_allocator_malloc(allocator, executor_size, (void**)&executor));
+ memset(executor, 0, executor_size);
+ iree_atomic_ref_count_init(&executor->ref_count);
+ executor->allocator = allocator;
+ executor->scheduling_mode = scheduling_mode;
+ iree_atomic_task_slist_initialize(&executor->incoming_ready_slist);
+ iree_atomic_task_slist_initialize(&executor->incoming_waiting_slist);
+ iree_slim_mutex_initialize(&executor->coordinator_mutex);
+ iree_slim_mutex_initialize(&executor->wait_mutex);
+
+ // Simple PRNG used to generate seeds for the per-worker PRNGs used to
+ // distribute work. This isn't strong (and doesn't need to be); it's just
+ // enough to ensure each worker gets a sufficiently random seed for itself to
+ // then generate entropy with. As a hack we use out_executor's address, as
+ // that should live on the caller stack and with ASLR that's likely pretty
+ // random itself. I'm sure somewhere a mathematician just cringed :)
+ iree_prng_splitmix64_state_t seed_prng;
+ iree_prng_splitmix64_initialize(/*seed=*/(uint64_t)(out_executor),
+ &seed_prng);
+ iree_prng_minilcg128_initialize(iree_prng_splitmix64_next(&seed_prng),
+ &executor->donation_theft_prng);
+
+ iree_status_t status = iree_ok_status();
+
+ // Wait set used to batch syscalls for polling/waiting on wait handles.
+ // This is currently limited to a relatively small max to make bad behavior
+ // clearer with nice RESOURCE_EXHAUSTED errors.
+ if (iree_status_is_ok(status)) {
+ status = iree_wait_set_allocate(IREE_TASK_EXECUTOR_MAX_OUTSTANDING_WAITS,
+ allocator, &executor->wait_set);
+ }
+
+ // Pool used for all dispatch->slice fanout tasks. These only live within the
+ // executor and since we know the precise lifetime of them we can keep them
+ // entirely within the system here.
+ if (iree_status_is_ok(status)) {
+ status = iree_task_pool_initialize(
+ allocator, sizeof(iree_task_dispatch_slice_t),
+ worker_count * IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER,
+ &executor->slice_task_pool);
+ }
+ if (iree_status_is_ok(status)) {
+ status = iree_task_pool_initialize(
+ allocator, sizeof(iree_task_dispatch_shard_t),
+ worker_count * IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,
+ &executor->shard_task_pool);
+ }
+
+ // Bring up the workers; the threads will be created here but be suspended
+ // (if the platform supports it) awaiting the first tasks getting scheduled.
+ if (iree_status_is_ok(status)) {
+ executor->worker_count = worker_count;
+ executor->workers = (iree_task_worker_t*)(executor + 1);
+ iree_task_affinity_set_t worker_idle_mask = 0;
+ iree_task_affinity_set_t worker_live_mask = 0;
+ iree_task_affinity_set_t worker_suspend_mask = 0;
+ for (iree_host_size_t i = 0; i < worker_count; ++i) {
+ iree_task_affinity_set_t worker_bit = iree_task_affinity_for_worker(i);
+ worker_idle_mask |= worker_bit;
+ worker_live_mask |= worker_bit;
+ if (executor->scheduling_mode &
+ IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP) {
+ worker_suspend_mask |= worker_bit;
+ }
+
+ iree_task_worker_t* worker = &executor->workers[i];
+ status = iree_task_worker_initialize(
+ executor, i, iree_task_topology_get_group(topology, i), &seed_prng,
+ worker);
+ if (!iree_status_is_ok(status)) break;
+ }
+ iree_atomic_task_affinity_set_store(&executor->worker_live_mask,
+ worker_live_mask,
+ iree_memory_order_relaxed);
+ iree_atomic_task_affinity_set_store(&executor->worker_suspend_mask,
+ worker_suspend_mask,
+ iree_memory_order_relaxed);
+ iree_atomic_task_affinity_set_store(&executor->worker_idle_mask,
+ worker_idle_mask,
+ iree_memory_order_relaxed);
+ }
+
+ if (!iree_status_is_ok(status)) {
+ // NOTE: destroy will ensure that any workers we have initialized are
+ // properly cleaned up.
+ iree_task_executor_destroy(executor);
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+ }
+
+ *out_executor = executor;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+static void iree_task_executor_destroy(iree_task_executor_t* executor) {
+ if (!executor) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // First ask all workers to exit. We do this prior to waiting on them to exit
+ // so that we parallelize the shutdown logic (which may flush pending tasks).
+ for (iree_host_size_t i = 0; i < executor->worker_count; ++i) {
+ iree_task_worker_t* worker = &executor->workers[i];
+ iree_task_worker_request_exit(worker);
+ }
+
+ // Now that all workers should be in the process of exiting we can join with
+ // them. Some may take longer than others to exit but that's fine as we can't
+ // return from here until they do anyway.
+ for (iree_host_size_t i = 0; i < executor->worker_count; ++i) {
+ iree_task_worker_t* worker = &executor->workers[i];
+ iree_task_worker_deinitialize(worker);
+ }
+
+ iree_wait_set_free(executor->wait_set);
+ iree_slim_mutex_deinitialize(&executor->wait_mutex);
+ iree_slim_mutex_deinitialize(&executor->coordinator_mutex);
+ iree_atomic_task_slist_deinitialize(&executor->incoming_ready_slist);
+ iree_atomic_task_slist_deinitialize(&executor->incoming_waiting_slist);
+ iree_task_pool_deinitialize(&executor->slice_task_pool);
+ iree_task_pool_deinitialize(&executor->shard_task_pool);
+ iree_allocator_free(executor->allocator, executor);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_executor_retain(iree_task_executor_t* executor) {
+ if (executor) {
+ iree_atomic_ref_count_inc(&executor->ref_count);
+ }
+}
+
+void iree_task_executor_release(iree_task_executor_t* executor) {
+ if (executor && iree_atomic_ref_count_dec(&executor->ref_count) == 1) {
+ iree_task_executor_destroy(executor);
+ }
+}
+
+// Schedules a generic task to a worker matching its affinity.
+// The task will be posted to the worker mailbox and available for the worker to
+// begin processing as soon as the |post_batch| is submitted.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_relay_to_worker(
+ iree_task_executor_t* executor, iree_task_post_batch_t* post_batch,
+ iree_task_t* task) {
+ iree_host_size_t worker_index =
+ iree_task_post_batch_select_worker(post_batch, task->affinity_set);
+ iree_task_post_batch_enqueue(post_batch, worker_index, task);
+}
+
+// Schedules all ready tasks in the |pending_submission| list.
+// Tasks may enqueue zero or more new tasks (or newly-ready/waiting tasks) to
+// |pending_submission| or queue work for posting to workers via the
+// |post_batch|.
+//
+// NOTE: the pending submission list we walk here is in FIFO order and the
+// post batch we are building is in LIFO; this means that as we pop off the
+// least recently added tasks from the submission (nice in-order traversal) we
+// are pushing them as what will become the least recent tasks in the batch.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_executor_schedule_ready_tasks(
+ iree_task_executor_t* executor, iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch) {
+ if (iree_task_list_is_empty(&pending_submission->ready_list)) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_task_t* task = NULL;
+ while ((task = iree_task_list_pop_front(&pending_submission->ready_list))) {
+ switch (task->type) {
+ case IREE_TASK_TYPE_NOP:
+ case IREE_TASK_TYPE_CALL:
+ case IREE_TASK_TYPE_DISPATCH_SLICE: {
+ // Generic routing to workers for tasks that should always run there.
+ iree_task_executor_relay_to_worker(executor, post_batch, task);
+ break;
+ }
+ case IREE_TASK_TYPE_BARRIER: {
+ // Retire the barrier to (possibly) ready up all dependent tasks.
+ // This acts as a fan-out in cases where the dependent task count >1.
+ iree_task_barrier_retire((iree_task_barrier_t*)task,
+ pending_submission);
+ break;
+ }
+ case IREE_TASK_TYPE_FENCE: {
+ // Scope fence hit; notifies the scope so that anyone waiting on the
+ // fence can be notified without us having to do so explicitly.
+ iree_task_fence_retire((iree_task_fence_t*)task, pending_submission);
+ break;
+ }
+ case IREE_TASK_TYPE_WAIT: {
+ // Waits may need to be moved into the wait list (not completed) or
+ // retired (after the wait condition is met).
+ if (task->flags & IREE_TASK_FLAG_WAIT_COMPLETED) {
+ iree_task_wait_retire((iree_task_wait_t*)task, pending_submission);
+ } else {
+ iree_task_submission_enqueue(pending_submission, task);
+ }
+ break;
+ }
+ case IREE_TASK_TYPE_DISPATCH: {
+ // Dispatches may need to be issued (fanning out the tiles to workers)
+ // or retired (after all tiles have completed).
+ if (task->flags & IREE_TASK_FLAG_DISPATCH_RETIRE) {
+ iree_task_dispatch_retire((iree_task_dispatch_t*)task,
+ pending_submission);
+ } else {
+ if (task->flags & IREE_TASK_FLAG_DISPATCH_SLICED) {
+ iree_task_dispatch_issue_sliced((iree_task_dispatch_t*)task,
+ &executor->slice_task_pool,
+ pending_submission, post_batch);
+ } else {
+ iree_task_dispatch_issue_sharded((iree_task_dispatch_t*)task,
+ &executor->shard_task_pool,
+ pending_submission, post_batch);
+ }
+ }
+ break;
+ }
+ }
+ }
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_executor_merge_submission(iree_task_executor_t* executor,
+ iree_task_submission_t* submission) {
+ // Concatenate all of the submission's tasks onto the executor's incoming lists.
+ // Note that the submission stores tasks in LIFO order such that when they are
+ // put into the LIFO atomic slist they match the order across all concats
+ // (earlier concats are later in the LIFO list).
+ iree_atomic_task_slist_concat(&executor->incoming_ready_slist,
+ submission->ready_list.head,
+ submission->ready_list.tail);
+ iree_atomic_task_slist_concat(&executor->incoming_waiting_slist,
+ submission->waiting_list.head,
+ submission->waiting_list.tail);
+
+ // NOTE: after concatenating the intrusive next_task pointers may immediately
+ // be modified by other threads. We can no longer assume anything about the
+ // submission lists and can only discard them.
+ iree_task_submission_reset(submission);
+}
+
+iree_status_t iree_task_executor_submit(iree_task_executor_t* executor,
+ iree_task_submission_t* submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Concatenate the submitted tasks onto our primary LIFO incoming lists.
+ iree_task_executor_merge_submission(executor, submission);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_executor_flush(iree_task_executor_t* executor) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Mostly a no-op today as we aren't deferring submission with the scheduling
+ // mode. Instead, we'll just run the coordinator inline to ensure all tasks
+ // are pushed to workers.
+ iree_task_executor_coordinate(executor, /*current_worker=*/NULL,
+ /*speculative=*/false);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+// Merges incoming likely-unresolved wait tasks into the primary executor lists.
+// The handle of each task will be inserted into the wait_set (where it may be
+// a duplicate).
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_merge_wait_list(
+ iree_task_executor_t* executor, iree_task_list_t* incoming_waiting_list) {
+ if (iree_task_list_is_empty(incoming_waiting_list)) return;
+
+ iree_slim_mutex_lock(&executor->wait_mutex);
+
+ // Walk the list of incoming wait tasks and add them to our wait_set.
+ iree_task_wait_t* wait_task =
+ (iree_task_wait_t*)iree_task_list_front(incoming_waiting_list);
+ do {
+ iree_status_t status =
+ iree_wait_set_insert(executor->wait_set, wait_task->wait_handle);
+ // TODO(#4026): propagate failure to the task scope.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+ wait_task = (iree_task_wait_t*)wait_task->header.next_task;
+ } while (wait_task);
+
+ iree_slim_mutex_unlock(&executor->wait_mutex);
+
+ // Add (in undefined order) to the primary wait list used for tracking the
+ // root wait tasks until they are ready.
+ iree_task_list_append(&executor->waiting_list, incoming_waiting_list);
+}
+
+// Finds the waiting task corresponding to |wake_handle| and retires it.
+// Any dependent tasks will be enqueued in the |pending_submission| for issuing.
+// If multiple tasks were waiting on the same wait handle all will be readied.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+// The wait lock must be held as the wait_set is modified.
+static void iree_task_executor_wake_waiting_task(
+ iree_task_executor_t* executor, iree_wait_handle_t wake_handle,
+ iree_task_submission_t* pending_submission) {
+ // Walk through the waiting_list and find all waits with this handle.
+ // Some may not have resolved yet and need to remain in the list.
+ iree_task_t* prev_task = NULL;
+ iree_task_t* task = iree_task_list_front(&executor->waiting_list);
+ while (task != NULL) {
+ iree_task_t* next_task = task->next_task;
+ iree_task_wait_t* wait_task = (iree_task_wait_t*)task;
+ if (wake_handle.type == wait_task->wait_handle.type &&
+ memcmp(&wake_handle.value, &wait_task->wait_handle.value,
+ sizeof(wake_handle.value)) == 0) {
+ // Found one of possibly many. If its condition is met then remove from
+ // the wait set and ready up.
+ if (iree_task_wait_check_condition(wait_task)) {
+ iree_wait_set_erase(executor->wait_set, wake_handle);
+ iree_task_list_erase(&executor->waiting_list, prev_task, task);
+ iree_task_submission_enqueue(pending_submission, task);
+ task = prev_task;
+ }
+ }
+ prev_task = task;
+ task = next_task;
+ }
+}
+
+// Polls all waiting tasks to see if they have completed and adds any newly
+// ready dependencies to |pending_submission|.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_poll_waiting_tasks(
+ iree_task_executor_t* executor,
+ iree_task_submission_t* pending_submission) {
+ if (iree_task_list_is_empty(&executor->waiting_list)) return;
+
+ // Hold the wait lock for the duration we use the wait_set.
+ if (!iree_slim_mutex_try_lock(&executor->wait_mutex)) {
+ return;
+ }
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Poll all root waiting tasks (infinite-past duration) to see if any have
+ // completed. If one or more have resolved then wake_handle will contain an
+ // unspecified wake handle.
+ int woken_tasks = 0;
+ do {
+ iree_wait_handle_t wake_handle;
+ iree_status_t status = iree_wait_any(executor->wait_set,
+ IREE_TIME_INFINITE_PAST, &wake_handle);
+ if (iree_status_is_ok(status)) {
+ // One or more waiters are ready. We don't support multi-wake right now so
+ // we'll just take the one we got back and try again.
+ iree_task_executor_wake_waiting_task(executor, wake_handle,
+ pending_submission);
+ ++woken_tasks;
+ continue;
+ } else if (iree_status_is_deadline_exceeded(status)) {
+ // Indicates nothing was woken. Gracefully bail for now.
+ break;
+ } else {
+ // (Spurious?) error during poll.
+ // TODO(#4026): propagate failure to all scopes involved.
+ // It may be ok to ignore when polling as the eventual wait will handle
+ // the full propagation. For now we assert so it's easy to see if we have
+ // tried to perform a bad iree_wait_any.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+ break;
+ }
+ } while (!iree_task_list_is_empty(&executor->waiting_list));
+
+ iree_slim_mutex_unlock(&executor->wait_mutex);
+
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, woken_tasks);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Waits for one or more waiting tasks to be ready to execute.
+// If a wait task retires, any newly-ready tasks will be added to
+// |pending_submission|.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+static void iree_task_executor_wait_any_task(
+ iree_task_executor_t* executor, iree_task_worker_t* current_worker,
+ iree_task_submission_t* pending_submission) {
+ if (iree_task_list_is_empty(&executor->waiting_list)) return;
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_slim_mutex_unlock(&executor->coordinator_mutex);
+
+ // We can't hold the coordinator lock during the wait but also need to ensure
+ // no other coordination messes with the wait set. We have a dedicated wait
+ // mutex and guard wait-set accesses (polling/waiting/etc) with that. Polls
+ // may try-lock and bail if the lock is held indicating that someone else has
+ // a non-polling wait active.
+
+ // TODO(benvanik): ensure coordinator wake semantics are modeled:
+ // - donator:
+ // attempt 0:
+ // try steal
+ // if fail to steal: coordinate
+ // attempt 1:
+ // try steal
+ // if fail to steal: await any-posted notification?
+ // - worker:
+ // attempt 0:
+ // try steal
+ // if fail to steal: coordinate
+
+ iree_slim_mutex_lock(&executor->wait_mutex);
+
+ iree_time_t deadline_ns = IREE_TIME_INFINITE_FUTURE;
+ iree_wait_handle_t wake_handle;
+ iree_status_t status =
+ iree_wait_any(executor->wait_set, deadline_ns, &wake_handle);
+
+ iree_slim_mutex_unlock(&executor->wait_mutex);
+
+ // TODO(#4026): propagate failure to all scopes involved.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+
+ iree_slim_mutex_lock(&executor->coordinator_mutex);
+
+ int woken_tasks = 0;
+ if (iree_status_is_ok(status)) {
+ // One or more waiters are ready. We don't support multi-wake right now so
+ // we'll just take the one we got back and try again.
+ iree_task_executor_wake_waiting_task(executor, wake_handle,
+ pending_submission);
+ ++woken_tasks;
+ } else if (iree_status_is_deadline_exceeded(status)) {
+ // Indicates nothing was woken. Gracefully bail and return to the
+ // coordinator to see if we should wait again.
+ } else {
+ // (Spurious?) error during wait.
+ // TODO(#4026): propagate failure to all scopes involved.
+ // Failures during waits are serious: ignoring them could lead to live-lock
+ // as tasks further in the pipeline expect them to have completed or - even
+ // worse - user code/other processes/drivers/etc may expect them to
+ // complete.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+ }
+
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, woken_tasks);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+// Dispatches tasks in the global submission queue to workers.
+// This is called by users upon submission of new tasks or by workers when they
+// run out of tasks to process. |speculative| indicates whether the coordination
+// request is done as a fallback in the event of there possibly being new work
+// available.
+//
+// If a coordination run ends up with no ready tasks and one or more waiting
+// tasks then the coordinator will wait for one of the tasks to become ready.
+// This only happens in the speculative case (so it's always a worker) as in
+// those cases the next step for the worker would have been to wait anyway. In
+// the non-speculative case the coordinator polls the wait handles to see if
+// they have resolved instead, possibly readying more tasks immediately.
+void iree_task_executor_coordinate(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ bool speculative) {
+ if (speculative) {
+ if (!iree_slim_mutex_try_lock(&executor->coordinator_mutex)) {
+ // Another thread is already holding the coordination lock.
+ // Return to the caller to wait for it to finish.
+ // TODO(benvanik): spin here if it's likely we'll have work after the
+ // other coordinator finishes - that way we don't enter the wait.
+ return;
+ }
+ } else {
+ iree_slim_mutex_lock(&executor->coordinator_mutex);
+ }
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Check for incoming submissions and move their posted tasks into our
+ // local lists. Any of the tasks here are ready to execute immediately and
+ // ones we should be able to distribute to workers without delay. The
+ // waiting tasks are to the best of the caller's knowledge not ready yet.
+ //
+ // Note that we only do this once per coordination; that's so we don't
+ // starve if submissions come in faster than we can schedule them.
+ // Coordination will run again when workers become idle and will pick up
+ // any changes then.
+ //
+ // As we schedule tasks we may spawn new ones (like a dispatch -> many
+ // dispatch slices) and we keep track of those here. By doing a pass through
+ // all ready tasks and only then merging in the new submission we get
+ // breadth-first traversal of task graphs even if they originate from
+ // various places and have no relation - hopefully leading to better average
+ // latency.
+ iree_task_submission_t pending_submission;
+ iree_task_submission_initialize_from_lifo_slist(
+ &executor->incoming_ready_slist, &pending_submission);
+ iree_task_list_append_from_fifo_slist(&pending_submission.waiting_list,
+ &executor->incoming_waiting_slist);
+
+ // Scratch coordinator submission batch used during scheduling to batch up
+ // all tasks that will be posted to each worker. We could stash this on the
+ // executor, but since which thread plays the coordinator role is effectively
+ // random it's better to ensure that these bytes never incur a cache miss by
+ // keeping them here on the stack of the chosen thread.
+ iree_task_post_batch_t* post_batch =
+ iree_alloca(sizeof(iree_task_post_batch_t) +
+ executor->worker_count * sizeof(iree_task_list_t));
+ iree_task_post_batch_initialize(executor, current_worker, post_batch);
+
+ // Poll the waiting tasks to see if any have resolved. This dramatically
+ // cuts latency in cases where the wait handle completes prior to us
+ // entering the real wait. When we have semaphores sequencing back-to-back
+ // work this ensures that we pack in future dispatch work earlier vs.
+ // waiting for a full thread hop.
+ //
+ // If any waits have resolved then they'll be moved to the ready list here
+ // and then get processed FIFO with the tasks that were ready in the
+ // request.
+ iree_task_executor_poll_waiting_tasks(executor, &pending_submission);
+
+ // Schedule all ready tasks in this batch. Some may complete inline (such
+ // as ready barriers with all their dependencies resolved) while others may
+ // be scheduled on workers via the post batch.
+ iree_task_executor_schedule_ready_tasks(executor, &pending_submission,
+ post_batch);
+
+ // Merge any newly waiting tasks into the global wait list.
+ iree_task_executor_merge_wait_list(executor,
+ &pending_submission.waiting_list);
+
+ // Post all new work to workers; they may wake and begin executing
+ // immediately. Returns whether this worker has new tasks for it to work on.
+ bool did_post = iree_task_post_batch_submit(post_batch);
+ if (!did_post && speculative) {
+ // No work was found; wait on one or more of our wait handles.
+ // This will block the calling thread but that's fine as they were going
+ // to wait anyway and were just speculatively seeing if there was work first
+ // by requesting coordination. If work completes here we'll catch it on
+ // the poll next loop around.
+ iree_task_executor_wait_any_task(executor, current_worker,
+ &pending_submission);
+ }
+
+ // Merge any new work into the submission list for future coordinators to
+ // deal with - we don't want the possibility of starvation by looping on this.
+ iree_task_executor_merge_submission(executor, &pending_submission);
+
+ iree_slim_mutex_unlock(&executor->coordinator_mutex);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_task_t* iree_task_executor_try_steal_task_from_affinity_set(
+ iree_task_executor_t* executor, iree_task_affinity_set_t victim_mask,
+ uint32_t max_theft_attempts, int rotation_offset,
+ iree_task_queue_t* local_task_queue) {
+ if (!victim_mask) return NULL;
+ max_theft_attempts = iree_min(max_theft_attempts,
+ iree_task_affinity_set_count_ones(victim_mask));
+ victim_mask = iree_task_affinity_set_rotr(victim_mask, rotation_offset);
+
+ int worker_index = rotation_offset;
+ iree_task_affinity_set_t mask =
+ iree_task_affinity_set_rotr(victim_mask, worker_index);
+ for (uint32_t i = 0; i < max_theft_attempts; ++i) {
+ // Find the last set bit and skip to it. This avoids the need for doing
+ // a full O(n) scan and instead gets us at O(popcnt) * O(ctz).
+ //
+ // Example: sharing mask = 0b01010101
+ // mask_rotation = 3 (randomly selected)
+ // mask = 0b01010101 rotr 3 = 0b10101010
+ // for (i = 0; i < 4; ++i)
+ // offset = ctz(0b10101010) = 1
+ // mask_rotation += 1 = 4
+ // mask >>= 1 = 0b01010101
+ // victim_index = 4 % 64 = 4
+ int offset = iree_task_affinity_set_count_trailing_zeros(mask);
+ int victim_index = (worker_index + offset) % executor->worker_count;
+ worker_index += offset + 1;
+ mask = mask >> (offset + 1);
+ iree_task_worker_t* victim_worker = &executor->workers[victim_index];
+
+ // Policy: steal a chunk of tasks at the tail of the victim queue.
+ // This will steal multiple tasks from the victim up to the specified max
+ // and move them into our local task queue. Not all tasks will be stolen
+ // and the assumption is that over a large-enough random distribution of
+ // thievery taking ~half of the tasks each time (across all queues) will
+ // lead to a relatively even distribution.
+ iree_task_t* task = iree_task_worker_try_steal_task(
+ victim_worker, local_task_queue,
+ /*max_tasks=*/IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT);
+ if (task) return task;
+ }
+
+ // No tasks found in victim_mask.
+ return NULL;
+}
+
+// Tries to steal an entire task from a sibling worker (based on topology).
+// Returns a task that is available (has not yet begun processing at all).
+// May steal multiple tasks and add them to the |local_task_queue|.
+//
+// We do a scan through ideal victims indicated by the
+// |constructive_sharing_mask|; these are the workers most likely to have some
+// cache benefits to taking their work as they share some level of the cache
+// hierarchy and should be better to steal from than any random worker.
+//
+// To prevent biasing any particular victim we use a fast prng function to
+// select where in the set of potential victims defined by the topology
+// group we steal. We (probably) don't need anything super complex here so
+// instead of bouncing around at random we just select the starting point in
+// our search and then go in-order.
+iree_task_t* iree_task_executor_try_steal_task(
+ iree_task_executor_t* executor,
+ iree_task_affinity_set_t constructive_sharing_mask,
+ uint32_t max_theft_attempts, iree_prng_minilcg128_state_t* theft_prng,
+ iree_task_queue_t* local_task_queue) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ const int worker_count = executor->worker_count;
+ iree_task_worker_t* workers = executor->workers;
+
+ // Limit the workers we will steal from to the ones that are currently live
+ // and not idle.
+ iree_task_affinity_set_t victim_mask =
+ iree_atomic_task_affinity_set_load(&executor->worker_live_mask,
+ iree_memory_order_relaxed) &
+ ~iree_atomic_task_affinity_set_load(&executor->worker_idle_mask,
+ iree_memory_order_relaxed);
+
+ // TODO(benvanik): it may be possible to rework this such that we better
+ // use the prng; for example, instead of all this rotating stuff we could just
+ // generate an 8-bit number (or even split it into two 4-bit numbers) per
+ // theft attempt. The current rotation strategy is biased toward the same try
+ // ordering vs. what we may really want with an unbiased random selection.
+ int rotation_offset = iree_prng_minilcg128_next_uint8(theft_prng) &
+ (8 * sizeof(iree_task_affinity_set_t) - 1);
+
+ // Try first with the workers we may have some caches shared with. This
+ // helps to prevent cache invalidations/availability updates as it's likely
+ // that we won't need to go back to main memory (or higher cache tiers) in the
+ // event that the thief and victim are running close to each other in time.
+ iree_task_t* task = iree_task_executor_try_steal_task_from_affinity_set(
+ executor, victim_mask & constructive_sharing_mask, max_theft_attempts,
+ rotation_offset, local_task_queue);
+ if (task) {
+ IREE_TRACE_ZONE_APPEND_TEXT(z0, "local");
+ } else {
+ task = iree_task_executor_try_steal_task_from_affinity_set(
+ executor, victim_mask & ~constructive_sharing_mask, max_theft_attempts,
+ rotation_offset, local_task_queue);
+ if (task) {
+ IREE_TRACE_ZONE_APPEND_TEXT(z0, "non-local");
+ }
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return task;
+}
+
+iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
+ iree_wait_handle_t* wait_handle,
+ iree_time_t deadline_ns) {
+ // Not implemented; just wait. Unclear we want this yet and it's complex.
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_status_t status = iree_wait_one(wait_handle, deadline_ns);
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
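Since iree_task_executor_donate_caller currently reduces to a plain wait, a caller-side sketch might look like the following (hedged: iree_event_t and the iree_event_* helpers come from iree/base/wait_handle.h and are assumptions here, as is the idea of a final call task signaling the event):

// Sketch: block the calling thread until a task in the submitted DAG signals
// |event| (e.g. a trailing call task that runs iree_event_set(&event)).
iree_event_t event;
IREE_RETURN_IF_ERROR(iree_event_initialize(/*initial_state=*/false, &event));
// ... build + submit + flush a submission whose final task signals |event| ...
iree_status_t wait_status = iree_task_executor_donate_caller(
    executor, &event, IREE_TIME_INFINITE_FUTURE);
// handle |wait_status| (ok, deadline exceeded, etc) as appropriate
iree_event_deinitialize(&event);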
diff --git a/iree/task/executor.h b/iree/task/executor.h
new file mode 100644
index 0000000..0f49878
--- /dev/null
+++ b/iree/task/executor.h
@@ -0,0 +1,384 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_EXECUTOR_H_
+#define IREE_TASK_EXECUTOR_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomics.h"
+#include "iree/base/wait_handle.h"
+#include "iree/task/scope.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+#include "iree/task/topology.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+//==============================================================================
+// IREE Task Executor
+//==============================================================================
+//
+// Roughly models wavefront-style GPU dispatch. Users submit task DAGs with
+// fine-grained dependency information for the executor to schedule across a set
+// of workers. As tasks become ready to execute they are placed into per-worker
+// FIFOs and workers run through them in a breadth-first fashion executing and
+// resolving tasks and building up new waves of ready tasks. Workers will always
+// make forward progress and only when they run out of work will they attempt to
+// self-nominate to play the role of coordinator and schedule any newly-
+// submitted or readied tasks. Only once all tasks have been retired and only
+// waits on external resources remain does the task system suspend itself until
+// more tasks are submitted or an external wait resolves.
+//
+// Our goal is to do the minimal amount of work to get the maximum amount of
+// concurrency the user requests or allows (by way of their dependencies).
+// Whether on a single core where you want to timeshare with an application or
+// across hundreds the same architecture holds. Where there is inefficiency it's
+// almost always surmountable with properly constructed tasks: choose the right
+// granularity for dispatches, choose the right fan-out for tiles within those
+// dispatches, choose the right places to insert barriers to force fan-in to
+// reduce memory utilization or right places to batch barriers to allow less
+// synchronization with the work queue, etc. All of those choices are ones this
+// system is designed to handle dynamically via the task graphs provided that
+// are themselves (in the IREE world) mapped 1:1 with the GPU-esque grid
+// dispatch and command buffer model. It's a super-power if a human is authoring
+// all that information but what makes it particularly powerful here is that we
+// are authoring that in the compiler based on a tremendous amount of
+// higher-level information we can derive from the whole program. Every bit of
+// dynamism here is matched with the ability to tighten down the screws and gain
+// back anything lost by way of compiler improvements while also being able to
+// generalize out to far more complex systems (higher parallelism, higher and
+// more efficient concurrency, etc).
+//
+// The design of this system allows for a spectrum of dynamic behavior based on
+// desired usage scenarios:
+// - variable number of persistent workers based on compute/memory topology
+// - per-task scope and per-task worker affinity to control for:
+// - power islands on multi-core systems with fine-grained power management
+// - heterogeneous microarchitectures in big.LITTLE/etc compute complexes
+// - task isolation between multiple active requests or users
+// - latency prioritization by partitioning workloads by priority
+// - scheduling overhead tradeoffs by varying:
+// - coordination/flush frequency to reduce cross-thread communication
+// - by statically inserting dispatch slices to avoid dynamic fan-out
+// - thread donation to avoid likely context switches upon submit+wait
+// - multi-wait across all users by sharing a wait set
+// - per-worker work-stealing specification of victim workers in the topology
+// - limited work-stealing to prevent chained stealing/cascading theft
+//
+// Required reading:
+// https://www.usenix.org/conference/osdi20/presentation/ma
+// (closest equivalent to this scheduling model)
+// https://www.cister-labs.pt/summer2017/w3/Parallelism%20-%20Dag%20Model.pdf
+// (good overall, our worker local lists/mailboxes are work-stealing queues)
+// http://people.csail.mit.edu/shanir/publications/Flat%20Combining%20SPAA%2010.pdf
+// (what we model with the coordinator)
+// http://mcg.cs.tau.ac.il/papers/opodis2010-quasi.pdf
+// (we exploit relaxed consistency for all our cross-thread queuing, see ^)
+// https://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++.htm
+// (moodycamel is the state of the art on scaling queues; read it all)
+// https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
+// https://blog.molecular-matters.com/2015/09/08/job-system-2-0-lock-free-work-stealing-part-2-a-specialized-allocator/
+// https://blog.molecular-matters.com/2015/09/25/job-system-2-0-lock-free-work-stealing-part-3-going-lock-free/
+// https://blog.molecular-matters.com/2015/11/09/job-system-2-0-lock-free-work-stealing-part-4-parallel_for/
+// https://blog.molecular-matters.com/2016/04/04/job-system-2-0-lock-free-work-stealing-part-5-dependencies/
+// (fantastic 5 part blog series; very similar to this)
+// http://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/jacm99.pdf
+// (provably optimal dynamic nested parallelism in 1999; basically: GPUs)
+// http://www.cs.cmu.edu/~blelloch/papers/locality2000.pdf
+// (followup to jacm99; using locality now to guide work stealing)
+// https://www.cs.cmu.edu/afs/cs.cmu.edu/Web/People/blelloch/papers/CGK07.pdf
+// (worker affinity and task locality for constructive cache sharing)
+//
+//==============================================================================
+// Life of an iree_task_t / high level algorithm
+//==============================================================================
+//
+// 1. Users allocate (from iree_task_pool_t, slice from arenas, etc) and
+// construct a DAG of iree_task_ts.
+//
+// a. Task dependency information is setup via completion_tasks for simple
+// dependencies, implicit fan-out/fan-in (dispatches), or explicit fan-in
+// (barriers).
+//
+// b. Tasks are pushed into iree_task_submission_t (LIFO, thread-local list).
+// If the task has no unmet initial dependencies it is placed into
+// the ready_list. If it is initially waiting on an external resource such
+// as iree_wait_handle_t then it is placed into the waiting_list.
+//
+// 2. iree_task_executor_submit (LIFO, atomic slist)
+// Submissions have their task thread-local lists concatenated into a LIFO
+// incoming_ready_slist or incoming_waiting_slist shared by the executor.
+//
+// 3. iree_task_executor_flush (or a worker puts on its coordinator hat 🎩)
+//
+// a. Tasks are flushed from the incoming_ready_slist into a coordinator-local
+// FIFO task queue and incoming_waiting_slist is concatenated into the
+// primary executor waitlist.
+//
+// b. iree_task_executor_poll_waiting_tasks: finds waiting tasks that are now
+// ready and they are moved into the coordinator-local FIFO task queue.
+//
+// c. iree_task_executor_schedule_ready_tasks: walks the FIFO task queue and
+// builds a iree_task_post_batch_t containing the per-worker tasks
+// in LIFO order.
+//
+// d. iree_task_post_batch_submit: per-worker tasks are pushed to their
+// respective iree_task_worker_t mailbox_slist and the workers with new
+// tasks are notified to wake up (if not already awake).
+//
+// 4. iree_task_worker_main_pump_once (LIFO mailbox -> FIFO thread-local list)
+// When either woken or after completing all available thread-local work
+// each worker will check its mailbox_slist to see if any tasks have been
+// posted.
+//
+// a. Tasks are flushed from the LIFO mailbox into the local_task_queue FIFO
+// for the particular worker.
+//
+// b. If the mailbox is empty the worker *may* attempt to steal work from
+// another nearby worker in the topology.
+//
+// c. Any tasks in the local_task_queue are executed until empty.
+// Tasks are retired and dependent tasks (via completion_task or barriers)
+// are made ready and placed in the executor incoming_ready_slist or
+// incoming_waiting_slist as with iree_task_executor_submit.
+//
+// d. If no more thread-local work is available and the mailbox_slist is
+// empty the worker will self-nominate for coordination and attempt to don
+// the coordinator hat with iree_task_executor_coordinate. If new work
+// becomes available after coordination, step 4 repeats.
+//
+// e. If another worker (or iree_task_executor_flush) is already wearing the
+// coordinator hat then the worker will go to sleep.
+//
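To make steps 1-3 concrete, a minimal hedged sketch of building and submitting a small fan-in DAG follows (not part of the diff). It assumes the task/scope/submission constructors from task.h, scope.h, and submission.h (iree_task_call_initialize, iree_task_fence_initialize, iree_task_barrier_initialize, iree_task_set_completion_task, iree_task_scope_wait_idle, and the iree_task_call_closure_t type), none of which are shown in this excerpt, so treat the exact signatures as approximate:

// Hedged sketch of steps 1-3: two calls fan in to a barrier that readies a
// fence which signals the scope when retired.
//
//   call_a --\
//             +--> barrier --> fence
//   call_b --/
static iree_status_t example_submit_dag(iree_task_executor_t* executor,
                                        iree_task_scope_t* scope,
                                        iree_task_call_closure_t closure) {
  iree_task_call_t call_a, call_b;
  iree_task_fence_t fence;
  iree_task_barrier_t barrier;
  iree_task_call_initialize(scope, closure, &call_a);
  iree_task_call_initialize(scope, closure, &call_b);
  iree_task_fence_initialize(scope, &fence);
  iree_task_t* barrier_successors[1] = {&fence.header};
  iree_task_barrier_initialize(scope, IREE_ARRAYSIZE(barrier_successors),
                               barrier_successors, &barrier);
  iree_task_set_completion_task(&call_a.header, &barrier.header);  // fan-in
  iree_task_set_completion_task(&call_b.header, &barrier.header);  // fan-in

  // Step 1b: only the initially-ready roots go into the submission.
  iree_task_submission_t submission;
  iree_task_submission_initialize(&submission);
  iree_task_submission_enqueue(&submission, &call_a.header);
  iree_task_submission_enqueue(&submission, &call_b.header);

  // Steps 2-3: hand off to the executor and kick coordination.
  IREE_RETURN_IF_ERROR(iree_task_executor_submit(executor, &submission));
  IREE_RETURN_IF_ERROR(iree_task_executor_flush(executor));

  // The tasks above live on this stack frame, so block until the scope drains
  // before returning (assumed scope helper).
  return iree_task_scope_wait_idle(scope, IREE_TIME_INFINITE_FUTURE);
}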
+//==============================================================================
+// Scaling Down
+//==============================================================================
+//
+// IREE is built at all levels - and both in the compiler and runtime - to scale
+// to different needs. Everything that IREE imposes on the runtime performance
+// and binary size is a spectrum of choices made that allows a user to only pay
+// for what they use.
+//
+// If a deployment scenario does not need complex multithreading and
+// out-of-order execution then this task system can be used in single-threaded
+// mode to at least allow for offloading from the main application thread. In
+// even more constrained scenarios (or embeddings within other systems that have
+// thread pools of their own) it can be used in zero-threaded mode with only
+// donated threads from the user performing work when the user wants it to
+// happen within its control. It still gives the benefits of wave-style
+// scheduling, multi-waiting, locality-aware work distribution, etc as well as
+// giving us a single target interface from the compiler to communicate
+// fine-grained dependency information to the runtime.
+//
+// If the cost of a few KB of data structures and some cheap uncontended atomic
+// linked list concatenations is still scary (it shouldn't be for 95% of uses)
+// then it's also possible to have a HAL driver that doesn't use this task
+// system at all and instead just executes the command buffers directly just
+// like our Vulkan/Metal/etc GPU backends do. Even though I don't recommend that
+// (one wouldn't be saving as much as they think and be losing a lot instead)
+// the layering holds and it can be useful if there's an existing external
+// sophisticated task execution system (ala taskflow) that is already present
+// in an application.
+//
+// One assertion of IREE is that for models that take more than milliseconds to
+// execute, asynchronous scheduling is almost always worth it even on
+// systems with single cores. The ability to cooperatively schedule model
+// execution allows applications significant control over their total program
+// scheduling behavior; just as on a Commodore 64 you'd have to interrupt work
+// on vsync to begin scanning out pixels to the screen and then resume afterward
+// it's rare to see any system even scaling down to double-digit MHz
+// microcontrollers that doesn't benefit from the ability to cleanly suspend and
+// resume execution.
+//
+// But even if *all* of that is too much, the compile-time representations in
+// the HAL IR are designed to be lowered away: execution modeling does not need
+// to bottom out on a hal.command_buffer.dispatch that maps 1:1 with the runtime
+// iree_hal_command_buffer_dispatch call: dispatch can be lowered into LLVM
+// IR calls and finally into native code to do precisely what you want. The HAL
+// at runtime is a useful abstraction to allow for switching your target
+// execution system (statically or dynamically across deployments) and to share
+// the same execution system across multiple models that may be executing
+// simultaneously but it is _not_ a requirement that the IREE HAL runtime
+// implementation is used. It's called multi-level IR for a reason and the HAL
+// IR is just one level that may have many more below it.
+//
+// So yeah: don't worry. It's almost certain that the thing making or breaking
+// the performance of models over 1ms of execution time is not the HAL, and that
+// in models at or above that scale the benefits we get from being able to
+// holistically schedule the work far outstrip any specialization that can be
+// done by hand. That's to say: only worry about this if your model is literally
+// 4 floats coming from an IMU and a few hundred scalar instructions to predict
+// whether the user is walking, and that shouldn't be using the runtime HAL at
+// all and really likely doesn't benefit from using IREE at any scale - just go
+// straight to LLVM IR from the source.
+//
+//==============================================================================
+// Scaling Up
+//==============================================================================
+//
+// The task system has an implicit limit of 64 workers. This intentional
+// limitation simplifies several parts of the code while also preventing misuse:
+// it rarely (if ever) makes sense to have more than 64 compute-dominated
+// threads working on a single problem. Achieving high performance in such
+// situations requires extremely careful control over the OS scheduler, memory
+// bandwidth consumption, and synchronization. It's always possible to make the
+// problem more compute-bound or very carefully try to fit in specific cache
+// sizes to avoid more constrained bandwidth paths but it's a non-portable
+// whack-a-mole style solution that is in conflict with a lot of what IREE seeks
+// to do with respect to low-latency and multi-tenant workloads.
+//
+// If more than 64 unique L1/L2 caches (or realistically more than probably ~32)
+// are available *and* all of them are attached to the same memory controllers
+// (no NUMA involved) then the solution is straightforward: use multiple IREE
+// task executors. Either within a process or in separate processes the
+// granularity is coarse enough to not be a burden and changes the problem from
+// needing 100% perfect work scaling of a single task to needing a naive
+// distributed workload solution at the algorithm level.
+//
+// Many useful effects also fall out of solving the work distribution problem.
+// Even for single-tenant workloads being able to split work between two
+// executors allows for natural mappings on NUMA systems or completely
+// independent machines. When supporting multi-tenant workloads (even if the
+// same program is acting as multiple tenants in a minibatched-style algorithm)
+// the improvements of isolation both in memory access patterns and in variance
+// from potentially bad system behavior dramatically improve: there aren't many
+// opportunities for contention in this system but one can guarantee zero
+// contention by simply not sharing the resources!
+
+// A bitfield specifying the scheduling mode used for configuring how (or if)
+// work is balanced across queues.
+enum iree_task_scheduling_mode_e {
+ // TODO(benvanik): batch, round-robin, FCFS, SJF, etc.
+ // We can also allow for custom scheduling, though I'm skeptical of the value
+ // of that. We should look into what GPUs do in hardware for balancing things
+ // (if anything this sophisticated at all). The potential benefit here is that
+ // we can optimize for offline workloads by allowing each queue to be drained
+ // until blocking - hopefully optimizing cache coherency and reducing the
+ // total memory high-water mark - or optimize for latency across all queues by
+ // taking tasks from all queues equally. There are other more interesting
+ // scheduling strategies such as preferring the widest tasks available from
+ // any queue such that we are keeping as many workers active as possible to
+ // reach peak utilization or artificially limiting which tasks we allow
+ // through to keep certain CPU cores asleep unless absolutely required.
+ IREE_TASK_SCHEDULING_MODE_RESERVED = 0u,
+
+  // Creates all workers suspended and resumes them only once work is first
+  // scheduled to them. This avoids the blocking startup cost of waking all
+  // threads during executor creation in exchange for potential added latency
+  // later on as threads take longer to wake on their first use.
+  //
+  // Prefer this setting in systems where startup time is the priority and work
+  // may not be scheduled for a while (or may be scheduled unevenly to start);
+  // otherwise executor creation takes longer and a thundering herd occurs,
+  // forcing context switches even if no work is needed.
+  //
+  // Avoid in systems where the latency from initial submission to worker
+  // execution is critical: starting the workers eagerly ensures all worker
+  // threads are already waiting on their respective wake notifications, the
+  // kernel can then schedule all worker quantums much faster, and in many
+  // cases all workers will begin processing simultaneously immediately after
+  // the submission is made.
+ IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP = 1u << 0,
+
+ // TODO(#4027): implement IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD.
+ // Creates a dedicated thread performing waits on root wait handles.
+ // On workloads with many short-duration waits this will reduce total latency
+ // as the waits are aggressively processed and dependent tasks are scheduled.
+ // It also keeps any wait-related syscalls off the worker threads that would
+ // otherwise need to perform the syscalls during coordination.
+ IREE_TASK_SCHEDULING_MODE_DEDICATED_WAIT_THREAD = 1u << 1,
+};
+typedef uint32_t iree_task_scheduling_mode_t;
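+
+// Example (illustrative sketch; status handling elided): scheduling modes are
+// bit flags and may be OR'd together when creating an executor, e.g. deferring
+// worker startup to keep executor creation cheap:
+//
+//   iree_task_executor_t* executor = NULL;
+//   iree_task_executor_create(IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP,
+//                             topology, allocator, &executor);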
+
+// Base task system executor interface.
+typedef struct iree_task_executor_s iree_task_executor_t;
+
+// Creates a task executor using the specified topology.
+// |topology| is only used during creation and need not live beyond this call.
+// |out_executor| must be released by the caller.
+iree_status_t iree_task_executor_create(
+ iree_task_scheduling_mode_t scheduling_mode,
+ const iree_task_topology_t* topology, iree_allocator_t allocator,
+ iree_task_executor_t** out_executor);
+
+// Retains the given |executor| for the caller.
+void iree_task_executor_retain(iree_task_executor_t* executor);
+
+// Releases the given |executor| from the caller.
+void iree_task_executor_release(iree_task_executor_t* executor);
+
+// TODO(benvanik): scheduling mode mutation, compute quota control, etc.
+
+// Submits a batch of tasks for execution.
+// The submission represents a DAG of tasks all reachable from the initial
+// submission lists.
+//
+// Ownership of the tasks remains with the caller for the lifetime of the
+// submission unless tasks have a custom pool specified that they can be
+// returned to.
+//
+// Safe to call from any thread. Wait-free but may block for a small duration
+// during initial scheduling of the submitted tasks.
+//
+// NOTE: it's possible for all work in the submission to complete prior to this
+// function returning.
+iree_status_t iree_task_executor_submit(iree_task_executor_t* executor,
+ iree_task_submission_t* submission);
+
+// Flushes any pending task batches for execution.
+//
+// Safe to call from any thread. Wait-free but may block for a small duration
+// during initial scheduling of the submitted tasks.
+//
+// NOTE: due to races it's possible for new work to arrive from other threads
+// after the flush has occurred but prior to this call returning.
+iree_status_t iree_task_executor_flush(iree_task_executor_t* executor);
+
+// Donates the calling thread to the executor until either |wait_handle|
+// resolves or |deadline_ns| is exceeded.
+//
+// If there are no tasks available then the calling thread will block as if
+// iree_wait_one had been used on |wait_handle|. If tasks are ready then the
+// caller will not block prior to starting to perform work on behalf of the
+// executor.
+//
+// Donation is intended as an optimization to elide context switches when the
+// caller would have waited anyway; now instead of performing a kernel wait and
+// most certainly incurring a context switch the caller immediately begins
+// taking work from the queue - likely even prior to any of the executor workers
+// waking (assuming they were idle).
+//
+// Note that donation may not always be strictly a win: the caller may have an
+// arbitrary thread affinity that may cause oversubscription of resources within
+// the topology. This can cause additional contention for compute resources and
+// increase kernel scheduling overhead as threads are swapped or migrated.
+// Measure, measure, measure! If there is any IO that can be performed during
+// the time the caller would otherwise donate itself to the executor then that
+// should always be preferred, as should smaller computation (again, to not
+// oversubscribe resources). Treat donation as a hail mary to prevent a kernel
+// wait and not something that will magically make things execute faster.
+// Especially in large applications it's almost certainly better to do something
+// useful with the calling thread (even if that's go to sleep).
+//
+// Safe to call from any thread (though bad to reentrantly call from workers).
+iree_status_t iree_task_executor_donate_caller(iree_task_executor_t* executor,
+ iree_wait_handle_t* wait_handle,
+ iree_time_t deadline_ns);
+
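+// Example of the overall flow (an illustrative sketch mirroring executor_test;
+// status handling is elided and the topology/scope/call/submission helpers are
+// declared in their respective iree/task/ headers):
+//
+//   iree_task_topology_t* topology = NULL;
+//   iree_task_topology_from_physical_cores(/*max_core_count=*/6, allocator,
+//                                          &topology);
+//   iree_task_executor_t* executor = NULL;
+//   iree_task_executor_create(IREE_TASK_SCHEDULING_MODE_RESERVED, topology,
+//                             allocator, &executor);
+//   iree_task_topology_free(topology);
+//
+//   iree_task_scope_t scope;
+//   iree_task_scope_initialize(iree_make_cstring_view("scope"), &scope);
+//   iree_task_call_t call;  // closure built via iree_task_make_closure.
+//   iree_task_call_initialize(&scope, closure, &call);
+//
+//   iree_task_submission_t submission;
+//   iree_task_submission_initialize(&submission);
+//   iree_task_submission_enqueue(&submission, &call.header);
+//   iree_task_executor_submit(executor, &submission);
+//   iree_task_executor_flush(executor);
+//   iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE);
+//
+//   iree_task_scope_deinitialize(&scope);
+//   iree_task_executor_release(executor);
+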
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_EXECUTOR_H_
diff --git a/iree/task/executor_impl.h b/iree/task/executor_impl.h
new file mode 100644
index 0000000..dbb17c5
--- /dev/null
+++ b/iree/task/executor_impl.h
@@ -0,0 +1,146 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_EXECUTOR_IMPL_H_
+#define IREE_TASK_EXECUTOR_IMPL_H_
+
+#include "iree/base/math.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/tracing.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/queue.h"
+#include "iree/task/tuning.h"
+#include "iree/task/worker.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct iree_task_executor_s {
+ iree_atomic_ref_count_t ref_count;
+ iree_allocator_t allocator;
+
+ // Defines how work is selected across queues.
+ // TODO(benvanik): make mutable; currently always the same reserved value.
+ iree_task_scheduling_mode_t scheduling_mode;
+
+ // State used by the work-stealing operations performed by donated threads.
+  // This is **NOT SYNCHRONIZED** and relies on the fact that we don't care
+  // about the precise selection of workers enough to mind any tears
+ // we get in the PRNG state that lives inside. Cache write-back order and
+ // incidental cache line availability/visibility update frequency is like an
+ // extra layer of PRNG anyway ;)
+ iree_prng_minilcg128_state_t donation_theft_prng;
+
+ // Pools of transient dispatch tasks shared across all workers.
+ // Depending on configuration the task pool may allocate after creation using
+ // the allocator provided upon executor creation.
+ iree_task_pool_t slice_task_pool;
+ iree_task_pool_t shard_task_pool;
+
+ // A list of incoming tasks that are ready to execute immediately.
+ // The list is LIFO and we require that task lists are reversed by the
+ // submitter so we can use iree_atomic_slist_concat to quickly prepend the
+ // LIFO list to the atomic slist. By doing this we can construct the task
+ // lists in LIFO order prior to submission, concat with a pointer swap into
+ // this list, flush from the list in LIFO order during coordination, and do a
+ // single LIFO->FIFO conversion while distributing work. What could have been
+ // half a dozen task list pointer walks and inverted sequential memory access
+ // becomes one.
+ //
+ // Example:
+ // existing tasks: C B A
+ // new tasks: 1 2 3
+ // updated tasks: 3 2 1 C B A
+ iree_atomic_task_slist_t incoming_ready_slist;
+ // A list of incoming wait tasks that need to be waited on. Order doesn't
+ // really matter here as all tasks will be waited on simultaneously.
+ iree_atomic_task_slist_t incoming_waiting_slist;
+
+ // Guards coordination logic; only one thread at a time may be acting as the
+ // coordinator.
+ iree_slim_mutex_t coordinator_mutex;
+ // A list of wait tasks with external handles that need to be waited on.
+ // Coordinators can choose to poll/wait on these.
+ iree_task_list_t waiting_list;
+ // Guards manipulation and use of the wait_set.
+ // coordinator_mutex may be held when taking this lock.
+ iree_slim_mutex_t wait_mutex;
+ // Wait set containing all the tasks in waiting_list. Coordinator manages
+ // keeping the waiting_list and wait_set in sync.
+ iree_wait_set_t* wait_set;
+
+ // A bitset indicating which workers are live and usable; all attempts to
+ // push work onto a particular worker should check first with this mask. This
+  // may change over time either automatically or by user request ("don't use
+  // these cores for a while, I'm going to be using them" etc).
+ iree_atomic_task_affinity_set_t worker_live_mask;
+
+ // A bitset indicating which workers may be suspended and need to be resumed
+ // via iree_thread_resume prior to them being able to execute work.
+ iree_atomic_task_affinity_set_t worker_suspend_mask;
+
+ // A bitset indicating which workers are currently idle. Used to bias incoming
+ // tasks to workers that aren't doing much else. This is a balance of latency
+ // to wake the idle workers vs. latency to wait for existing work to complete
+ // on already woken workers.
+ iree_atomic_task_affinity_set_t worker_idle_mask;
+
+  // Specifies how many worker threads there are.
+  // For now this number is fixed per executor; if we wanted to enable live
+  // join/leave behavior we could change this to a registration mechanism.
+ iree_host_size_t worker_count;
+ iree_task_worker_t* workers; // [worker_count]
+};
+
+// Merges a submission into the primary FIFO queues.
+// Coordinators will fetch items from here as workers demand them but otherwise
+// not be notified of the changes (waiting until coordination runs again).
+//
+// May be called from any thread.
+void iree_task_executor_merge_submission(iree_task_executor_t* executor,
+ iree_task_submission_t* submission);
+
+// Schedules all ready tasks in the |pending_submission| list.
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_executor_schedule_ready_tasks(
+ iree_task_executor_t* executor, iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch);
+
+// Dispatches tasks in the global submission queue to workers.
+// |current_worker| will be NULL if called from a non-worker thread and
+// otherwise be the current worker; used to avoid round-tripping through the
+// whole system to post to oneself.
+void iree_task_executor_coordinate(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ bool speculative);
+
+// Tries to steal an entire task from a sibling worker (based on topology).
+// Returns a task that is available (has not yet begun processing at all).
+// May steal multiple tasks and add them to the |local_task_queue|.
+iree_task_t* iree_task_executor_try_steal_task(
+ iree_task_executor_t* executor,
+ iree_task_affinity_set_t constructive_sharing_mask,
+ uint32_t max_theft_attempts, iree_prng_minilcg128_state_t* theft_prng,
+ iree_task_queue_t* local_task_queue);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_EXECUTOR_IMPL_H_
diff --git a/iree/task/executor_test.cc b/iree/task/executor_test.cc
new file mode 100644
index 0000000..6641edb
--- /dev/null
+++ b/iree/task/executor_test.cc
@@ -0,0 +1,181 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/executor.h"
+
+#include <thread>
+
+#include "iree/base/math.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+static thread_local volatile uint64_t xxx = 0;
+
+static void simulate_work(iree_task_tile_context_t* tile_context) {
+ iree_prng_splitmix64_state_t state;
+ iree_prng_splitmix64_initialize(xxx, &state);
+ bool slow = false; // tile_context->workgroup_xyz[0] % 3 == 1;
+ if (tile_context->workgroup_xyz[0] == 128 ||
+ tile_context->workgroup_xyz[0] == 1023) {
+ // Introduce big variance to highlight work stealing.
+ // std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ for (int i = 0; i < 256 * 1024; ++i) {
+ uint64_t value = iree_prng_splitmix64_next(&state);
+ xxx += value;
+ if (slow) {
+ for (int j = 0; j < 4; ++j) {
+ value = iree_prng_splitmix64_next(&state);
+ xxx += value;
+ }
+ }
+ }
+}
+
+TEST(ExecutorTest, Any) {
+ IREE_TRACE_SCOPE0("ExecutorTest::Any");
+
+ iree_allocator_t allocator = iree_allocator_system();
+
+ iree_task_topology_t* topology = NULL;
+#if 1
+ IREE_CHECK_OK(iree_task_topology_from_physical_cores(
+ /*max_core_count=*/6, allocator, &topology));
+#elif 0
+ IREE_CHECK_OK(iree_task_topology_from_unique_l2_cache_groups(
+ /*max_group_count=*/6, allocator, &topology));
+#else
+ IREE_CHECK_OK(iree_task_topology_from_group_count(/*group_count=*/6,
+ allocator, &topology));
+#endif
+ iree_task_executor_t* executor = NULL;
+ iree_task_scheduling_mode_t scheduling_mode =
+ IREE_TASK_SCHEDULING_MODE_RESERVED;
+ IREE_CHECK_OK(iree_task_executor_create(scheduling_mode, topology, allocator,
+ &executor));
+ iree_task_topology_free(topology);
+
+ //
+ iree_task_scope_t scope_a;
+ iree_task_scope_initialize(iree_make_cstring_view("a"), &scope_a);
+
+ //
+ iree_task_call_t call0;
+ iree_task_call_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("call0");
+ EXPECT_EQ(0, user_context);
+ return iree_ok_status();
+ },
+ 0),
+ &call0);
+
+ const uint32_t workgroup_size_0[3] = {256, 1, 1};
+ const uint32_t workgroup_count_0[3] = {32, 4, 2};
+ iree_task_dispatch_t dispatch0;
+ iree_task_dispatch_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("tile0");
+ iree_task_tile_context_t* tile_context =
+ (iree_task_tile_context_t*)task_context;
+ EXPECT_EQ(0, user_context);
+ simulate_work(tile_context);
+ iree_atomic_fetch_add_int32(&tile_context->statistics->reserved, 1,
+ iree_memory_order_relaxed);
+ return iree_ok_status();
+ },
+ 0),
+ workgroup_size_0, workgroup_count_0, &dispatch0);
+ // dispatch0.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
+
+ const uint32_t workgroup_size_1[3] = {128, 1, 1};
+ const uint32_t workgroup_count_1[3] = {16, 2, 1};
+ iree_task_dispatch_t dispatch1;
+ iree_task_dispatch_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("tile1");
+ iree_task_tile_context_t* tile_context =
+ (iree_task_tile_context_t*)task_context;
+ EXPECT_EQ(0, user_context);
+ simulate_work(tile_context);
+ iree_atomic_fetch_add_int32(&tile_context->statistics->reserved, 1,
+ iree_memory_order_relaxed);
+ return iree_ok_status();
+ },
+ 0),
+ workgroup_size_1, workgroup_count_1, &dispatch1);
+ // dispatch1.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
+
+ //
+ iree_task_call_t call1;
+ iree_task_call_initialize(
+ &scope_a,
+ iree_task_make_closure(
+ [](uintptr_t user_context, uintptr_t task_context) {
+ IREE_TRACE_SCOPE0("call1");
+ EXPECT_EQ(1, user_context);
+ return iree_ok_status();
+ },
+ 1),
+ &call1);
+
+#if 1
+ // no barrier between dispatches; fanout
+ iree_task_t* barrier0_tasks[2] = {&dispatch0.header, &dispatch1.header};
+ iree_task_barrier_t barrier0;
+ iree_task_barrier_initialize(&scope_a, IREE_ARRAYSIZE(barrier0_tasks),
+ barrier0_tasks, &barrier0);
+ iree_task_set_completion_task(&call0.header, &barrier0.header);
+ iree_task_set_completion_task(&dispatch0.header, &call1.header);
+ iree_task_set_completion_task(&dispatch1.header, &call1.header);
+#else
+ // barrier between dispatches
+ iree_task_set_completion_task(&call0.header, &dispatch0.header);
+ iree_task_set_completion_task(&dispatch0.header, &dispatch1.header);
+ iree_task_set_completion_task(&dispatch1.header, &call1.header);
+#endif
+
+ // fence
+ iree_task_fence_t fence0;
+ iree_task_fence_initialize(&scope_a, &fence0);
+ iree_task_set_completion_task(&call1.header, &fence0.header);
+
+ //
+ iree_task_submission_t sub0;
+ iree_task_submission_initialize(&sub0);
+ iree_task_submission_enqueue(&sub0, &call0.header);
+ IREE_CHECK_OK(iree_task_executor_submit(executor, &sub0));
+
+ //
+ // iree_task_submission_t sub1;
+ // iree_task_submission_initialize(&sub1);
+ // IREE_CHECK_OK(iree_task_executor_submit(executor, &sub1));
+
+ IREE_CHECK_OK(iree_task_executor_flush(executor));
+
+ IREE_CHECK_OK(iree_task_scope_wait_idle(&scope_a, IREE_TIME_INFINITE_FUTURE));
+
+ iree_task_scope_deinitialize(&scope_a);
+ iree_task_executor_release(executor);
+}
+
+} // namespace
diff --git a/iree/task/list.c b/iree/task/list.c
new file mode 100644
index 0000000..6e70000
--- /dev/null
+++ b/iree/task/list.c
@@ -0,0 +1,227 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/list.h"
+
+void iree_atomic_task_slist_discard(iree_atomic_task_slist_t* slist) {
+ iree_task_list_t discard_list;
+ iree_task_list_initialize(&discard_list);
+
+ // Flush the entire slist and walk it discarding the tasks as we go. This
+ // avoids the need to do more than one walk if we were simply trying to flush
+ // and discard into an iree_task_list_t.
+ //
+ // Note that we may accumulate additional tasks that need to be discarded;
+ // that's when we use the iree_task_list_t discard logic which works because
+ // we have a head/tail and are no longer in atomic land.
+ iree_task_t* task_head = NULL;
+ iree_atomic_task_slist_flush(
+ slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &task_head, NULL);
+ while (task_head != NULL) {
+ iree_task_t* next_task = task_head->next_task;
+ iree_task_discard(task_head, &discard_list);
+ task_head = next_task;
+ }
+
+ iree_task_list_discard(&discard_list);
+}
+
+void iree_task_list_initialize(iree_task_list_t* out_list) {
+ memset(out_list, 0, sizeof(*out_list));
+}
+
+void iree_task_list_move(iree_task_list_t* list, iree_task_list_t* out_list) {
+ memcpy(out_list, list, sizeof(*out_list));
+ memset(list, 0, sizeof(*list));
+}
+
+void iree_task_list_discard(iree_task_list_t* list) {
+ // Fixed point iteration over the task list and all its transitive dependent
+ // tasks that get discarded. This is in contrast to a recursive discard that
+ // could potentially be thousands of calls deep in a large graph.
+ while (!iree_task_list_is_empty(list)) {
+ iree_task_t* task = iree_task_list_pop_front(list);
+ iree_task_discard(task, list);
+ }
+}
+
+bool iree_task_list_is_empty(const iree_task_list_t* list) {
+ return list->head == NULL;
+}
+
+iree_host_size_t iree_task_list_calculate_size(const iree_task_list_t* list) {
+ iree_host_size_t count = 0;
+ iree_task_t* p = list->head;
+ while (p) {
+ ++count;
+ p = p->next_task;
+ }
+ return count;
+}
+
+iree_task_t* iree_task_list_front(iree_task_list_t* list) { return list->head; }
+
+iree_task_t* iree_task_list_back(iree_task_list_t* list) { return list->tail; }
+
+void iree_task_list_push_back(iree_task_list_t* list, iree_task_t* task) {
+ if (!list->head) {
+ list->head = task;
+ }
+ if (list->tail) {
+ list->tail->next_task = task;
+ }
+ list->tail = task;
+ task->next_task = NULL;
+}
+
+void iree_task_list_push_front(iree_task_list_t* list, iree_task_t* task) {
+ task->next_task = list->head;
+ list->head = task;
+ if (!list->tail) {
+ list->tail = task;
+ }
+}
+
+iree_task_t* iree_task_list_pop_front(iree_task_list_t* list) {
+ if (!list->head) return NULL;
+ iree_task_t* task = list->head;
+ list->head = task->next_task;
+ if (list->tail == task) {
+ list->tail = NULL;
+ }
+ task->next_task = NULL;
+ return task;
+}
+
+void iree_task_list_erase(iree_task_list_t* list, iree_task_t* prev_task,
+ iree_task_t* task) {
+  if (task == list->head) {
+    // Removing head (and also the tail if this was the only task).
+    list->head = task->next_task;
+    if (list->tail == task) list->tail = NULL;
+ } else if (task == list->tail) {
+ // Removing tail.
+ list->tail = prev_task;
+ prev_task->next_task = NULL;
+ } else {
+ // Removing inner.
+ prev_task->next_task = task->next_task;
+ }
+ task->next_task = NULL;
+}
+
+void iree_task_list_prepend(iree_task_list_t* list, iree_task_list_t* prefix) {
+ if (iree_task_list_is_empty(prefix)) return;
+ if (iree_task_list_is_empty(list)) {
+ list->head = prefix->head;
+ list->tail = prefix->tail;
+ } else {
+ prefix->tail->next_task = list->head;
+ list->head = prefix->head;
+ }
+ memset(prefix, 0, sizeof(*prefix));
+}
+
+void iree_task_list_append(iree_task_list_t* list, iree_task_list_t* suffix) {
+ if (iree_task_list_is_empty(suffix)) return;
+ if (iree_task_list_is_empty(list)) {
+ list->head = suffix->head;
+ list->tail = suffix->tail;
+ } else {
+ list->tail->next_task = suffix->head;
+ list->tail = suffix->tail;
+ }
+ memset(suffix, 0, sizeof(*suffix));
+}
+
+void iree_task_list_append_from_fifo_slist(iree_task_list_t* list,
+ iree_atomic_task_slist_t* slist) {
+ iree_task_list_t suffix;
+ iree_task_list_initialize(&suffix);
+ if (!iree_atomic_task_slist_flush(
+ slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO, &suffix.head,
+ &suffix.tail)) {
+ return; // empty
+ }
+ iree_task_list_append(list, &suffix);
+}
+
+void iree_task_list_reverse(iree_task_list_t* list) {
+ if (iree_task_list_is_empty(list)) return;
+ iree_task_t* tail = list->head;
+ iree_task_t* head = list->tail;
+ iree_task_t* p = list->head;
+ do {
+ iree_task_t* next = p->next_task;
+ p->next_task = head;
+ head = p;
+ p = next;
+ } while (p != NULL);
+ tail->next_task = NULL;
+ list->head = head;
+ list->tail = tail;
+}
+
+void iree_task_list_split(iree_task_list_t* head_list,
+ iree_host_size_t max_tasks,
+ iree_task_list_t* out_tail_list) {
+ iree_task_list_initialize(out_tail_list);
+ if (head_list->head == NULL) return;
+ if (head_list->head == head_list->tail) {
+ // 1 task in the source list; always prefer to steal it.
+ // This is because the victim is likely working on their last item and we
+ // can help them out by popping this off. It also has the side-effect of
+ // handling cases of donated workers wanting to steal all tasks to
+ // synchronously execute things.
+ iree_task_list_move(head_list, out_tail_list);
+ return;
+ }
+
+ // Walk through the |head_list| with two iterators; one at double-rate.
+ // If we ever notice this function showing up in profiling then we should
+ // build an acceleration structure to avoid the full walk of the first half
+ // (e.g. skip list).
+ iree_task_t* p_x1_m1 = head_list->head; // p_x1 - 1 (previous to p_x1)
+ iree_task_t* p_x1 = head_list->head; // x1 speed ptr
+ iree_task_t* p_x2 = head_list->head; // x2 speed ptr
+ while (p_x2->next_task != NULL) {
+ p_x1_m1 = p_x1;
+ p_x1 = p_x1->next_task;
+ p_x2 = p_x2->next_task;
+ if (p_x2->next_task) p_x2 = p_x2->next_task;
+ }
+
+  // p_x1 now points at the halfway point in the head_list. This is where we
+  // *start* our windowed walk for pulling out max_tasks, implicitly limiting
+  // us to take at most half of the tasks from the list.
+
+  // Slide a window of up to max_tasks tasks toward the end of the list; when
+  // the window tail hits the end we have the head and tail of the list to
+  // split off.
+ iree_task_t* p_window_prev = p_x1_m1;
+ iree_task_t* p_window_head = p_x1;
+ iree_task_t* p_window_tail = p_x1;
+ while (p_window_tail->next_task != NULL && --max_tasks > 0) {
+ p_window_tail = p_window_tail->next_task;
+ }
+ while (p_window_tail->next_task != NULL) {
+ p_window_prev = p_window_head;
+ p_window_head = p_window_head->next_task;
+ p_window_tail = p_window_tail->next_task;
+ }
+
+ head_list->tail = p_window_prev;
+ p_window_prev->next_task = NULL;
+
+ out_tail_list->head = p_window_head;
+ out_tail_list->tail = p_window_tail;
+}
diff --git a/iree/task/list.h b/iree/task/list.h
new file mode 100644
index 0000000..a52c2e6
--- /dev/null
+++ b/iree/task/list.h
@@ -0,0 +1,113 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_LIST_H_
+#define IREE_TASK_LIST_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomic_slist.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// iree_atomic_task_slist_t, an atomic approximately LIFO singly-linked list.
+// iree_task_list_t should be preferred when working with
+// uncontended/thread-local lists as it has no overhead, while the
+// iree_atomic_task_slist_t should be used when multiple threads may need to
+// share lists of tasks (free lists, mailboxes, etc).
+IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_atomic_task, iree_task_t,
+ offsetof(iree_task_t, next_task));
+
+// Discards a task list; should be used for failure cleanup during list
+// construction to ensure intrusive pointers are reset.
+void iree_atomic_task_slist_discard(iree_atomic_task_slist_t* slist);
+
+// A singly-linked list of tasks using the embedded task next_task pointer.
+//
+// Thread-compatible; designed to be used from a single thread manipulating a
+// list for passing to an API that accepts lists.
+typedef struct iree_task_list_s {
+ iree_task_t* head;
+ iree_task_t* tail;
+} iree_task_list_t;
+
+// Initializes an empty task list.
+void iree_task_list_initialize(iree_task_list_t* out_list);
+
+// Moves |list| into |out_list|, leaving |list| empty.
+void iree_task_list_move(iree_task_list_t* list, iree_task_list_t* out_list);
+
+// Discards a task list; should be used for failure cleanup during list
+// construction to ensure intrusive pointers are reset.
+void iree_task_list_discard(iree_task_list_t* list);
+
+// Returns true if the list is empty.
+bool iree_task_list_is_empty(const iree_task_list_t* list);
+
+// Counts the total number of tasks in the list.
+// WARNING: this requires an O(n) walk of the entire list; use this only for
+// debugging or when the list is known to be small and hot in cache.
+iree_host_size_t iree_task_list_calculate_size(const iree_task_list_t* list);
+
+// Returns the first task in the list, if any.
+iree_task_t* iree_task_list_front(iree_task_list_t* list);
+
+// Returns the last task in the list, if any.
+iree_task_t* iree_task_list_back(iree_task_list_t* list);
+
+// Pushes a task onto the back of the task list. The task list takes ownership
+// of |task|.
+void iree_task_list_push_back(iree_task_list_t* list, iree_task_t* task);
+
+// Pushes a task onto the front of the task list. The task list takes ownership
+// of |task|.
+void iree_task_list_push_front(iree_task_list_t* list, iree_task_t* task);
+
+// Pops a task from the front of the task list or returns NULL if the list is
+// empty. Caller takes ownership of the returned task.
+iree_task_t* iree_task_list_pop_front(iree_task_list_t* list);
+
+// Erases |task| from the list.
+// |prev_task| must point to the task immediately prior to |task| in the list
+// or NULL if the task was at the head.
+void iree_task_list_erase(iree_task_list_t* list, iree_task_t* prev_task,
+ iree_task_t* task);
+
+// Prepends |prefix| onto the beginning of |list|. |prefix| will be reset.
+void iree_task_list_prepend(iree_task_list_t* list, iree_task_list_t* prefix);
+
+// Appends |suffix| onto the end of |list|. |suffix| will be reset.
+void iree_task_list_append(iree_task_list_t* list, iree_task_list_t* suffix);
+
+// Flushes the given |slist| and appends all tasks to the list in FIFO order.
+void iree_task_list_append_from_fifo_slist(iree_task_list_t* list,
+ iree_atomic_task_slist_t* slist);
+
+// Reverses the list in-place.
+// Requires a full O(n) traversal.
+void iree_task_list_reverse(iree_task_list_t* list);
+
+// Splits |head_list| roughly in half, moving at most |max_tasks| tasks from
+// the back of the list into |out_tail_list| and retaining the rest in
+// |head_list|.
+void iree_task_list_split(iree_task_list_t* head_list,
+ iree_host_size_t max_tasks,
+ iree_task_list_t* out_tail_list);
+
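+// Example (illustrative sketch): building a list and draining it in FIFO
+// order; where the tasks come from (a pool, an arena, etc) is up to the
+// caller:
+//
+//   iree_task_list_t list;
+//   iree_task_list_initialize(&list);
+//   iree_task_list_push_back(&list, task_a);
+//   iree_task_list_push_back(&list, task_b);
+//   while (!iree_task_list_is_empty(&list)) {
+//     iree_task_t* task = iree_task_list_pop_front(&list);
+//     // ... process task ...
+//   }
+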
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_LIST_H_
diff --git a/iree/task/list_test.cc b/iree/task/list_test.cc
new file mode 100644
index 0000000..2140e21
--- /dev/null
+++ b/iree/task/list_test.cc
@@ -0,0 +1,574 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/list.h"
+
+#include "iree/task/testing/test_util.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(TaskListTest, Empty) {
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+ EXPECT_EQ(0, iree_task_list_calculate_size(&list));
+ iree_task_list_discard(&list);
+}
+
+TEST(TaskListTest, CalculateSize) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+ EXPECT_EQ(0, iree_task_list_calculate_size(&list));
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ EXPECT_FALSE(iree_task_list_is_empty(&list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&list));
+
+ iree_task_list_push_back(&list, task1);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list));
+ iree_task_list_push_back(&list, task2);
+ EXPECT_EQ(3, iree_task_list_calculate_size(&list));
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+}
+
+TEST(TaskListTest, Move) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+ iree_task_list_push_back(&list_a, task0);
+ iree_task_list_push_back(&list_a, task1);
+ iree_task_list_push_back(&list_a, task2);
+ iree_task_list_push_back(&list_a, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+
+ iree_task_list_move(&list_a, &list_b);
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_b));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_b));
+}
+
+TEST(TaskListTest, PushFront) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_front(&list, task0);
+ iree_task_list_push_front(&list, task1);
+ iree_task_list_push_front(&list, task2);
+ iree_task_list_push_front(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+
+ EXPECT_EQ(3, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(2, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(1, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(0, iree_task_list_pop_front(&list)->flags);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, PopFront) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ iree_task_list_push_back(&list, task2);
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+
+ EXPECT_EQ(0, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(1, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(2, iree_task_list_pop_front(&list)->flags);
+ EXPECT_EQ(3, iree_task_list_pop_front(&list)->flags);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, Erase) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ iree_task_list_push_back(&list, task2);
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+
+ // Remove head.
+ iree_task_list_erase(&list, NULL, task0);
+ EXPECT_EQ(3, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ EXPECT_EQ(task1, iree_task_list_front(&list));
+
+ // Remove tail.
+ iree_task_list_erase(&list, task2, task3);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ EXPECT_EQ(task2, iree_task_list_back(&list));
+
+ // Remove the rest.
+ iree_task_list_erase(&list, task1, task2);
+ EXPECT_EQ(1, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ EXPECT_EQ(task1, iree_task_list_front(&list));
+ EXPECT_EQ(task1, iree_task_list_back(&list));
+
+ iree_task_list_erase(&list, NULL, task1);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, PrependEmpty) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_a, task0);
+ iree_task_list_push_back(&list_a, task1);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+ iree_task_list_prepend(&list_a, &list_b);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+}
+
+TEST(TaskListTest, PrependIntoEmpty) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+ iree_task_list_push_back(&list_b, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_b));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_b));
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ iree_task_list_prepend(&list_a, &list_b);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, PrependInto1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+
+ iree_task_list_push_back(&list_a, task3);
+ iree_task_list_prepend(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, PrependInto2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_a, task2);
+ iree_task_list_push_back(&list_a, task3);
+ iree_task_list_prepend(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, AppendIntoEmpty) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task0);
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+ iree_task_list_push_back(&list_b, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_b));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_b));
+
+ EXPECT_TRUE(iree_task_list_is_empty(&list_a));
+ iree_task_list_append(&list_a, &list_b);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, AppendInto1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task1);
+ iree_task_list_push_back(&list_b, task2);
+
+ iree_task_list_push_back(&list_b, task3);
+ iree_task_list_push_back(&list_a, task0);
+
+ iree_task_list_append(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, AppendInto2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list_a, list_b;
+ iree_task_list_initialize(&list_a);
+ iree_task_list_initialize(&list_b);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list_b, task2);
+ iree_task_list_push_back(&list_b, task3);
+
+ iree_task_list_push_back(&list_a, task0);
+ iree_task_list_push_back(&list_a, task1);
+
+ iree_task_list_append(&list_a, &list_b);
+
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list_a));
+ EXPECT_TRUE(CheckListOrderFIFO(&list_a));
+ EXPECT_TRUE(iree_task_list_is_empty(&list_b));
+}
+
+TEST(TaskListTest, Reverse0) {
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(iree_task_list_is_empty(&list));
+}
+
+TEST(TaskListTest, Reverse1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+
+ iree_task_list_push_back(&list, task0);
+ EXPECT_EQ(1, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+}
+
+TEST(TaskListTest, Reverse2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ EXPECT_EQ(2, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+}
+
+TEST(TaskListTest, Reverse4) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t list;
+ iree_task_list_initialize(&list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&list, task0);
+ iree_task_list_push_back(&list, task1);
+ iree_task_list_push_back(&list, task2);
+ iree_task_list_push_back(&list, task3);
+ EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+ EXPECT_TRUE(CheckListOrderFIFO(&list));
+ iree_task_list_reverse(&list);
+ EXPECT_TRUE(CheckListOrderLIFO(&list));
+}
+
+TEST(TaskListTest, SplitEmpty) {
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&head_list));
+ EXPECT_TRUE(iree_task_list_is_empty(&tail_list));
+}
+
+TEST(TaskListTest, Split1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ iree_task_list_push_back(&head_list, task0);
+ EXPECT_EQ(1, iree_task_list_calculate_size(&head_list));
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_TRUE(iree_task_list_is_empty(&head_list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&tail_list));
+}
+
+TEST(TaskListTest, Split2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_EQ(1, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, Split3) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_EQ(1, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(2, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, Split4) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+ iree_task_list_push_back(&head_list, task3);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/64, &tail_list);
+
+ EXPECT_EQ(2, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(2, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, SplitMaxTasks1) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+ iree_task_list_push_back(&head_list, task3);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/1, &tail_list);
+
+ EXPECT_EQ(3, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(1, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+TEST(TaskListTest, SplitMaxTasks2) {
+ auto pool = AllocateNopPool();
+ auto scope = AllocateScope("a");
+
+ iree_task_list_t head_list;
+ iree_task_list_initialize(&head_list);
+
+ auto task0 = AcquireNopTask(pool, scope, 0);
+ auto task1 = AcquireNopTask(pool, scope, 1);
+ auto task2 = AcquireNopTask(pool, scope, 2);
+ auto task3 = AcquireNopTask(pool, scope, 3);
+
+ iree_task_list_push_back(&head_list, task0);
+ iree_task_list_push_back(&head_list, task1);
+ iree_task_list_push_back(&head_list, task2);
+ iree_task_list_push_back(&head_list, task3);
+
+ iree_task_list_t tail_list;
+ iree_task_list_split(&head_list, /*max_tasks=*/2, &tail_list);
+
+ EXPECT_EQ(2, iree_task_list_calculate_size(&head_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&head_list));
+ EXPECT_EQ(2, iree_task_list_calculate_size(&tail_list));
+ EXPECT_TRUE(CheckListOrderFIFO(&tail_list));
+}
+
+} // namespace
diff --git a/iree/task/pool.c b/iree/task/pool.c
new file mode 100644
index 0000000..172a2ce
--- /dev/null
+++ b/iree/task/pool.c
@@ -0,0 +1,294 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/pool.h"
+
+#include "iree/base/math.h"
+
+// Minimum byte size of a block in bytes, including the tasks as well as the
+// allocation header. This is here to allow us to reduce the number of times
+// we go to the allocator and amortize the overhead of our block header.
+#define IREE_TASK_POOL_MIN_BLOCK_SIZE (4 * 1024)
+
+// Alignment for block allocations; roughly a (likely) page size.
+// Since many allocators round allocations up once past the small byte range
+// (~thousands of bytes) this just prevents us from being 1 byte over the
+// allocator block size and wasting space in a larger bucket.
+#define IREE_TASK_POOL_BLOCK_ALIGNMENT (4 * 1024)
+
+// The minimum number of tasks that will be allocated when growth is needed.
+// The total number may be larger once rounded to meet block size and alignment
+// requirements. Note that we leave a bit of room here for the block header
+// such that we don't always allocate a nice round number + N bytes that then
+// bumps us into the next power of two bucket.
+#define IREE_TASK_POOL_MIN_GROWTH_CAPACITY (255)
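+
+// As a worked example (the numbers are hypothetical): with a 192-byte
+// task_size, a 64-byte aligned allocation header, and the default growth of
+// 255 tasks the requested size is 64 + 255 * 192 = 49024 bytes. Rounding up to
+// a power of two gives 65536 bytes (already 4KB-aligned and above the minimum
+// block size), which actually holds (65536 - 64) / 192 = 341 tasks - the extra
+// tasks beyond the requested 255 are simply added to the pool as well.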
+
+// Grows the task pool by at least |minimum_capacity| on top of its current
+// capacity. The actual number of tasks available may be rounded up to make the
+// allocated blocks more allocator-friendly sizes.
+//
+// As an optimization for on-demand growth cases an |out_task| can be specified
+// to receive a task without the need for acquiring one from the pool
+// immediately after the growth completes. This avoids a race condition where
+// another thread could snipe the tasks we just allocated for the caller prior
+// to the caller getting a chance to acquire one.
+static iree_status_t iree_task_pool_grow(iree_task_pool_t* pool,
+ iree_host_size_t minimum_capacity,
+ iree_task_t** out_task) {
+ if (IREE_UNLIKELY(!minimum_capacity)) return iree_ok_status();
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Allocate a new block of tasks. To try to prevent the allocator from
+ // fragmenting we try to always allocate blocks that are page-aligned and
+ // powers of two.
+ //
+ // Note that we pad out our header to iree_max_align_t bytes so that all tasks
+ // are aligned on the same boundaries as required by atomic operations.
+ iree_host_size_t header_size =
+ iree_math_align(sizeof(iree_task_allocation_header_t), iree_max_align_t);
+ iree_host_size_t pow2_block_size = iree_math_round_up_to_pow2_u64(
+ header_size + minimum_capacity * pool->task_size);
+ iree_host_size_t aligned_block_size =
+ iree_math_align(pow2_block_size, IREE_TASK_POOL_BLOCK_ALIGNMENT);
+ if (aligned_block_size < IREE_TASK_POOL_MIN_BLOCK_SIZE) {
+ aligned_block_size = IREE_TASK_POOL_MIN_BLOCK_SIZE;
+ }
+ iree_task_allocation_header_t* allocation = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_allocator_malloc(pool->allocator, aligned_block_size,
+ (void**)&allocation));
+
+ // Insert the allocation into the tracking list. Nothing reads the list until
+ // the pool is trimmed/deinitialized so it's safe to do now prior to
+ // populating anything. It's all just empty data anyway.
+ iree_atomic_task_allocation_slist_push(&pool->allocations_slist, allocation);
+
+ // Since we may have rounded up the allocation we may have gotten more space
+ // for tasks than we were asked for. Ensure we actually make use of them.
+ iree_host_size_t actual_capacity =
+ (aligned_block_size - header_size) / pool->task_size;
+
+ // Stitch together the tasks by setting all next pointers.
+ // Since we are going to be touching all the pages the order here is important
+ // as once we insert these new tasks into the available_slist they'll be
+ // popped out head->tail. To ensure the head that gets popped first is still
+ // warm in cache we construct the list backwards, with the tail tasks being
+ // fine to be evicted.
+ //
+ // The nice thing about this walk is that it ensures that if there were any
+ // zero-fill-on-demand trickery going on the pages are all wired here vs.
+ // when the tasks are first acquired from the list where it'd be harder to
+ // track.
+  uintptr_t p = ((uintptr_t)allocation + aligned_block_size) - pool->task_size;
+  iree_task_t* head = (iree_task_t*)p;
+  iree_task_t* tail = head;
+  head->next_task = NULL;  // tail of the new free list terminates with NULL.
+  for (iree_host_size_t i = 1; i < actual_capacity; ++i) {
+    p -= pool->task_size;
+    iree_task_t* task = (iree_task_t*)p;
+    task->next_task = head;
+    head = task;
+  }
+
+ // If the caller needs a task we can slice off the head to return prior to
+ // adding it to the slist where it may get stolen.
+ if (out_task) {
+ *out_task = head;
+ head = head->next_task;
+ }
+
+ // Concatenate the list of new free tasks into the pool.
+ iree_atomic_task_slist_concat(&pool->available_slist, head, tail);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_pool_initialize(iree_allocator_t allocator,
+ iree_host_size_t task_size,
+ iree_host_size_t initial_capacity,
+ iree_task_pool_t* out_pool) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, task_size);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, initial_capacity);
+
+ out_pool->allocator = allocator;
+ out_pool->task_size = task_size;
+ iree_atomic_task_allocation_slist_initialize(&out_pool->allocations_slist);
+ iree_atomic_task_slist_initialize(&out_pool->available_slist);
+ iree_status_t status =
+ iree_task_pool_grow(out_pool, initial_capacity, /*out_task=*/NULL);
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+void iree_task_pool_deinitialize(iree_task_pool_t* pool) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_allocation_header_t* allocation = NULL;
+ if (iree_atomic_task_allocation_slist_flush(
+ &pool->allocations_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &allocation, NULL)) {
+ while (allocation) {
+ iree_task_allocation_header_t* next =
+ iree_atomic_task_allocation_slist_get_next(allocation);
+ iree_allocator_free(pool->allocator, allocation);
+ allocation = next;
+ }
+ }
+ iree_atomic_task_allocation_slist_deinitialize(&pool->allocations_slist);
+ iree_atomic_task_slist_deinitialize(&pool->available_slist);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_pool_trim(iree_task_pool_t* pool) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ // NOTE: this is only safe if there are no outstanding tasks.
+ // Hopefully the caller read the docstring!
+
+ // We only need to flush the list to empty it - these are just references into
+ // the allocations and don't need to be released.
+ iree_task_t* task_head = NULL;
+ iree_atomic_task_slist_flush(&pool->available_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+ &task_head, /*tail=*/NULL);
+
+ iree_task_allocation_header_t* allocation_head = NULL;
+ if (iree_atomic_task_allocation_slist_flush(
+ &pool->allocations_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO, &allocation_head,
+ /*tail=*/NULL)) {
+ do {
+ iree_task_allocation_header_t* next =
+ iree_atomic_task_allocation_slist_get_next(allocation_head);
+ iree_allocator_free(pool->allocator, allocation_head);
+ allocation_head = next;
+ } while (allocation_head != NULL);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_task_pool_acquire(iree_task_pool_t* pool,
+ iree_task_t** out_task) {
+ if (!pool) return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED);
+
+ // Attempt to acquire a task from the available list.
+ iree_task_t* task = iree_atomic_task_slist_pop(&pool->available_slist);
+ if (task) {
+ *out_task = task;
+ return iree_ok_status();
+ }
+
+ // No tasks were available when we tried; force growth now.
+ // Note that due to races it's possible that there are now tasks that have
+ // been released back into the pool, but the fact that we failed once means
+ // we are sitting right at the current limit of the pool and growing will
+ // help ensure we go down the fast path more frequently in the future.
+ return iree_task_pool_grow(pool, IREE_TASK_POOL_MIN_GROWTH_CAPACITY,
+ out_task);
+}
+
+iree_status_t iree_task_pool_acquire_many(iree_task_pool_t* pool,
+ iree_host_size_t count,
+ iree_task_list_t* out_list) {
+ if (!pool) return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED);
+
+ // If we acquire more than the requested count we need to give those leftovers
+ // back to the pool before we leave.
+ iree_task_list_t leftover_tasks;
+ iree_task_list_initialize(&leftover_tasks);
+ iree_task_list_initialize(out_list);
+
+ iree_status_t status = iree_ok_status();
+ while (count) {
+ // Flush the entire available list so we can start operating on it.
+ // This is where the potential race comes in: if another thread goes to
+ // acquire a task while we have the list local here it'll grow the list so
+ // it can meet its demand. That's still correct behavior but will result in
+ // potentially more wasted memory than if the other thread would have
+ // waited. Thankfully we save memory in so many other places that in the
+ // rare case there are multiple concurrent schedulers acquiring tasks it's
+ // not the end of the world.
+ iree_task_list_t acquired_tasks;
+ iree_task_list_initialize(&acquired_tasks);
+ if (iree_atomic_task_slist_flush(
+ &pool->available_slist,
+ IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+ &acquired_tasks.head,
+ /*tail=*/NULL)) {
+ // Had some items in the pool; eat up to the requested count.
+ // Note that we may run out and need to allocate more or have gotten
+ // too many during the flush and need to track those leftovers.
+ //
+ // Instead of having the slist flush walk the list and give us a tail we
+ // do that here: we need to walk the list anyway to partition it.
+      iree_task_t* p = acquired_tasks.head;
+      acquired_tasks.tail = p;
+      --count;  // the head of the flushed list is the first acquired task
+      while (count > 0) {
+        iree_task_t* next = iree_atomic_task_slist_get_next(p);
+        if (!next) break;
+        p = next;
+        acquired_tasks.tail = p;
+        --count;
+      }
+
+ // If we got everything we need then we have to put all of the flushed
+ // tasks we didn't use into the leftover list.
+      if (count == 0) {
+        iree_task_list_t acquire_leftovers;
+        iree_task_list_initialize(&acquire_leftovers);
+        acquire_leftovers.head =
+            iree_atomic_task_slist_get_next(acquired_tasks.tail);
+        iree_atomic_task_slist_set_next(acquired_tasks.tail, NULL);
+        if (acquire_leftovers.head) {
+          // Walk to the end of the leftovers so we can concat them back into
+          // the pool in a single operation below.
+          p = acquire_leftovers.head;
+          iree_task_t* next;
+          while ((next = iree_atomic_task_slist_get_next(p))) p = next;
+          acquire_leftovers.tail = p;
+          iree_task_list_append(&leftover_tasks, &acquire_leftovers);
+        }
+      }
+
+ // Add the tasks we did acquire to our result list.
+ // NOTE: this is unmeasured but the intuition is that we want to put the
+ // tasks we just acquired at the head of the list so that they are warm
+ // upon return to the caller who will then be touching the head of the
+ // list immediately.
+ iree_task_list_prepend(out_list, &acquired_tasks);
+ }
+
+ // If we still need more tasks but ran out of ones in the flush list then we
+ // need to grow some more.
+ if (count > 0) {
+ status = iree_task_pool_grow(pool, count, /*out_task=*/NULL);
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) break;
+ }
+ }
+
+ // Return leftovers that we acquired but didn't need to the pool.
+ iree_atomic_task_slist_concat(&pool->available_slist, leftover_tasks.head,
+ leftover_tasks.tail);
+
+ // Upon failure return any tasks we may have already acquired from the pool.
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+ iree_atomic_task_slist_concat(&pool->available_slist, out_list->head,
+ out_list->tail);
+ }
+
+ return status;
+}
+
+void iree_task_pool_release(iree_task_pool_t* pool, iree_task_t* task) {
+ if (!pool) return;
+ assert(task->pool == pool);
+ iree_atomic_task_slist_push(&pool->available_slist, task);
+}
diff --git a/iree/task/pool.h b/iree/task/pool.h
new file mode 100644
index 0000000..7610ff7
--- /dev/null
+++ b/iree/task/pool.h
@@ -0,0 +1,122 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_POOL_H_
+#define IREE_TASK_POOL_H_
+
+#include <stdint.h>
+
+#include "iree/base/api.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// An allocation of tasks in a task pool containing multiple tasks.
+// This struct is at the head of all task allocations made from the allocator.
+// It is used to form a linked list of all allocations made so that they can be
+// easily freed during pool teardown.
+typedef struct {
+ // Next allocation in the linked list of allocations.
+ iree_atomic_slist_intrusive_ptr_t* next;
+} iree_task_allocation_header_t;
+
+// An atomic approximately LIFO singly-linked list.
+IREE_TYPED_ATOMIC_SLIST_WRAPPER(iree_atomic_task_allocation,
+ iree_task_allocation_header_t,
+ offsetof(iree_task_allocation_header_t, next));
+
+// Shared thread-safe pool of iree_task_t structures of a particular size.
+// This can be used to quickly allocate blocks of tasks to be initialized by
+// task producers, enqueued, and then eventually recycled back to the pool.
+//
+// The lifetime of all tasks must be less than the pool they were acquired
+// from. Tasks acquired from one pool must not be released to another pool or
+// via any other mechanism.
+//
+// Pools can either be fixed-size with a maximum number of available tasks that
+// can be outstanding at any time or growable to allow the pool to be grown
+// unbounded after initialization.
+typedef struct iree_task_pool_s {
+ // Allocator used for allocating/freeing each allocation block.
+ iree_allocator_t allocator;
+
+ // Task size, in bytes.
+ iree_host_size_t task_size;
+
+ // NOTE: we don't track current usage count as that would introduce additional
+ // contention as tasks are acquired/released. If we end up finding a lot of
+ // memory idling here we can add a threshold over which we reclaim it, but the
+ // easiest (and most efficient) solution is to force the user to synchronize
+ // with the executor on a low memory event and use iree_task_pool_trim.
+
+ // Head of a linked list of all allocations made by the pool.
+ iree_atomic_task_allocation_slist_t allocations_slist;
+
+ // Linked list of free tasks used as a stack (LIFO).
+ // This is not a great structure for this as over time the tasks will get out
+ // of order and walking the linked list will incur cache misses. We offset
+ // that cost a bit by knowing that the time between walking the list to
+ // acquire tasks and when we initialize the tasks is short and that we would
+ // have triggered a cache miss anyway. In the future we can explore other
+ // approaches (such as small chunked linear lists) that better exploit spatial
+ // locality, if needed.
+ iree_atomic_task_slist_t available_slist;
+} iree_task_pool_t;
+
+// Initializes a task pool and optionally performs an initial task allocation.
+iree_status_t iree_task_pool_initialize(iree_allocator_t allocator,
+ iree_host_size_t task_size,
+ iree_host_size_t initial_capacity,
+ iree_task_pool_t* out_pool);
+
+// Deinitializes a task pool and releases all task allocations back to the
+// allocator specified during initialization. All tasks must have already been
+// released back to the pool.
+void iree_task_pool_deinitialize(iree_task_pool_t* pool);
+
+// Attempts to trim unused allocations from the task pool.
+// Must not be called while any tasks that were acquired from this pool are
+// still live; callers must synchronize with the executor and ensure they aren't
+// pushing any more work during the trim operation.
+void iree_task_pool_trim(iree_task_pool_t* pool);
+
+// Acquires a task from the task pool. The returned task will have undefined
+// contents and must be initialized by the caller.
+iree_status_t iree_task_pool_acquire(iree_task_pool_t* pool,
+ iree_task_t** out_task);
+
+// Acquires a set of tasks from the task pool. The returned tasks will have
+// undefined contents besides their intrusive next pointers and must be
+// initialized by the caller.
+//
+// WARNING: this may cause growth during races if multiple threads are trying to
+// acquire at the same time. Our usage patterns here are such that this is never
+// the case, though, as all acquisition from the internal executor pools happens
+// with the coordination lock held.
+iree_status_t iree_task_pool_acquire_many(iree_task_pool_t* pool,
+ iree_host_size_t count,
+ iree_task_list_t* out_list);
+
+// Releases a task to the task pool.
+// Callers must ensure the task is no longer in use.
+void iree_task_pool_release(iree_task_pool_t* pool, iree_task_t* task);
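+
+// Example usage (an illustrative sketch; the pooled task type and capacity
+// below are arbitrary choices, not requirements of this API):
+//   iree_task_pool_t pool;
+//   IREE_RETURN_IF_ERROR(iree_task_pool_initialize(
+//       iree_allocator_system(), sizeof(iree_task_call_t),
+//       /*initial_capacity=*/128, &pool));
+//   iree_task_t* task = NULL;
+//   IREE_RETURN_IF_ERROR(iree_task_pool_acquire(&pool, &task));
+//   // ... initialize |task| as an iree_task_call_t and submit it ...
+//   iree_task_pool_release(&pool, task);  // usually done as the task retires
+//   iree_task_pool_deinitialize(&pool);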
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_POOL_H_
diff --git a/iree/task/pool_test.cc b/iree/task/pool_test.cc
new file mode 100644
index 0000000..641292f
--- /dev/null
+++ b/iree/task/pool_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/pool.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(PoolTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/post_batch.c b/iree/task/post_batch.c
new file mode 100644
index 0000000..111c39c
--- /dev/null
+++ b/iree/task/post_batch.c
@@ -0,0 +1,189 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/post_batch.h"
+
+#include "iree/base/math.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/threading.h"
+#include "iree/base/tracing.h"
+#include "iree/task/executor_impl.h"
+
+void iree_task_post_batch_initialize(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ iree_task_post_batch_t* out_post_batch) {
+ out_post_batch->executor = executor;
+ out_post_batch->current_worker = current_worker;
+ out_post_batch->worker_pending_mask = 0;
+ memset(&out_post_batch->worker_pending_lifos, 0,
+ executor->worker_count * sizeof(iree_task_list_t));
+}
+
+iree_host_size_t iree_task_post_batch_worker_count(
+ const iree_task_post_batch_t* post_batch) {
+ return post_batch->executor->worker_count;
+}
+
+static iree_host_size_t iree_task_post_batch_select_random_worker(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set) {
+ iree_task_affinity_set_t worker_live_mask =
+ iree_atomic_task_affinity_set_load(
+ &post_batch->executor->worker_live_mask, iree_memory_order_relaxed);
+ iree_task_affinity_set_t valid_worker_mask = affinity_set & worker_live_mask;
+ if (!valid_worker_mask) {
+ // No valid workers as desired; for now just bail to worker 0.
+ return 0;
+ }
+
+  // TODO(benvanik): rotate through workers here instead of always taking the
+  // first valid worker in the mask; a round-robin (or truly random) pick would
+  // spread load better when many workers are allowed.
+ return iree_task_affinity_set_count_trailing_zeros(valid_worker_mask);
+}
+
+iree_host_size_t iree_task_post_batch_select_worker(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set) {
+ if (post_batch->current_worker) {
+ // Posting from a worker - prefer sending right back to this worker.
+ if (affinity_set & post_batch->current_worker->worker_bit) {
+ return iree_task_affinity_set_count_trailing_zeros(
+ post_batch->current_worker->worker_bit);
+ }
+ }
+
+  // Prefer workers that are idle: even though they'll need to wake up it is
+  // guaranteed that they aren't working on something else, and the latency of
+  // waking should (hopefully) be less than the latency of waiting for a busy
+  // worker's queue to drain.
+ iree_task_affinity_set_t worker_idle_mask =
+ iree_atomic_task_affinity_set_load(
+ &post_batch->executor->worker_idle_mask, iree_memory_order_relaxed);
+ iree_task_affinity_set_t idle_affinity_set = affinity_set & worker_idle_mask;
+ if (idle_affinity_set) {
+ return iree_task_post_batch_select_random_worker(post_batch,
+ idle_affinity_set);
+ }
+
+ // No more workers are idle; farm out at random. In the worst case work
+ // stealing will help balance things out on the backend.
+ return iree_task_post_batch_select_random_worker(post_batch, affinity_set);
+}
+
+void iree_task_post_batch_enqueue(iree_task_post_batch_t* post_batch,
+ iree_host_size_t worker_index,
+ iree_task_t* task) {
+ iree_task_list_push_front(&post_batch->worker_pending_lifos[worker_index],
+ task);
+ post_batch->worker_pending_mask |=
+ iree_task_affinity_for_worker(worker_index);
+}
+
+// Wakes each worker indicated in the |wake_mask|, if needed.
+static void iree_task_post_batch_wake_workers(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t wake_mask) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, iree_math_count_ones_u64(wake_mask));
+
+ iree_task_executor_t* executor = post_batch->executor;
+
+ // Wake workers that may be suspended. We fetch the set of workers we need to
+ // wake (hopefully none in the common case) and mark that we've woken them so
+ // that we don't double-resume.
+ iree_task_affinity_set_t resume_mask =
+ iree_atomic_task_affinity_set_fetch_and(&executor->worker_suspend_mask,
+ ~wake_mask,
+ iree_memory_order_acquire);
+ resume_mask &= wake_mask;
+ if (IREE_UNLIKELY(resume_mask)) {
+ int resume_count = iree_task_affinity_set_count_ones(resume_mask);
+ int worker_index = 0;
+ for (int i = 0; i < resume_count; ++i) {
+      int offset = iree_task_affinity_set_count_trailing_zeros(resume_mask);
+ int resume_index = worker_index + offset;
+ worker_index += offset + 1;
+ resume_mask = resume_mask >> (offset + 1);
+ iree_thread_resume(executor->workers[resume_index].thread);
+ }
+ }
+
+ // TODO(#4016): use a FUTEX_WAKE_BITSET here to wake all of the workers that
+ // have pending work in a single syscall (vs. popcnt(worker_pending_mask)
+ // syscalls). This will reduce wake latency for workers later in the set;
+ // for example today worker[31] will wait until workers[0-30] have had their
+ // syscalls performed before it's even requested to wake. This also loses
+ // information the kernel could use to avoid core migration as it knows when N
+ // threads will be needed simultaneously and can hopefully perform any needed
+ // migrations prior to beginning execution.
+ int wake_count = iree_task_affinity_set_count_ones(wake_mask);
+ int worker_index = 0;
+ for (int i = 0; i < wake_count; ++i) {
+ int offset = iree_task_affinity_set_count_trailing_zeros(wake_mask);
+ int wake_index = worker_index + offset;
+ worker_index += offset + 1;
+ wake_mask = wake_mask >> (offset + 1);
+
+    // Wake workers if they are waiting - workers are the only thing that can
+    // wait on this notification, so in the common case where a particular
+    // worker isn't waiting this is nearly free (an atomic load); when it is
+    // waiting we genuinely need the wake and can't avoid the cost.
+ iree_task_worker_t* worker = &executor->workers[wake_index];
+ iree_notification_post(&worker->wake_notification, 1);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+bool iree_task_post_batch_submit(iree_task_post_batch_t* post_batch) {
+ if (!post_batch->worker_pending_mask) return false;
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Run through each worker that has a bit set in the pending mask and post
+ // the pending tasks.
+ iree_task_affinity_set_t worker_mask = post_batch->worker_pending_mask;
+ post_batch->worker_pending_mask = 0;
+ int worker_index = 0;
+ int post_count = iree_task_affinity_set_count_ones(worker_mask);
+ iree_task_affinity_set_t worker_wake_mask = 0;
+ for (int i = 0; i < post_count; ++i) {
+ int offset = iree_task_affinity_set_count_trailing_zeros(worker_mask);
+ int target_index = worker_index + offset;
+ worker_index += offset + 1;
+ worker_mask = worker_mask >> (offset + 1);
+
+ iree_task_worker_t* worker = &post_batch->executor->workers[target_index];
+ iree_task_list_t* target_pending_lifo =
+ &post_batch->worker_pending_lifos[target_index];
+ if (worker == post_batch->current_worker) {
+ // Fast-path for posting to self; this happens when a worker plays the
+ // role of coordinator and we want to ensure we aren't doing a fully
+ // block-and-flush loop when we could just be popping the next new task
+ // off the list.
+ iree_task_queue_append_from_lifo_list_unsafe(&worker->local_task_queue,
+ target_pending_lifo);
+ } else {
+ iree_task_worker_post_tasks(worker, target_pending_lifo);
+ worker_wake_mask |= iree_task_affinity_for_worker(target_index);
+ }
+ }
+
+ // Wake all workers that now have pending work. If a worker is not already
+ // waiting this will be cheap (no syscall).
+ if (worker_wake_mask != 0) {
+ iree_task_post_batch_wake_workers(post_batch, worker_wake_mask);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+ return post_count != 0;
+}
diff --git a/iree/task/post_batch.h b/iree/task/post_batch.h
new file mode 100644
index 0000000..944fdd7
--- /dev/null
+++ b/iree/task/post_batch.h
@@ -0,0 +1,77 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_POST_BATCH_H_
+#define IREE_TASK_POST_BATCH_H_
+
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+typedef struct iree_task_worker_s iree_task_worker_t;
+
+// Transient/stack-allocated structure for batching up tasks for posting to
+// worker mailboxes in single operations. This avoids the need to repeatedly
+// thrash caches during coordination as only during submission are the worker
+// mailboxes touched and only once per worker.
+typedef struct {
+ iree_task_executor_t* executor;
+
+ // Local worker constructing the post batch.
+ // This is used to know when lighter-weight queuing can occur (no need to
+ // post across a mailbox channel to yourself!).
+ // May be NULL if not being posted from a worker (such as a submission).
+ iree_task_worker_t* current_worker;
+
+ // A bitmask of workers indicating which have pending tasks in their lists.
+ // Used to quickly scan the lists and perform the posts only when required.
+ iree_task_affinity_set_t worker_pending_mask;
+
+ // A per-worker LIFO task list waiting to be posted.
+ iree_task_list_t worker_pending_lifos[0];
+} iree_task_post_batch_t;
+
+void iree_task_post_batch_initialize(iree_task_executor_t* executor,
+ iree_task_worker_t* current_worker,
+ iree_task_post_batch_t* out_post_batch);
+
+// Returns the total number of workers that the post batch is targeting.
+iree_host_size_t iree_task_post_batch_worker_count(
+ const iree_task_post_batch_t* post_batch);
+
+// Selects a random worker from the given affinity set.
+iree_host_size_t iree_task_post_batch_select_worker(
+ iree_task_post_batch_t* post_batch, iree_task_affinity_set_t affinity_set);
+
+// Enqueues a task to the given worker. Note that the pending work list for
+// each worker is kept in LIFO order so that we can easily concatenate it with
+// the worker's mailbox slist, which is also in LIFO order.
+void iree_task_post_batch_enqueue(iree_task_post_batch_t* post_batch,
+ iree_host_size_t worker_index,
+ iree_task_t* task);
+
+// Submits all pending tasks to their worker mailboxes and resets state.
+// Returns true if any tasks were posted to workers.
+bool iree_task_post_batch_submit(iree_task_post_batch_t* post_batch);
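+
+// Example coordination flow (an illustrative sketch; kWorkerCount and the
+// stack storage below are placeholders, as in practice the executor sizes and
+// allocates this structure itself during coordination):
+//   uint8_t storage[sizeof(iree_task_post_batch_t) +
+//                   kWorkerCount * sizeof(iree_task_list_t)];
+//   iree_task_post_batch_t* post_batch = (iree_task_post_batch_t*)storage;
+//   iree_task_post_batch_initialize(executor, /*current_worker=*/NULL,
+//                                   post_batch);
+//   iree_host_size_t worker_index =
+//       iree_task_post_batch_select_worker(post_batch, task->affinity_set);
+//   iree_task_post_batch_enqueue(post_batch, worker_index, task);
+//   iree_task_post_batch_submit(post_batch);  // posts and wakes workers once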
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_POST_BATCH_H_
diff --git a/iree/task/queue.c b/iree/task/queue.c
new file mode 100644
index 0000000..d546b0d
--- /dev/null
+++ b/iree/task/queue.c
@@ -0,0 +1,97 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/queue.h"
+
+#include <assert.h>
+
+void iree_task_queue_initialize(iree_task_queue_t* out_queue) {
+ memset(out_queue, 0, sizeof(*out_queue));
+ iree_slim_mutex_initialize(&out_queue->mutex);
+ iree_task_list_initialize(&out_queue->list);
+}
+
+void iree_task_queue_deinitialize(iree_task_queue_t* queue) {
+ iree_task_list_discard(&queue->list);
+ iree_slim_mutex_deinitialize(&queue->mutex);
+}
+
+bool iree_task_queue_is_empty(iree_task_queue_t* queue) {
+ iree_slim_mutex_lock(&queue->mutex);
+ bool is_empty = iree_task_list_is_empty(&queue->list);
+ iree_slim_mutex_unlock(&queue->mutex);
+ return is_empty;
+}
+
+void iree_task_queue_push_front(iree_task_queue_t* queue, iree_task_t* task) {
+ iree_slim_mutex_lock(&queue->mutex);
+ iree_task_list_push_front(&queue->list, task);
+ iree_slim_mutex_unlock(&queue->mutex);
+}
+
+void iree_task_queue_append_from_lifo_list_unsafe(iree_task_queue_t* queue,
+ iree_task_list_t* list) {
+ // NOTE: reversing the list outside of the lock.
+ iree_task_list_reverse(list);
+ iree_slim_mutex_lock(&queue->mutex);
+ iree_task_list_append(&queue->list, list);
+ iree_slim_mutex_unlock(&queue->mutex);
+}
+
+iree_task_t* iree_task_queue_append_from_lifo_slist(
+ iree_task_queue_t* queue, iree_atomic_task_slist_t* source_slist) {
+ // Perform the flush and swap outside of the lock; acquiring the list is
+ // atomic and then we own it exclusively.
+ iree_task_list_t suffix;
+ iree_task_list_initialize(&suffix);
+ bool did_flush = iree_atomic_task_slist_flush(
+ source_slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_FIFO,
+ &suffix.head, &suffix.tail);
+
+ // Append the tasks and pop off the front for return.
+ iree_slim_mutex_lock(&queue->mutex);
+ if (did_flush) iree_task_list_append(&queue->list, &suffix);
+ iree_task_t* next_task = iree_task_list_pop_front(&queue->list);
+ iree_slim_mutex_unlock(&queue->mutex);
+
+ return next_task;
+}
+
+iree_task_t* iree_task_queue_pop_front(iree_task_queue_t* queue) {
+ iree_slim_mutex_lock(&queue->mutex);
+ iree_task_t* next_task = iree_task_list_pop_front(&queue->list);
+ iree_slim_mutex_unlock(&queue->mutex);
+ return next_task;
+}
+
+iree_task_t* iree_task_queue_try_steal(iree_task_queue_t* source_queue,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks) {
+ // First attempt to steal up to max_tasks from the source queue.
+ iree_task_list_t stolen_tasks;
+ iree_task_list_initialize(&stolen_tasks);
+ iree_slim_mutex_lock(&source_queue->mutex);
+ iree_task_list_split(&source_queue->list, max_tasks, &stolen_tasks);
+ iree_slim_mutex_unlock(&source_queue->mutex);
+
+ // Add any stolen tasks to the target queue and pop off the head for return.
+ iree_task_t* next_task = NULL;
+ if (!iree_task_list_is_empty(&stolen_tasks)) {
+ iree_slim_mutex_lock(&target_queue->mutex);
+ iree_task_list_append(&target_queue->list, &stolen_tasks);
+ next_task = iree_task_list_pop_front(&target_queue->list);
+ iree_slim_mutex_unlock(&target_queue->mutex);
+ }
+ return next_task;
+}
diff --git a/iree/task/queue.h b/iree/task/queue.h
new file mode 100644
index 0000000..6248551
--- /dev/null
+++ b/iree/task/queue.h
@@ -0,0 +1,172 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_QUEUE_H_
+#define IREE_TASK_QUEUE_H_
+
+#include "iree/base/api.h"
+#include "iree/base/synchronization.h"
+#include "iree/task/list.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// A simple mutex-guarded work-stealing queue loosely modeled on a Chase-Lev
+// concurrent deque. Workers use this to maintain their thread-local working
+// lists: the tasks they will process are kept in FIFO order, the list is
+// allowed to drain, and it is then refreshed with more tasks from the incoming
+// worker mailbox.
+// The performance bias here is toward the owning worker, which performs >90%
+// of the accesses; the only other accesses come from thieves, and we'd rather
+// improve our work distribution than introduce a slowdown on the common path.
+//
+// A futex is used to synchronize access; because the common case is that of
+// only the worker that owns the queue touching it for pushing and popping items
+// this puts us into the sweet-spot of uncontended lightweight exclusive locks.
+// Since futexes are effectively just single machine words managed with atomic
+// ops we can avoid a lot of the traditional atomic tomfoolery found in systems
+// like these that originated prior to the introduction of futexes, while also
+// keeping close to the tiny overhead of the pure atomic solutions.
+//
+// We can also take advantage of the futex providing an actual exclusive region
+// such that our data structure can be whatever we want as opposed to needing to
+// be something that someone had figured out how to make atomic. For example,
+// common implementations of work-stealing queues are all bounded as unbounded
+// atomic deques are an unsolved problem in CS.
+//
+// Very rarely when another worker runs out of work it'll try to steal tasks
+// from nearby workers and use this queue type to do it: the assumption is that
+// it's better to take the last task the victim worker will get to so that in a
+// long list of tasks it remains chugging through the head of the list with good
+// cache locality. If we end up with a lot of theft, though, it's possible for
+// the cache pessimism inflicted on all of the thieves to outweigh the cache
+// benefits that stealing from the back gives the owning worker. Let's hope we
+// can schedule deterministic-enough tiles such that theft is rare!
+//
+// Our queue variant here is tuned for the use case we have: we exclusively
+// push in multiple tasks at a time (flushed from the mailbox) and exclusively
+// pop a single task at a time (what to work on next). The stealing part is batched
+// so that when a remote worker has to perform a theft it takes a good chunk of
+// tasks in one go (hopefully roughly half) to reduce the total overhead when
+// there is high imbalance in workloads.
+//
+// Flushing from the mailbox slist (LIFO) to our list (FIFO) requires a full
+// walk of the incoming task linked list. This is generally fine as the number
+// of tasks in any given flush is low(ish) and by walking in reverse order to
+// then process forward the cache should be hot as the worker starts making its
+// way back through the tasks. As we walk forward we'll be using the task fields
+// for execution and retiring of tasks (notifying dependencies/etc) and the
+// intrusive next pointer sitting next to those should be in-cache when we need
+// to access it. This, combined with slab allocation of tasks in command buffers
+// to begin with gives us the (probabilistically) same characteristics of a flat
+// array walked with an index as is common in other work queues but with the
+// flexibility to reorder tasks as we see fit (theft, redistribution/rotation,
+// reprioritization, etc).
+//
+// Similar concepts, though implemented with atomics:
+// "Dynamic Circular Work-Stealing Deque":
+// http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.170.1097&rep=rep1&type=pdf
+// "Correct and Efficient Work-Stealing for Weak Memory Models":
+// https://fzn.fr/readings/ppopp13.pdf
+// Motivating article:
+// https://blog.molecular-matters.com/2015/08/24/job-system-2-0-lock-free-work-stealing-part-1-basics/
+//
+// Useful diagram from https://github.com/injinj/WSQ
+// Much of this implementation is inspired by that, though significant
+// reworking was required for our FIFO->LIFO->FIFO sandwich.
+// +--------+ <- tasks[0]
+// | top | <- stealers consume here: task = tasks[top++]
+// | |
+// | || |
+// | |
+// | vv |
+// | bottom | <- owner pushes here: tasks[bottom++] = task
+// | | owner consumes here: task = tasks[--bottom]
+// | |
+// +--------+ <- tasks[IREE_TASK_QUEUE_CAPACITY-1]
+//
+// Unlike that implementation our task list is unbounded because we use a
+// linked list. To keep our options open, though, the API of this implementation
+// is kept compatible with classic atomic work-stealing queues. Hopefully it
+// won't need to be revisited for a while!
+//
+// Future improvement idea: have the owner of the queue maintain a theft point
+// skip list that makes it possible for thieves to quickly come in and slice
+// off batches of tasks at the tail of the queue. Since we are a singly-linked
+// list we can't easily just walk backward and we don't want to be introducing
+// cache line contention as thieves start touching the same tasks as the worker
+// is while processing.
+typedef struct {
+ // Must be held when manipulating the queue. >90% accesses are by the owner.
+ iree_slim_mutex_t mutex;
+
+ // FIFO task list.
+ iree_task_list_t list IREE_GUARDED_BY(mutex);
+} iree_task_queue_t;
+
+// Initializes a work-stealing task queue in-place.
+void iree_task_queue_initialize(iree_task_queue_t* out_queue);
+
+// Deinitializes a task queue and clears all references.
+// Must not be called while any other worker may be attempting to steal tasks.
+void iree_task_queue_deinitialize(iree_task_queue_t* queue);
+
+// Returns true if the queue is empty.
+// Note that due to races this may return both false-positives and -negatives.
+bool iree_task_queue_is_empty(iree_task_queue_t* queue);
+
+// Pushes a task to the front of the queue.
+// Always prefer the multi-push variants (prepend/append) when adding more than
+// one task to the queue. This is mostly useful for exceptional cases such as
+// when a task may yield and need to be reprocessed after the worker resumes.
+//
+// Must only be called from the owning worker's thread.
+void iree_task_queue_push_front(iree_task_queue_t* queue, iree_task_t* task);
+
+// Appends a LIFO |list| of tasks to the queue.
+//
+// Must only be called from the owning worker's thread.
+void iree_task_queue_append_from_lifo_list_unsafe(iree_task_queue_t* queue,
+ iree_task_list_t* list);
+
+// Flushes the |source_slist| LIFO mailbox into the task queue in FIFO order.
+// Returns the first task in the queue upon success; the task may be
+// pre-existing or from the newly flushed tasks.
+//
+// Must only be called from the owning worker's thread.
+iree_task_t* iree_task_queue_append_from_lifo_slist(
+ iree_task_queue_t* queue, iree_atomic_task_slist_t* source_slist);
+
+// Pops a task from the front of the queue if any are available.
+//
+// Must only be called from the owning worker's thread.
+iree_task_t* iree_task_queue_pop_front(iree_task_queue_t* queue);
+
+// Tries to steal up to |max_tasks| from the back of the queue.
+// Returns NULL if no tasks are available and otherwise up to |max_tasks| tasks
+// that were at the tail of the |source_queue| will be moved to the
+// |target_queue| and the first of the stolen tasks is returned.
+//
+// It's expected this is not called from the queue's owning worker, though it's
+// valid to do so.
+iree_task_t* iree_task_queue_try_steal(iree_task_queue_t* source_queue,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks);
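+
+// Example (an illustrative sketch; |task_a|, |task_b|, and |thief_queue| are
+// assumed to have been acquired/initialized elsewhere, and the push/pop calls
+// run on the owning worker's thread):
+//   iree_task_queue_t queue;
+//   iree_task_queue_initialize(&queue);
+//   iree_task_queue_push_front(&queue, task_a);
+//   iree_task_queue_push_front(&queue, task_b);
+//   iree_task_t* next = iree_task_queue_pop_front(&queue);  // -> task_b
+//   // A starving worker steals a batch from the back into its own queue:
+//   iree_task_t* stolen =
+//       iree_task_queue_try_steal(&queue, &thief_queue, /*max_tasks=*/8);
+//   iree_task_queue_deinitialize(&queue);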
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_QUEUE_H_
diff --git a/iree/task/queue_test.cc b/iree/task/queue_test.cc
new file mode 100644
index 0000000..9b8b448
--- /dev/null
+++ b/iree/task/queue_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/queue.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(QueueTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/scope.c b/iree/task/scope.c
new file mode 100644
index 0000000..f35a687
--- /dev/null
+++ b/iree/task/scope.c
@@ -0,0 +1,128 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/scope.h"
+
+void iree_task_scope_initialize(iree_string_view_t name,
+ iree_task_scope_t* out_scope) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ memset(out_scope, 0, sizeof(*out_scope));
+
+ iree_host_size_t name_length =
+ iree_min(name.size, IREE_ARRAYSIZE(out_scope->name) - 1);
+ memcpy(out_scope->name, name.data, name_length);
+  out_scope->name[name_length] = 0;
+
+ // TODO(benvanik): pick trace colors based on name hash.
+ IREE_TRACE(out_scope->task_trace_color = 0xFFFF0000u);
+
+ iree_notification_initialize(&out_scope->idle_notification);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_scope_deinitialize(iree_task_scope_t* scope) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ assert(iree_task_scope_is_idle(scope) &&
+ "pending submissions must be aborted prior to deinitializing their "
+ "scope");
+
+ // Makes it easier to see if we were incorrectly using the name even after the
+ // scope is deinitialized. Since scopes may be stack allocated we don't want
+ // to have anyone trying to access them (like tracy).
+ memset(scope->name, 0xCD, sizeof(scope->name));
+
+ // In most cases the status will have been consumed by the scope owner.
+ iree_status_t status = (iree_status_t)iree_atomic_exchange_ptr(
+ &scope->permanent_status, (uintptr_t)NULL, iree_memory_order_acquire);
+ IREE_IGNORE_ERROR(status);
+
+ iree_notification_deinitialize(&scope->idle_notification);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+static void iree_task_scope_try_set_status(iree_task_scope_t* scope,
+ iree_status_t new_status) {
+ if (IREE_UNLIKELY(iree_status_is_ok(new_status))) return;
+
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_TEXT(z0, "failed: ");
+ IREE_TRACE_ZONE_APPEND_TEXT(
+ z0, iree_status_code_string(iree_status_code(new_status)));
+
+ iree_status_t old_status = iree_ok_status();
+ if (!iree_atomic_compare_exchange_strong_ptr(
+ &scope->permanent_status, (uintptr_t*)&old_status,
+ (uintptr_t)new_status, iree_memory_order_seq_cst,
+ iree_memory_order_seq_cst)) {
+ // Previous status was not OK; drop our new status.
+ IREE_IGNORE_ERROR(new_status);
+ }
+
+ // TODO(#4026): poke to wake idle waiters.
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_scope_abort(iree_task_scope_t* scope) {
+ iree_status_t status =
+ iree_make_status(IREE_STATUS_ABORTED, "entire scope aborted by user");
+ iree_task_scope_try_set_status(scope, status);
+}
+
+void iree_task_scope_fail(iree_task_scope_t* scope, iree_task_t* task,
+ iree_status_t status) {
+ // TODO(benvanik): logging/tracing based on task.
+ iree_task_scope_try_set_status(scope, status);
+}
+
+iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope) {
+ iree_status_t old_status = iree_ok_status();
+ iree_status_t new_status = iree_ok_status();
+ while (!iree_atomic_compare_exchange_strong_ptr(
+ &scope->permanent_status, (uintptr_t*)&old_status, (uintptr_t)new_status,
+ iree_memory_order_seq_cst, iree_memory_order_seq_cst)) {
+    // Previous status was not OK; |old_status| now holds the real failure.
+    // Leave behind a code-only copy of it so the scope stays failed while we
+    // transfer ownership of the full status to the caller, then try again.
+    new_status = iree_status_from_code(iree_status_code(old_status));
+ }
+ return old_status;
+}
+
+iree_task_dispatch_statistics_t iree_task_scope_consume_statistics(
+ iree_task_scope_t* scope) {
+ iree_task_dispatch_statistics_t result = scope->dispatch_statistics;
+ memset(&scope->dispatch_statistics, 0, sizeof(scope->dispatch_statistics));
+ return result;
+}
+
+bool iree_task_scope_is_idle(iree_task_scope_t* scope) {
+ return iree_atomic_load_int32(&scope->pending_submissions,
+ iree_memory_order_relaxed) == 0;
+}
+
+iree_status_t iree_task_scope_wait_idle(iree_task_scope_t* scope,
+ iree_time_t deadline_ns) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Wait for the scope to enter the idle state.
+ // NOTE: we are currently ignoring |deadline_ns|.
+ iree_notification_await(&scope->idle_notification,
+ (iree_condition_fn_t)iree_task_scope_is_idle, scope);
+
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
diff --git a/iree/task/scope.h b/iree/task/scope.h
new file mode 100644
index 0000000..ec10a34
--- /dev/null
+++ b/iree/task/scope.h
@@ -0,0 +1,136 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_SCOPE_H_
+#define IREE_TASK_SCOPE_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomics.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/tracing.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// A loose way of grouping tasks within the task system.
+// Each scope represents a unique collection of tasks that have some related
+// properties - most often their producer - that need to carry along some
+// tracking information to act on all related tasks at once. They do not
+// indicate any particular ordering of tasks or how the tasks are to be treated
+// by executors.
+//
+// Scopes can be used to signal, propagate, and retrieve failure statuses. As
+// the executor processes tasks in an unordered fashion this is the only way to
+// perform cross-task operations such as "abort all of the tasks from this
+// producer" or "wait until all tasks from this producer finish." In addition
+// there are statistics that can be aggregated across all tasks attributed to
+// the scope that allows for an efficient roll-up of activity over specific
+// durations.
+//
+// Task producers can decide whether to create new scopes for each batch of
+// tasks they submit or reuse scopes for the lifetime of their subprocess. Scope
+// overhead is low and the only advantage of reusing them is that lifetime can
+// become easier to manage by tying them 1:1 with producers.
+//
+// Thread-safe; once created scopes are modified exclusively via atomic
+// operations.
+typedef struct iree_task_scope_s {
+ // Name used for logging and tracing.
+ char name[16];
+
+ // Base color used for tasks in this scope.
+ // The color will be modulated based on task type.
+ IREE_TRACE(uint32_t task_trace_color;)
+
+ // A permanent status code set when a task within the scope fails. All pending
+ // tasks will be cancelled, though any in-flight tasks may continue executing
+ // to completion.
+ iree_atomic_ptr_t permanent_status;
+
+ // Dispatch statistics aggregated from all dispatches in this scope. Updated
+ // relatively infrequently and must not be used for task control as values
+ // are undefined in the case of failure and may tear.
+ iree_task_dispatch_statistics_t dispatch_statistics;
+
+ // A count of pending submissions within this scope. 0 indicates idle.
+ // Each submission has a fence that references this value and decrements it
+ // as it is reached indicating that all memory used by all tasks within that
+ // submission is available for reuse.
+ iree_atomic_int32_t pending_submissions;
+
+ // A notification signaled when the scope transitions to having no pending
+ // tasks or completes all pending tasks after a failure.
+ iree_notification_t idle_notification;
+} iree_task_scope_t;
+
+// Initializes a caller-allocated scope.
+// Callers must ensure the scope remains live for as long as there are any
+// tasks that may reference it.
+void iree_task_scope_initialize(iree_string_view_t name,
+ iree_task_scope_t* out_scope);
+
+// Deinitializes a task scope.
+// No tasks may be pending and the scope must be idle.
+void iree_task_scope_deinitialize(iree_task_scope_t* scope);
+
+// Marks the scope as having been aborted by the user with IREE_STATUS_ABORTED.
+// All pending tasks will be dropped though in-flight tasks may complete
+// execution. Callers must use iree_task_scope_wait_idle to ensure the scope
+// state synchronizes prior to deinitializing. If the scope has already been
+// aborted or failed with a permanent error then the operation is ignored and
+// the previous error status is preserved.
+void iree_task_scope_abort(iree_task_scope_t* scope);
+
+// Marks the scope as having encountered an error while processing |task|.
+// The scope will be moved into a permanent failure state and all pending tasks
+// will be aborted. In-flight tasks may continue executing prior to
+// iree_task_scope_wait_idle returning true. If the scope has already been
+// marked as failing then the status is ignored.
+void iree_task_scope_fail(iree_task_scope_t* scope, iree_task_t* task,
+ iree_status_t status);
+
+// Returns the permanent scope failure status to the caller (transferring
+// ownership). The scope will remain in a failed state with the status code.
+iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope);
+
+// Returns and resets the statistics for the scope.
+// Statistics may experience tearing (non-atomic update across fields) if this
+// is performed while tasks are in-flight.
+iree_task_dispatch_statistics_t iree_task_scope_consume_statistics(
+ iree_task_scope_t* scope);
+
+// Returns true if the scope has no pending or in-flight tasks.
+//
+// May race with other threads enqueuing work and be out of date immediately
+// upon return; callers are expected to use this only when it is safe.
+bool iree_task_scope_is_idle(iree_task_scope_t* scope);
+
+// Waits for the scope to become idle indicating that all pending and in-flight
+// tasks have completed. If the scope is aborted or marked for permanent failure
+// then the wait will only return after it is guaranteed no more tasks will ever
+// be issued by the task system.
+//
+// May race with other threads enqueuing work and be out of date immediately
+// upon return; callers must ensure this is used for command and control
+// decisions only when no other threads may be enqueuing more work.
+iree_status_t iree_task_scope_wait_idle(iree_task_scope_t* scope,
+ iree_time_t deadline_ns);
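+
+// Example lifetime (an illustrative sketch; how the consumed status is handled
+// is up to the caller):
+//   iree_task_scope_t scope;
+//   iree_task_scope_initialize(iree_make_cstring_view("my_scope"), &scope);
+//   // ... submit work attributed to &scope to an executor ...
+//   IREE_RETURN_IF_ERROR(
+//       iree_task_scope_wait_idle(&scope, IREE_TIME_INFINITE_FUTURE));
+//   iree_status_t status = iree_task_scope_consume_status(&scope);
+//   iree_task_scope_deinitialize(&scope);
+//   return status;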
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_SCOPE_H_
diff --git a/iree/task/scope_test.cc b/iree/task/scope_test.cc
new file mode 100644
index 0000000..9f42dd4
--- /dev/null
+++ b/iree/task/scope_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/scope.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(ScopeTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/submission.c b/iree/task/submission.c
new file mode 100644
index 0000000..6a1aefe
--- /dev/null
+++ b/iree/task/submission.c
@@ -0,0 +1,65 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/submission.h"
+
+#include "iree/base/debugging.h"
+
+void iree_task_submission_initialize(iree_task_submission_t* out_submission) {
+ iree_task_list_initialize(&out_submission->ready_list);
+ iree_task_list_initialize(&out_submission->waiting_list);
+}
+
+void iree_task_submission_initialize_from_lifo_slist(
+ iree_atomic_task_slist_t* ready_slist,
+ iree_task_submission_t* out_submission) {
+ // Flush from the LIFO ready list to the LIFO submission queue.
+ // We have to walk everything here to get the tail pointer, which could be
+ // improved by sourcing from something other than an slist.
+ iree_task_submission_initialize(out_submission);
+ iree_atomic_task_slist_flush(
+ ready_slist, IREE_ATOMIC_SLIST_FLUSH_ORDER_APPROXIMATE_LIFO,
+ &out_submission->ready_list.head, &out_submission->ready_list.tail);
+}
+
+void iree_task_submission_reset(iree_task_submission_t* submission) {
+ memset(&submission->ready_list, 0, sizeof(submission->ready_list));
+ memset(&submission->waiting_list, 0, sizeof(submission->waiting_list));
+}
+
+void iree_task_submission_discard(iree_task_submission_t* submission) {
+ iree_task_list_discard(&submission->ready_list);
+ iree_task_list_discard(&submission->waiting_list);
+}
+
+bool iree_task_submission_is_empty(iree_task_submission_t* submission) {
+ return iree_task_list_is_empty(&submission->ready_list) &&
+ iree_task_list_is_empty(&submission->waiting_list);
+}
+
+void iree_task_submission_enqueue(iree_task_submission_t* submission,
+ iree_task_t* task) {
+ IREE_ASSERT_TRUE(iree_task_is_ready(task),
+ "must be a root task to be enqueued on a submission");
+ if (task->type == IREE_TASK_TYPE_WAIT &&
+ (task->flags & IREE_TASK_FLAG_WAIT_COMPLETED) == 0) {
+ // A wait that we know is unresolved and can immediately route to the
+ // waiting list. This avoids the need to try to schedule the wait when it's
+ // almost certain that the wait would not be satisfied.
+ iree_task_list_push_front(&submission->waiting_list, task);
+ } else {
+ // Task is ready to execute immediately.
+ iree_task_list_push_front(&submission->ready_list, task);
+ }
+}
diff --git a/iree/task/submission.h b/iree/task/submission.h
new file mode 100644
index 0000000..0a1e84e
--- /dev/null
+++ b/iree/task/submission.h
@@ -0,0 +1,102 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_SUBMISSION_H_
+#define IREE_TASK_SUBMISSION_H_
+
+#include "iree/base/api.h"
+#include "iree/task/list.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// A pending submission to a task queue made up of a DAG of tasks.
+// Tasks are executed when ready in the order they were enqueued while observing
+// all dependencies. This means that two tasks that have no dependencies may
+// execute out of order/overlap.
+//
+// By keeping track of which tasks are ready for execution (ready_list) upon
+// submission to a queue we avoid the need to walk the task list again and
+// instead only touch the waiting tasks during construction and as they are made
+// ready, avoiding needless work and cache thrashing.
+//
+// Waiting tasks (waiting_list) are those waiting on external dependencies such
+// as file descriptor wait handles. Because we track all of these the executor
+// can perform an efficient multi-wait across queues without needing to block
+// (or even check) every waiting task individually.
+//
+// Because we only track roots of the DAG to release all tasks in a submission
+// early (due to failure or shutdown) the DAG must be walked. Releasing just the
+// lists will only handle the roots and leave all the rest of the tasks
+// dangling.
+//
+// Thread-compatible; designed to be used from a single thread producing the
+// submission.
+typedef struct {
+ // List of tasks that are ready for execution immediately. Upon submission to
+ // a queue the tasks will be passed on to the executor with no delay.
+ //
+ // Tasks are stored in LIFO order; this allows us to quickly concat them with
+ // incoming/mailbox slists that are naturally in LIFO order and that may
+ // contain tasks from prior submissions. Note that we are representing a
+ // ready list - meaning that all tasks are able to start simultaneously (in
+ // the best case where tasks <= workers); this means that the ordering
+ // requirements here are purely for performance and ease of debugging. In
+ // cases where tasks >> workers we could also see some benefits from the
+ // eventual FIFO order matching how the tasks were allocated.
+ iree_task_list_t ready_list;
+
+ // List of tasks that are waiting for execution on external dependencies.
+ // These are root tasks that have no internal task dependencies.
+ // Order is not important here; the assumption is that all waiting tasks are
+ // more of a set than an ordered list and that they can all be waited on as a
+ // multi-wait-any.
+ iree_task_list_t waiting_list;
+} iree_task_submission_t;
+
+// Initializes a task submission.
+void iree_task_submission_initialize(iree_task_submission_t* out_submission);
+
+// Flushes the given |ready_slist| and initializes |out_submission| with all of
+// its tasks in LIFO order. All tasks in |ready_slist| are assumed to be ready
+// for execution immediately.
+void iree_task_submission_initialize_from_lifo_slist(
+ iree_atomic_task_slist_t* ready_slist,
+ iree_task_submission_t* out_submission);
+
+// Resets the submission by dropping the list references.
+void iree_task_submission_reset(iree_task_submission_t* submission);
+
+// Discards all pending tasks in the submission. This is only safe to call if
+// the submission has not yet been submitted to a queue for execution and should
+// be used for failure cleanup during submission construction.
+void iree_task_submission_discard(iree_task_submission_t* submission);
+
+// Returns true if the submission has no tasks.
+bool iree_task_submission_is_empty(iree_task_submission_t* submission);
+
+// Enqueues |task| to the pending |submission|.
+// The task will be checked to see whether it is immediately ready to execute
+// and placed in an appropriate list; all dependencies must be declared prior to
+// calling this method. After returning new tasks that depend on this task may
+// still be defined. The submission takes ownership of the |task|.
+void iree_task_submission_enqueue(iree_task_submission_t* submission,
+ iree_task_t* task);
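+
+// Example (an illustrative sketch; |scope| is a previously initialized
+// iree_task_scope_t and the final hand-off goes through the executor submit
+// API declared in executor.h rather than anything in this header):
+//   iree_task_submission_t submission;
+//   iree_task_submission_initialize(&submission);
+//   iree_task_nop_t nop;
+//   iree_task_nop_initialize(scope, &nop);
+//   iree_task_submission_enqueue(&submission, &nop.header);
+//   // ... enqueue any other root tasks, then submit to the executor ...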
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_SUBMISSION_H_
diff --git a/iree/task/task.c b/iree/task/task.c
new file mode 100644
index 0000000..38c708b
--- /dev/null
+++ b/iree/task/task.c
@@ -0,0 +1,793 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/task.h"
+
+#include <stdio.h>
+
+#include "iree/task/task_impl.h"
+
+//==============================================================================
+// Task bookkeeping
+//==============================================================================
+
+void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
+ iree_task_t* out_task) {
+ // NOTE: only clears the header, not the task body.
+ memset(out_task, 0, sizeof(*out_task));
+ out_task->scope = scope;
+ out_task->affinity_set = iree_task_affinity_for_any_worker();
+ out_task->type = type;
+}
+
+void iree_task_set_completion_task(iree_task_t* task,
+ iree_task_t* completion_task) {
+ assert(!task->completion_task);
+ task->completion_task = completion_task;
+ iree_atomic_fetch_add_int32(&completion_task->pending_dependency_count, 1,
+ iree_memory_order_seq_cst);
+}
+
+bool iree_task_is_ready(iree_task_t* task) {
+ if (iree_atomic_load_int32(&task->pending_dependency_count,
+ iree_memory_order_relaxed) > 0) {
+ // At least one dependency is still pending.
+ return false;
+ }
+ return true;
+}
+
+void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist) {
+ // NOTE: we always try adding to the head of the discard_worklist so that
+ // we hopefully get some locality benefits. This models a DFS discard in
+ // our non-recursive approach.
+
+ // Almost all tasks will have a completion task; some may have additional
+ // dependent tasks (like barriers) that will be handled below.
+ if (task->completion_task) {
+ iree_task_list_push_front(discard_worklist, task->completion_task);
+ }
+
+ switch (task->type) {
+ default:
+ case IREE_TASK_TYPE_NOP:
+ case IREE_TASK_TYPE_CALL:
+ break;
+ case IREE_TASK_TYPE_BARRIER: {
+ iree_task_barrier_t* barrier_task = (iree_task_barrier_t*)task;
+ for (uint32_t i = 0; i < barrier_task->dependent_task_count; ++i) {
+ iree_task_list_push_front(discard_worklist,
+ barrier_task->dependent_tasks[i]);
+ }
+ break;
+ }
+ case IREE_TASK_TYPE_FENCE: {
+ // TODO(benvanik): signal as error.
+ // iree_task_fence_t* fence_task = (iree_task_fence_t*)task;
+ iree_atomic_fetch_sub_int32(&task->scope->pending_submissions, 1,
+ iree_memory_order_relaxed);
+ break;
+ }
+ case IREE_TASK_TYPE_WAIT:
+ case IREE_TASK_TYPE_DISPATCH:
+ case IREE_TASK_TYPE_DISPATCH_SLICE:
+ break;
+ }
+
+ // Release the task back to the pool it was allocated from, if any.
+ // Some tasks are allocated from arenas and may not be able to be freed
+ // individually.
+ if (task->pool) {
+ iree_task_pool_release(task->pool, task);
+ }
+
+ // NOTE: task is invalidated here and cannot be used!
+}
+
+static void iree_task_retire(iree_task_t* task,
+ iree_task_submission_t* pending_submission) {
+ // Decrement the pending count on the completion task, if any.
+ iree_task_t* completion_task = task->completion_task;
+ if (completion_task &&
+ iree_atomic_fetch_sub_int32(&completion_task->pending_dependency_count, 1,
+ iree_memory_order_acq_rel) == 1) {
+ // This was the last pending dependency; the completion task is now ready.
+ iree_task_submission_enqueue(pending_submission, completion_task);
+ }
+ task->completion_task = NULL;
+
+ // Return the task to the pool it was allocated from.
+ // Some tasks are allocated as part of arenas/ringbuffers and won't have a
+ // pool as they'll be cleaned up as part of a larger operation.
+ if (task->pool) {
+ iree_task_pool_release(task->pool, task);
+ }
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+void iree_task_nop_initialize(iree_task_scope_t* scope,
+ iree_task_nop_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_NOP, scope, &out_task->header);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+void iree_task_call_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ iree_task_call_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_CALL, scope, &out_task->header);
+ out_task->closure = closure;
+}
+
+iree_status_t iree_task_call_execute(
+ iree_task_call_t* task, iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_status_t status =
+ task->closure.fn(task->closure.user_context, /*task_context=*/0);
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+void iree_task_barrier_initialize(iree_task_scope_t* scope,
+ iree_host_size_t dependent_task_count,
+ iree_task_t* const* dependent_tasks,
+ iree_task_barrier_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_BARRIER, scope, &out_task->header);
+ out_task->dependent_task_count = dependent_task_count;
+ out_task->dependent_tasks = dependent_tasks;
+ for (iree_host_size_t i = 0; i < out_task->dependent_task_count; ++i) {
+ iree_task_t* dependent_task = out_task->dependent_tasks[i];
+ iree_atomic_fetch_add_int32(&dependent_task->pending_dependency_count, 1,
+ iree_memory_order_relaxed);
+ }
+}
+
+void iree_task_barrier_retire(iree_task_barrier_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // NOTE: we walk in reverse so that we enqueue in LIFO order.
+ for (iree_host_size_t i = 0; i < task->dependent_task_count; ++i) {
+ iree_task_t* dependent_task =
+ task->dependent_tasks[task->dependent_task_count - i - 1];
+ if (iree_atomic_fetch_sub_int32(&dependent_task->pending_dependency_count,
+ 1, iree_memory_order_acq_rel) == 1) {
+ // This was the last pending dependency; the dependent task is now ready.
+ iree_task_submission_enqueue(pending_submission, dependent_task);
+ }
+ }
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+void iree_task_fence_initialize(iree_task_scope_t* scope,
+ iree_task_fence_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_FENCE, scope, &out_task->header);
+ iree_atomic_fetch_add_int32(&scope->pending_submissions, 1,
+ iree_memory_order_relaxed);
+}
+
+void iree_task_fence_retire(iree_task_fence_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_scope_t* scope = task->header.scope;
+ if (iree_atomic_fetch_sub_int32(&scope->pending_submissions, 1,
+ iree_memory_order_acq_rel) == 1) {
+ // All submissions have completed in this scope - notify any waiters.
+ iree_notification_post(&scope->idle_notification, IREE_ALL_WAITERS);
+ }
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+void iree_task_wait_initialize(iree_task_scope_t* scope,
+ iree_wait_handle_t wait_handle,
+ iree_task_wait_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_WAIT, scope, &out_task->header);
+ out_task->wait_handle = wait_handle;
+}
+
+bool iree_task_wait_check_condition(iree_task_wait_t* task) {
+ // TODO(benvanik): conditions.
+ task->header.flags |= IREE_TASK_FLAG_WAIT_COMPLETED;
+ return true;
+}
+
+void iree_task_wait_retire(iree_task_wait_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ // TODO(benvanik): allow deinit'ing the wait handle (if transient).
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_* utilities
+//==============================================================================
+
+// Returns an XXBBGGRR color (red in the lowest bits).
+// Must not be 0 (tracy will ignore).
+static uint32_t iree_task_tile_to_color(
+ const iree_task_tile_context_t* tile_context);
+
+#if defined(IREE_TASK_TRACING_PER_TILE_COLORS)
+
+// TODO(#4017): optimize this to compute entire slices at once and fold in the
+// work grid location code.
+static uint32_t iree_math_hsv_to_xrgb(const uint8_t h, const uint8_t s,
+ const uint8_t v) {
+ // NOTE: this matches tracy's TracyColor.cpp implementation so that our
+ // colors fit nicely in the UI.
+ const uint8_t reg = h / 43;
+ const uint8_t rem = (h - (reg * 43)) * 6;
+ const uint8_t p = (v * (255 - s)) >> 8;
+ const uint8_t q = (v * (255 - ((s * rem) >> 8))) >> 8;
+ const uint8_t t = (v * (255 - ((s * (255 - rem)) >> 8))) >> 8;
+
+ // clang-format off
+ uint8_t r, g, b;
+ switch (reg) {
+ case 0: r = v; g = t; b = p; break;
+ case 1: r = q; g = v; b = p; break;
+ case 2: r = p; g = v; b = t; break;
+ case 3: r = p; g = q; b = v; break;
+ case 4: r = t; g = p; b = v; break;
+ default: r = v; g = p; b = q; break;
+ }
+ // clang-format on
+
+ uint32_t xrgb = (r << 16) | (g << 8) | b;
+ xrgb |= (xrgb ? 0 : 1); // ensure never zero
+ return xrgb;
+}
+
+static uint32_t iree_task_tile_to_color(
+ const iree_task_tile_context_t* tile_context) {
+ // TODO(#4017): optimize such that it's always on when tracing is
+ // enabled by amortizing the cost across the entire slice.
+
+ // Picked to try to make it easy to see gradients from tiles along the same x,
+ // y, and z (in that order). x is the fastest changing dimension and as such
+ // should all have the same hue, while z is the slowest changing dimension and
+ // should have different hues.
+ uint8_t h = (tile_context->workgroup_xyz[1] /
+ (float)(tile_context->workgroup_count[1])) *
+ 255;
+ h = (h * 11400714819323198485ull) & 0xFF;
+ uint8_t s = 100 - (tile_context->workgroup_xyz[2] /
+ (float)(tile_context->workgroup_count[2])) *
+ 100;
+ uint8_t v = (tile_context->workgroup_xyz[0] /
+ (float)(tile_context->workgroup_count[0])) *
+ 50 +
+ 50;
+ return iree_math_hsv_to_xrgb(h, s, v);
+}
+
+#else
+
+static uint32_t iree_task_tile_to_color(
+ const iree_task_tile_context_t* tile_context) {
+ return 0; // use default tracy colors
+}
+
+#endif // IREE_TASK_TRACING_PER_TILE_COLORS
+
+void iree_task_dispatch_statistics_merge(
+ const iree_task_dispatch_statistics_t* source,
+ iree_task_dispatch_statistics_t* target) {
+ // TODO(benvanik): statistics.
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+static void iree_task_dispatch_initialize_base(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ iree_task_dispatch_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_DISPATCH, scope, &out_task->header);
+ out_task->closure = closure;
+ memcpy(out_task->workgroup_size, workgroup_size,
+ sizeof(out_task->workgroup_size));
+ out_task->shared_memory_size = 0;
+ memset(&out_task->statistics, 0, sizeof(out_task->statistics));
+}
+
+void iree_task_dispatch_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_t* out_task) {
+ iree_task_dispatch_initialize_base(scope, closure, workgroup_size, out_task);
+ memcpy(out_task->workgroup_count.value, workgroup_count,
+ sizeof(out_task->workgroup_count.value));
+}
+
+void iree_task_dispatch_initialize_indirect(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t* workgroup_count_ptr,
+ iree_task_dispatch_t* out_task) {
+ iree_task_dispatch_initialize_base(scope, closure, workgroup_size, out_task);
+ out_task->header.flags |= IREE_TASK_FLAG_DISPATCH_INDIRECT;
+ out_task->workgroup_count.ptr = workgroup_count_ptr;
+}
+
+void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
+ iree_task_pool_t* slice_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Mark the dispatch as having been issued; the next time it retires it'll be
+ // because all work has completed.
+ dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
+
+ // Fetch the workgroup count (directly or indirectly).
+ // Since the task is ready to execute we know that any dependencies on the
+ // indirection buffer have been satisfied and it's safe to read.
+ uint32_t workgroup_count[3];
+ if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
+ memcpy(workgroup_count, dispatch_task->workgroup_count.ptr,
+ sizeof(workgroup_count));
+ } else {
+ memcpy(workgroup_count, dispatch_task->workgroup_count.value,
+ sizeof(workgroup_count));
+ }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+ char xyz_string[32];
+ int xyz_string_length =
+ snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
+ workgroup_count[0], workgroup_count[1], workgroup_count[2]);
+ IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+ // Divide up all tiles into slices, our finest-granularity scheduling task.
+ const uint32_t tiles_per_slice_x = IREE_TASK_DISPATCH_TILES_PER_SLICE_X;
+ const uint32_t tiles_per_slice_y = IREE_TASK_DISPATCH_TILES_PER_SLICE_Y;
+ const uint32_t tiles_per_slice_z = IREE_TASK_DISPATCH_TILES_PER_SLICE_Z;
+ uint32_t slice_count_x = iree_max(1, workgroup_count[0] / tiles_per_slice_x);
+ uint32_t slice_count_y = iree_max(1, workgroup_count[1] / tiles_per_slice_y);
+ uint32_t slice_count_z = iree_max(1, workgroup_count[2] / tiles_per_slice_z);
+
+ // Compute how many slices each worker will process.
+ uint32_t slice_count = slice_count_x * slice_count_y * slice_count_z;
+ iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
+ uint32_t slices_per_worker = iree_max(1, slice_count / worker_count);
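+ // E.g. (illustrative numbers only): a 128x8x1 grid with 16x2x1 tiles per
+ // slice yields 8x4x1 = 32 slices, and with 8 workers each worker is posted
+ // 32/8 = 4 slices.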
+
+ // Randomize starting worker.
+ iree_host_size_t worker_offset = iree_task_post_batch_select_worker(
+ post_batch, dispatch_task->header.affinity_set);
+ iree_host_size_t worker_index = worker_offset;
+
+ // TODO(benvanik): rework this with some science. For now we just iteratively
+ // divide up the space from outer->inner scheduling dimension, but ideally
+ // we'd use some fun cray-style torus scheduling or hilbert curve magic to
+ // try to ensure better locality using worker constructive sharing masks.
+ // TODO(benvanik): observe affinity_set here when dividing ranges.
+ iree_host_size_t worker_slice_count = 0;
+ for (uint32_t slice_z = 0; slice_z < slice_count_z; ++slice_z) {
+ for (uint32_t slice_y = 0; slice_y < slice_count_y; ++slice_y) {
+ for (uint32_t slice_x = 0; slice_x < slice_count_x; ++slice_x) {
+ uint32_t workgroup_base[3];
+ workgroup_base[0] = slice_x * tiles_per_slice_x;
+ workgroup_base[1] = slice_y * tiles_per_slice_y;
+ workgroup_base[2] = slice_z * tiles_per_slice_z;
+ uint32_t workgroup_range[3];
+ workgroup_range[0] = iree_min(
+ workgroup_count[0], workgroup_base[0] + tiles_per_slice_x) - 1;
+ workgroup_range[1] = iree_min(
+ workgroup_count[1], workgroup_base[1] + tiles_per_slice_y) - 1;
+ workgroup_range[2] = iree_min(
+ workgroup_count[2], workgroup_base[2] + tiles_per_slice_z) - 1;
+
+ // Allocate and initialize the slice.
+ iree_task_dispatch_slice_t* slice_task =
+ iree_task_dispatch_slice_allocate(dispatch_task, workgroup_base,
+ workgroup_range, workgroup_count,
+ slice_task_pool);
+
+ // Enqueue on the worker selected for the task.
+ iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
+ &slice_task->header);
+ if (++worker_slice_count >= slices_per_worker) {
+ ++worker_index;
+ worker_slice_count = 0;
+ }
+ }
+ }
+ }
+
+ // NOTE: the dispatch is not retired until all slices complete. Upon the last
+ // slice completing the lucky worker will retire the task inline and
+ // potentially queue up more ready tasks that follow.
+ //
+ // The gotcha here is that it's possible for there to be zero slices within
+ // a dispatch (if, for example, an indirect dispatch had its workgroup counts
+ // set to zero to prevent it from running). We check for that here.
+ if (slice_count == 0) {
+ iree_task_dispatch_retire(dispatch_task, pending_submission);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_dispatch_issue_sharded(
+ iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Mark the dispatch as having been issued; the next time it retires it'll be
+ // because all work has completed.
+ dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
+
+ iree_task_dispatch_shard_state_t* shared_state =
+ &dispatch_task->shared.shard_state;
+ shared_state->dispatch_task = dispatch_task;
+
+ // Fetch the workgroup count (directly or indirectly).
+ // Since the task is ready to execute we know that any dependencies on the
+ // indirection buffer have been satisfied and it's safe to read.
+ if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
+ memcpy(shared_state->workgroup_count, dispatch_task->workgroup_count.ptr,
+ sizeof(shared_state->workgroup_count));
+ } else {
+ memcpy(shared_state->workgroup_count, dispatch_task->workgroup_count.value,
+ sizeof(shared_state->workgroup_count));
+ }
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+ char xyz_string[32];
+ int xyz_string_length = snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string),
+ "%ux%ux%u", shared_state->workgroup_count[0],
+ shared_state->workgroup_count[1],
+ shared_state->workgroup_count[2]);
+ IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+
+ // TODO(benvanik): shared memory; likely pulled from a ringbuffer.
+ // We'll have to ensure we have the memory available prior to scheduling the
+ // dispatch and probably just pass it in as an argument in here.
+ shared_state->shared_memory = iree_make_byte_span(NULL, 0);
+
+ // Setup the iteration space for shards to pull work from the complete grid.
+ iree_atomic_store_int32(&shared_state->tile_index, 0,
+ iree_memory_order_relaxed);
+ shared_state->tile_count = shared_state->workgroup_count[0] *
+ shared_state->workgroup_count[1] *
+ shared_state->workgroup_count[2];
+
+ // Compute shard count - almost always worker_count unless we are a very small
+ // dispatch (1x1x1, etc).
+ iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
+ iree_host_size_t shard_count =
+ iree_min(shared_state->tile_count, worker_count);
+
+ // Compute how many tiles we want each shard to reserve at a time from the
+ // larger grid. A higher number reduces overhead and improves locality while
+ // a lower number reduces maximum worst-case latency (finer-grained stealing).
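+ // E.g. (illustrative numbers only): with 8 workers and a max reservation of
+ // 8 tiles, a 4096-tile grid reserves 8 tiles at a time while a 32-tile grid
+ // (under the 8*8=64 threshold) reserves 1 so that all workers participate.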
+ if (shared_state->tile_count <
+ worker_count * IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION) {
+ // Grid is small - allow it to be eagerly sliced up.
+ shared_state->tiles_per_reservation = 1;
+ } else {
+ shared_state->tiles_per_reservation =
+ IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION;
+ }
+
+ // Randomize starting worker.
+ iree_host_size_t worker_offset = iree_task_post_batch_select_worker(
+ post_batch, dispatch_task->header.affinity_set);
+ iree_host_size_t worker_index = worker_offset;
+
+ for (iree_host_size_t i = 0; i < shard_count; ++i) {
+ // Allocate and initialize the shard.
+ iree_task_dispatch_shard_t* shard_task = iree_task_dispatch_shard_allocate(
+ dispatch_task, shared_state, shard_task_pool);
+
+ // Enqueue on the worker selected for the task.
+ iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
+ &shard_task->header);
+ ++worker_index;
+ }
+
+ // NOTE: the dispatch is not retired until all shards complete. Upon the last
+ // shard completing the lucky worker will retire the task inline and
+ // potentially queue up more ready tasks that follow.
+ //
+ // The gotcha here is that it's possible for there to be zero shards within
+ // a dispatch (if, for example, an indirect dispatch had its workgroup counts
+ // set to zero to prevent it from running). We check for that here.
+ if (shard_count == 0) {
+ iree_task_dispatch_retire(dispatch_task, pending_submission);
+ }
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // TODO(benvanik): attach statistics to the tracy zone.
+
+ // Merge the statistics from the dispatch into the scope so we can track all
+ // of the work without tracking all the dispatches at a global level.
+ iree_task_dispatch_statistics_merge(
+ &dispatch_task->statistics,
+ &dispatch_task->header.scope->dispatch_statistics);
+
+ iree_task_retire(&dispatch_task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SLICE
+//==============================================================================
+
+void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
+ const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_slice_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SLICE,
+ dispatch_task->header.scope, &out_task->header);
+ iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
+ out_task->closure = dispatch_task->closure;
+
+ memcpy(out_task->workgroup_base, workgroup_base,
+ sizeof(out_task->workgroup_base));
+ memcpy(out_task->workgroup_range, workgroup_range,
+ sizeof(out_task->workgroup_range));
+ memcpy(out_task->workgroup_size, dispatch_task->workgroup_size,
+ sizeof(out_task->workgroup_size));
+ memcpy(out_task->workgroup_count, workgroup_count,
+ sizeof(out_task->workgroup_count));
+
+ // TODO(benvanik): shared memory; likely pulled from a ringbuffer.
+ // We'll have to ensure we have the memory available prior to scheduling the
+ // dispatch and probably just pass it in as an argument in here.
+ out_task->shared_memory = iree_make_byte_span(NULL, 0);
+
+ // Wire up dispatch statistics; we'll track on the slice while we run and
+ // then the per-slice statistics will roll up into the dispatch statistics.
+ out_task->dispatch_statistics = &dispatch_task->statistics;
+ memset(&out_task->slice_statistics, 0, sizeof(out_task->slice_statistics));
+}
+
+iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
+ iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
+ iree_task_pool_t* slice_task_pool) {
+ iree_task_dispatch_slice_t* slice_task = NULL;
+ iree_status_t status =
+ iree_task_pool_acquire(slice_task_pool, (iree_task_t**)&slice_task);
+ if (!iree_status_is_ok(status)) {
+ iree_status_ignore(status);
+ return NULL;
+ }
+ iree_task_dispatch_slice_initialize(dispatch_task, workgroup_base,
+ workgroup_range, workgroup_count,
+ slice_task);
+ slice_task->header.pool = slice_task_pool;
+ return slice_task;
+}
+
+iree_status_t iree_task_dispatch_slice_execute(
+ iree_task_dispatch_slice_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // TODO(benvanik): coroutine support. Ideally this function can be called
+ // multiple times for the same slice, and we'll have a way to ready up the
+ // slices on the same workers (some per-worker suspended list?).
+
+ // Prepare context shared for all tiles in the slice.
+ iree_task_tile_context_t tile_context;
+ memcpy(&tile_context.workgroup_size, task->workgroup_size,
+ sizeof(tile_context.workgroup_size));
+ memcpy(&tile_context.workgroup_count, task->workgroup_count,
+ sizeof(tile_context.workgroup_count));
+ tile_context.shared_memory = task->shared_memory;
+ tile_context.statistics = &task->slice_statistics;
+
+ const uint32_t base_x = task->workgroup_base[0];
+ const uint32_t base_y = task->workgroup_base[1];
+ const uint32_t base_z = task->workgroup_base[2];
+ const uint32_t range_x = task->workgroup_range[0];
+ const uint32_t range_y = task->workgroup_range[1];
+ const uint32_t range_z = task->workgroup_range[2];
+ for (uint32_t z = base_z; z <= range_z; ++z) {
+ tile_context.workgroup_xyz[2] = z;
+ for (uint32_t y = base_y; y <= range_y; ++y) {
+ tile_context.workgroup_xyz[1] = y;
+ for (uint32_t x = base_x; x <= range_x; ++x) {
+ tile_context.workgroup_xyz[0] = x;
+ IREE_TRACE_ZONE_BEGIN_NAMED(z_tile,
+ "iree_task_dispatch_slice_execute_tile");
+ IREE_TRACE_ZONE_SET_COLOR(z_tile,
+ iree_task_tile_to_color(&tile_context));
+
+ // NOTE: these are useful for debugging but dramatically increase our
+ // cost here; only enable if needed for tracking work distribution:
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, x);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, y);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, z);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
+
+ iree_status_t status = task->closure.fn(task->closure.user_context,
+ (uintptr_t)&tile_context);
+
+ IREE_TRACE_ZONE_END(z_tile);
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+ // NOTE: we don't bother to update statistics here on failure as the
+ // partial results won't really help much.
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+ }
+ }
+ }
+ }
+
+ // Push aggregate statistics up to the dispatch.
+ if (task->dispatch_statistics) {
+ iree_task_dispatch_statistics_merge(&task->slice_statistics,
+ task->dispatch_statistics);
+ }
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+void iree_task_dispatch_shard_initialize(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_dispatch_shard_t* out_task) {
+ iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SHARD,
+ dispatch_task->header.scope, &out_task->header);
+ iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
+ out_task->shared_state = shared_state;
+}
+
+iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_pool_t* shard_task_pool) {
+ iree_task_dispatch_shard_t* shard_task = NULL;
+ iree_status_t status =
+ iree_task_pool_acquire(shard_task_pool, (iree_task_t**)&shard_task);
+ if (!iree_status_is_ok(status)) {
+ iree_status_ignore(status);
+ return NULL;
+ }
+ iree_task_dispatch_shard_initialize(dispatch_task, shared_state, shard_task);
+ shard_task->header.pool = shard_task_pool;
+ return shard_task;
+}
+
+iree_status_t iree_task_dispatch_shard_execute(
+ iree_task_dispatch_shard_t* task,
+ iree_task_submission_t* pending_submission) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_dispatch_shard_state_t* shared_state = task->shared_state;
+ iree_task_dispatch_t* dispatch_task = shared_state->dispatch_task;
+
+ // Prepare context shared for all tiles in the shard.
+ iree_task_tile_context_t tile_context;
+ memcpy(&tile_context.workgroup_size, dispatch_task->workgroup_size,
+ sizeof(tile_context.workgroup_size));
+ memcpy(&tile_context.workgroup_count, task->shared_state->workgroup_count,
+ sizeof(tile_context.workgroup_count));
+ tile_context.shared_memory = shared_state->shared_memory;
+ uint32_t workgroup_count_x = tile_context.workgroup_count[0];
+ uint32_t workgroup_count_y = tile_context.workgroup_count[1];
+
+ // We perform all our shard statistics work locally here and only push back to
+ // the dispatch at the end; this avoids contention from each shard trying to
+ // update the statistics together.
+ iree_task_dispatch_statistics_t shard_statistics;
+ memset(&shard_statistics, 0, sizeof(shard_statistics));
+ tile_context.statistics = &shard_statistics;
+
+ // Loop over all tiles until they are all processed.
+ const uint32_t tile_count = shared_state->tile_count;
+ const uint32_t tiles_per_reservation = shared_state->tiles_per_reservation;
+ uint32_t tile_base = iree_atomic_fetch_add_int32(&shared_state->tile_index,
+ tiles_per_reservation,
+ iree_memory_order_relaxed);
+ while (tile_base < tile_count) {
+ const uint32_t next_tile_base = iree_atomic_fetch_add_int32(
+ &shared_state->tile_index, tiles_per_reservation,
+ iree_memory_order_relaxed);
+
+ const uint32_t tile_range =
+ iree_min(tile_base + tiles_per_reservation, tile_count);
+ for (uint32_t tile_index = tile_base; tile_index < tile_range;
+ ++tile_index) {
+ // TODO(benvanik): faster math here, especially knowing we pull off N
+ // sequential indices per reservation.
+ uint32_t tile_i = tile_index;
+ tile_context.workgroup_xyz[0] = tile_i % workgroup_count_x;
+ tile_i /= workgroup_count_x;
+ tile_context.workgroup_xyz[1] = tile_i % workgroup_count_y;
+ tile_i /= workgroup_count_y;
+ tile_context.workgroup_xyz[2] = tile_i;
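+ // E.g. in a 4x2x2 grid, tile_index 11 maps to workgroup_xyz (3, 0, 1):
+ // 11 % 4 = 3, 11 / 4 = 2, 2 % 2 = 0, 2 / 2 = 1.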
+
+ IREE_TRACE_ZONE_BEGIN_NAMED(z_tile,
+ "iree_task_dispatch_shard_execute_tile");
+ IREE_TRACE_ZONE_SET_COLOR(z_tile, iree_task_tile_to_color(&tile_context));
+
+ // NOTE: these are useful for debugging but dramatically increase our
+ // cost here; only enable if needed for tracking work distribution:
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[0]);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[1]);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[2]);
+ // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
+
+ iree_status_t status = dispatch_task->closure.fn(
+ dispatch_task->closure.user_context, (uintptr_t)&tile_context);
+
+ IREE_TRACE_ZONE_END(z_tile);
+ if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+ // NOTE: we don't bother to update statistics here on failure as the
+ // partial results won't really help much.
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+ }
+ }
+
+ tile_base = next_tile_base;
+ }
+
+ // Push aggregate statistics up to the dispatch.
+ iree_task_dispatch_statistics_merge(&shard_statistics,
+ &dispatch_task->statistics);
+
+ iree_task_retire(&task->header, pending_submission);
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
diff --git a/iree/task/task.h b/iree/task/task.h
new file mode 100644
index 0000000..35a632f
--- /dev/null
+++ b/iree/task/task.h
@@ -0,0 +1,620 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TASK_H_
+#define IREE_TASK_TASK_H_
+
+#include "iree/base/api.h"
+#include "iree/base/atomic_slist.h"
+#include "iree/base/atomics.h"
+#include "iree/base/synchronization.h"
+#include "iree/base/wait_handle.h"
+#include "iree/task/affinity_set.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+typedef struct iree_task_list_s iree_task_list_t;
+typedef struct iree_task_pool_s iree_task_pool_t;
+typedef struct iree_task_scope_s iree_task_scope_t;
+
+//==============================================================================
+// Function closures
+//==============================================================================
+
+typedef iree_status_t(IREE_API_PTR* iree_task_closure_fn_t)(
+ uintptr_t user_context, uintptr_t task_context);
+
+// A function closure representing the function to call and its arguments.
+typedef struct {
+ // Function called per tile invocation.
+ iree_task_closure_fn_t fn;
+
+ // User-defined argument passed to task functions during invocation.
+ // An opaque pointer-sized value that may point to a user data structure or
+ // contain an embedded value. No lifetime management is performed by the task
+ // system and it is required that users ensure that the memory referenced is
+ // live until after the task has completed.
+ uintptr_t user_context;
+
+ // TODO(benvanik): cleanup function? right now assume arg is never freed.
+} iree_task_closure_t;
+
+// Binds a function pointer and the arguments it should be called with.
+// If the arguments represent pointers they must remain live until the task
+// has completed execution.
+static inline iree_task_closure_t iree_task_make_closure(
+ iree_task_closure_fn_t fn, uintptr_t user_context) {
+ iree_task_closure_t closure = {fn, user_context};
+ return closure;
+}
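+
+// Example (sketch; |my_task_fn| and |my_state_t| are user-defined and not part
+// of the task system):
+//
+//  typedef struct { int counter; } my_state_t;
+//  static iree_status_t my_task_fn(uintptr_t user_context,
+//                                  uintptr_t task_context) {
+//    my_state_t* state = (my_state_t*)user_context;
+//    ++state->counter;
+//    return iree_ok_status();
+//  }
+//  ...
+//  static my_state_t state = {0};
+//  iree_task_closure_t closure =
+//      iree_task_make_closure(my_task_fn, (uintptr_t)&state);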
+
+//==============================================================================
+// Task header for internal tracking
+//==============================================================================
+
+// Specifies the type of a task and how executors handle it.
+enum iree_task_type_e {
+ // Task is a no-op (performs no work) and exists for flexibility.
+ IREE_TASK_TYPE_NOP = 0u,
+
+ // Task will synchronously call a function before continuing.
+ IREE_TASK_TYPE_CALL = 1u,
+
+ // Task exists only as a barrier to join/fork tasks and has no executable
+ // payload.
+ IREE_TASK_TYPE_BARRIER = 2u,
+
+ // Task is a fence indicating that a certain point in the task graph has been
+ // reached. All tasks prior to this fence (by way of happens-before
+ // dependencies) are guaranteed to have retired.
+ IREE_TASK_TYPE_FENCE = 3u,
+
+ // Task is a wait on an external wait handle (fd, HANDLE, etc).
+ // Executors will wait on the handle until it is signaled and meets the
+ // specified condition prior to readying the dependent tasks.
+ IREE_TASK_TYPE_WAIT = 4u,
+
+ // Task is a 3D grid dispatch of zero or more tiles.
+ // Dispatches are issued when ready by either being split into zero or more
+ // slices with one or more tiles each based on the workgroup count or one
+ // shard per worker that should process the dispatch.
+ //
+ // If IREE_TASK_FLAG_DISPATCH_INDIRECT is set then the dispatch reads the
+ // workgroup count from a buffer immediately prior to fan-out instead of using
+ // the values embedded in the task structure.
+ //
+ // After a dispatch has been issued the IREE_TASK_FLAG_DISPATCH_RETIRE flag is
+ // set to indicate that when the dispatch becomes ready again it will be after
+ // all slices have completed.
+ IREE_TASK_TYPE_DISPATCH = 5u,
+
+ // Task is a slice of a larger contiguous dispatch tile range. The full
+ // dispatch will be sliced into zero or more slices and each slice will be
+ // posted to a particular worker for execution. If work progresses unevenly
+ // then entire slices will be stolen across workers to balance out the timing.
+ // Slices retire once they have completed the tiles assigned to them.
+ IREE_TASK_TYPE_DISPATCH_SLICE = 6u,
+
+ // Task is one of potentially many shards processing a larger dispatch grid.
+ // Each shard may have a preference as to which parts of the grid it will
+ // focus on but is able to otherwise steal any available region directly from the
+ // shared dispatch coordination state. Shards retire once there are no more
+ // tiles remaining in the dispatch grid.
+ IREE_TASK_TYPE_DISPATCH_SHARD = 7u,
+};
+typedef uint8_t iree_task_type_t;
+
+enum iree_task_flags_e {
+ // The wait handle the task is specified to wait on has resolved and the task
+ // can now be considered complete.
+ IREE_TASK_FLAG_WAIT_COMPLETED = 1u << 0,
+
+ // The workgroup count for the dispatch is provided by way of a pointer to a
+ // list of 3 uint32_t values that will be sampled immediately prior to
+ // issuing of the dispatch. The contents of the pointer can be safely modified
+ // up until the last dependency has completed and the dispatch is about to be
+ // issued.
+ IREE_TASK_FLAG_DISPATCH_INDIRECT = 1u << 1,
+
+ // The dispatch should be sliced across workers via the low-contention
+ // IREE_TASK_TYPE_DISPATCH_SLICE mechanism. This moves the dispatch overhead
+ // to the time when the grid is sliced in exchange for savings when the grid
+ // is executed.
+ IREE_TASK_FLAG_DISPATCH_SLICED = 1u << 2,
+
+ // The dispatch has been issued and the task is waiting for one or more
+ // slices to complete. After they complete the dispatch will be readied and
+ // can be retired.
+ //
+ // Though added by the executor after issuing a dispatch users can also set
+ // this to indicate that all dispatch slices for a particular dispatch have
+ // been statically scheduled. Executors will then skip issuing the dispatch
+ // and instead wait until all slices complete, enabling IREE_TASK_TYPE_BARRIER
+ // behavior but without an additional task as dispatches are still required
+ // to store information for slices.
+ IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 3,
+};
+typedef uint16_t iree_task_flags_t;
+
+typedef struct iree_task_s iree_task_t;
+
+// A task within the task system that runs on an executor.
+// Tasks have an iree_task_type_t that defines which parameters are valid and
+// how the executor is to treat the task. Dependency edges can be defined that
+// determine the execution order of tasks within the executors.
+struct iree_alignas(iree_max_align_t) iree_task_s {
+ // Intrusive pointer used to store tasks within iree_task_list_t and
+ // iree_atomic_task_list_t singly-linked lists. This must come first in the
+ // structure so that it is at the appropriate alignment.
+ iree_task_t* next_task;
+
+ // The scope this task is attributed to. Errors with the task will be
+ // propagated to the scope and errors in the scope will cause pending tasks to
+ // be skipped.
+ iree_task_scope_t* scope;
+
+ // Optional task that will be notified when the task completes.
+ // The task will have its pending_dependency_count decremented and will be
+ // readied for execution when the count reaches 0.
+ iree_task_t* completion_task;
+
+ // Specifies which workers will be used to execute this task.
+ // Forked tasks will inherit their parent task affinity (possibly with some
+ // task-dependent rules) to partition workloads across workers with knowledge
+ // of the specific work being performed. For example, some dispatches can be
+ // limited to run on certain microarchitectures that workers have affinity
+ // with at the OS scheduler level (such as big.LITTLE topologies).
+ iree_task_affinity_set_t affinity_set;
+
+ // Total number of dependent tasks still outstanding. Decremented each time
+ // a dependent task completes. The task is considered ready to execute when
+ // this value reaches 0.
+ iree_atomic_int32_t pending_dependency_count;
+
+ // Optional pool the task should be returned to after it has resolved. If the
+ // task was allocated as part of a larger data structure (embedded within
+ // an arena for example) then this can be NULL to prevent the task system
+ // from interfering.
+ iree_task_pool_t* pool;
+
+ // Specifies the type of the task and how the executor handles it.
+ iree_task_type_t type;
+
+ // Task-specific flag bits.
+ iree_task_flags_t flags;
+};
+static_assert(offsetof(iree_task_t, next_task) == 0,
+ "next_task intrusive pointer must be at offset 0");
+
+// Initializes a task header with the given type.
+// Must be called on all tasks to ensure proper dependency tracking and list
+// state prior to enqueuing. Only the task header structure is initialized and
+// any additional data as part of the wrapping task type must be initialized by
+// the caller.
+void iree_task_initialize(iree_task_type_t type, iree_task_scope_t* scope,
+ iree_task_t* out_task);
+
+// Sets up a dependency edge from |task| to |completion_task| such that when
+// |task| completes |completion_task| will be notified and have its
+// pending_dependency_count decremented.
+void iree_task_set_completion_task(iree_task_t* task,
+ iree_task_t* completion_task);
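+
+// Example (sketch; assumes |scope| and the closures are provided by the
+// caller):
+//
+//  iree_task_call_t task_a;
+//  iree_task_call_initialize(scope, closure_a, &task_a);
+//  iree_task_call_t task_b;
+//  iree_task_call_initialize(scope, closure_b, &task_b);
+//  // task_b will only be readied once task_a has completed.
+//  iree_task_set_completion_task(&task_a.header, &task_b.header);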
+
+// Returns true if the |task| is ready to execute immediately.
+// Though this is safe to call from any thread the test may have false-negatives
+// (ready tasks are not returned as ready) due to cross-thread synchronization
+// latency. Note that tasks may yield themselves during execution and switch
+// from ready to waiting (such as when an indirect dispatch needs to wait for
+// all tiles to complete).
+bool iree_task_is_ready(iree_task_t* task);
+
+// Discards the task and any dependent tasks.
+// Any dependent tasks that need to be discarded will be added to
+// |discard_worklist| for the caller to continue discarding.
+void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist);
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+// Task is a no-op (performs no work) and exists for flexibility.
+// NOP tasks can be used to link together task lists from multiple threads
+// where it may otherwise not be ideal to have heavy-weight concurrency
+// structures. NOP tasks can also be useful for neutering another task type
+// after it has already been recorded into a list such as when cancellations
+// occur.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+} iree_task_nop_t;
+
+void iree_task_nop_initialize(iree_task_scope_t* scope,
+ iree_task_nop_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+// A task that will synchronously call a function from the executor and wait
+// for it to complete before continuing.
+//
+// Memory referenced by closure arguments must be kept valid until the function
+// executes (in general with the same lifetime as the task itself).
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Function closure to call when the task is executed.
+ iree_task_closure_t closure;
+} iree_task_call_t;
+
+void iree_task_call_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ iree_task_call_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+// A join point for fork/join-style scheduling.
+// References a set of dependent tasks that will be notified and possibly
+// readied when the barrier is reached.
+//
+// This allows for modeling one-to-many and many-to-many relationships. The base
+// task dependency system only models one-to-one and should be used if possible
+// to avoid the additional overhead of a barrier task both in memory and task
+// indirection/queuing.
+//
+// Example:
+// * [A] -> Barrier -> [C, D]
+// - A executes
+// - Barrier is processed after A completes
+// - C and D execute concurrently (in any order)
+//
+// * [A, B] -> Barrier -> [C, D]
+// - A and B execute concurrently (in any order)
+// - Barrier is processed after both A and B complete
+// - C and D execute concurrently
+//
+// * [A] -> Barrier -> [B]
+// - Don't do this and use the base task dependency instead; it'll work, but
+// it's much better to avoid the additional barrier indirection when
+// possible.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Number of valid tasks in the dependent_tasks list.
+ iree_host_size_t dependent_task_count;
+ // [0-dependent_task_count] tasks that will be notified when the barrier is
+ // reached. Each task will have its pending_dependency_count decremented and
+ // when the count reaches 0 be added to the ready list.
+ iree_task_t* const* dependent_tasks;
+} iree_task_barrier_t;
+
+void iree_task_barrier_initialize(iree_task_scope_t* scope,
+ iree_host_size_t dependent_task_count,
+ iree_task_t* const* dependent_tasks,
+ iree_task_barrier_t* out_task);
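+
+// Example (sketch) of the [A, B] -> Barrier -> [C, D] graph above; the call
+// tasks are assumed to have been initialized by the caller and the
+// |dependent_tasks| array must remain live until the barrier retires:
+//
+//  iree_task_t* dependent_tasks[2] = {&task_c.header, &task_d.header};
+//  iree_task_barrier_t barrier_task;
+//  iree_task_barrier_initialize(scope, IREE_ARRAYSIZE(dependent_tasks),
+//                               dependent_tasks, &barrier_task);
+//  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+//  iree_task_set_completion_task(&task_b.header, &barrier_task.header);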
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+// A fence indicating that a certain point in the task graph has been reached.
+// All tasks prior to this fence (by way of happens-before dependencies) are
+// guaranteed to have retired.
+//
+// When all of the dependencies of a fence have retired the fence will notify
+// the parent scope of the task by decrementing the pending_submissions count
+// and publishing an idle_notification if it was the last in-flight submission.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // TODO(benvanik): user-defined fence data for semaphore signaling. Optional
+ // wait_handle to signal?
+} iree_task_fence_t;
+
+void iree_task_fence_initialize(iree_task_scope_t* scope,
+ iree_task_fence_t* out_task);
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+typedef struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // The external wait handle that the task is waiting on.
+ iree_wait_handle_t wait_handle;
+
+ // TODO(benvanik): deadline_ns.
+ // TODO(benvanik): condition (possibly a closure to evaluate) ala condvar.
+} iree_task_wait_t;
+
+void iree_task_wait_initialize(iree_task_scope_t* scope,
+ iree_wait_handle_t wait_handle,
+ iree_task_wait_t* out_task);
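+
+// Example (sketch; |wait_handle| is assumed to have been created elsewhere,
+// e.g. wrapping an OS event, and must remain valid until the task retires):
+//
+//  iree_task_wait_t wait_task;
+//  iree_task_wait_initialize(scope, wait_handle, &wait_task);
+//  // next_task will only be readied once the handle has been signaled.
+//  iree_task_set_completion_task(&wait_task.header, &next_task.header);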
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_* structures
+//==============================================================================
+
+// Statistics tracked across an entire dispatch operation.
+// Each tile contributes to these statistics as they execute to provide an
+// aggregate set of statistics that can be reported to tracing/user queries.
+//
+// We want to keep this structure relatively compact as it does add overhead.
+// If statistics are used purely for interactive tracing then they can be
+// piped directly to the tracing tool using IREE_TRACE_* macros. If the
+// statistics are programmatically queried for benchmarks or reporting then
+// they belong here where we can efficiently move them around.
+//
+// If we find ourselves with a lot of hardware-specific counters (vs more
+// generic ones like 'l2 cache misses' or 'ipc') then we can sprinkle in some
+// #ifdefs.
+typedef struct {
+ // TODO(benvanik): statistics counters.
+ iree_atomic_int32_t reserved;
+} iree_task_dispatch_statistics_t;
+
+// Merges statistics from |source| to |target| atomically per-field.
+// As each field is updated independently and in a relaxed memory order it's
+// possible for statistics consumers to see a tear.
+void iree_task_dispatch_statistics_merge(
+ const iree_task_dispatch_statistics_t* source,
+ iree_task_dispatch_statistics_t* target);
+
+typedef struct {
+ // TODO(benvanik): coroutine storage.
+ // Ideally we'll be able to have a fixed coroutine storage size per dispatch
+ // (via @llvm.coro.size) such that we can preallocate all of the storage for
+ // a dispatch in one shot. If we need to do dynamic allocation we will need a
+ // ringbuffer or other kind of pool to allocate from on-demand.
+ uint32_t reserved;
+} iree_task_tile_storage_t;
+
+// Per-tile context provided to each dispatch function invocation in the grid.
+// This information is unique to the tile being dispatched and may contain
+// specific state about the calling thread/fiber/etc.
+//
+// If tile execution is suspended by hitting a coroutine suspend point then the
+// coroutine state will be stored within the tile context until the tile is
+// resumed.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Workgroup ID for the current invocation.
+ uint32_t workgroup_xyz[3];
+ // Workgroup size for each invocation.
+ uint32_t workgroup_size[3];
+ // Total workgroup count for the task. Can be used in conjunction with the
+ // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
+ uint32_t workgroup_count[3];
+ // TODO(benvanik): workgroup index to amortize calculating linear offsets.
+ // (like gl_GlobalInvocationID)
+
+ // Incoherent memory shared across all invocations of the task.
+ // Aligned to at least the natural pointer size of the machine. Functions must
+ // use atomic operations to ensure proper memory ordering.
+ iree_byte_span_t shared_memory;
+
+ // Shared statistics counters for the dispatch slice.
+ iree_task_dispatch_statistics_t* statistics;
+
+ // TODO(benvanik): cpuid uarch.
+ // TODO(benvanik): per-tile coroutine storage.
+} iree_task_tile_context_t;
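+
+// Example (sketch) of a tile closure consuming this context; the linear index
+// math is illustrative and not provided by the task system:
+//
+//  static iree_status_t my_tile_fn(uintptr_t user_context,
+//                                  uintptr_t task_context) {
+//    const iree_task_tile_context_t* tile_context =
+//        (const iree_task_tile_context_t*)task_context;
+//    uint32_t linear_index =
+//        (tile_context->workgroup_xyz[2] * tile_context->workgroup_count[1] +
+//         tile_context->workgroup_xyz[1]) * tile_context->workgroup_count[0] +
+//        tile_context->workgroup_xyz[0];
+//    // ... process the tile identified by linear_index ...
+//    return iree_ok_status();
+//  }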
+
+typedef struct iree_task_dispatch_s iree_task_dispatch_t;
+
+// Shared state for all shards processing a dispatch.
+typedef iree_alignas(iree_max_align_t) struct {
+ // Direct reference to the parent dispatch that all shards are processing.
+ iree_task_dispatch_t* dispatch_task;
+
+ // The tail tile index; the next reservation will start from here.
+ iree_atomic_int32_t tile_index;
+
+ // The total number of tiles in the dispatch bounding tile_index.
+ uint32_t tile_count;
+
+ // Maximum number of tiles to fetch per tile reservation from the grid.
+ // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
+ // reasonable number chosen based on the tile and shard counts.
+ uint32_t tiles_per_reservation;
+
+ // Total workgroup count for the task. Can be used in conjunction with the
+ // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
+ uint32_t workgroup_count[3];
+
+ // Incoherent memory shared across all invocations of the task.
+ // Aligned to at least the natural pointer size of the machine. Functions must
+ // use atomic operations to ensure proper memory ordering.
+ iree_byte_span_t shared_memory;
+} iree_task_dispatch_shard_state_t;
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+// An execution request across a tiled grid.
+// Dispatches are fork points where zero or more dispatch slice tasks are
+// spawned and processed prior to joining again on the dispatch completion task.
+//
+// The total workgroup count indicates the [x,y,z] extents of the dispatch grid.
+// The count may either be embedded directly into the dispatch or provided as a
+// pointer to the workgroup_count[3] that will be read immediately prior to
+// forking. If any dimension of the workgroup count is zero then the dispatch is
+// skipped and the completion task will be readied immediately.
+//
+// Example:
+// dispatch([5, 1, 1])
+// forked into slices based on affinity/scheduling parameters:
+// -> dispatch_slice([0-1, 1, 1])
+// -> dispatch_slice([2-3, 1, 1])
+// -> dispatch_slice([4, 1, 1])
+// completion_task run after all slices complete
+typedef iree_alignas(iree_max_align_t) struct iree_task_dispatch_s {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Function closure to call per tile.
+ iree_task_closure_t closure;
+
+ // Workgroup size for each invocation. Passed on to tiles without
+ // modification and not used for scheduling.
+ uint32_t workgroup_size[3];
+
+ // 3D workgroup count used to tile the dispatch.
+ // [1,1,1] specifies a single invocation of the function. A value of 0 in
+ // any dimension will skip execution of the function.
+ union {
+ // Embedded immutable 3D workgroup count value.
+ uint32_t value[3];
+ // Pointer to the uint32_t[3] containing the 3D workgroup count.
+ // Sampled immediately prior to execution.
+ const uint32_t* ptr;
+ } workgroup_count;
+
+ // Optional transient shared memory size to allocate and pass into the
+ // iree_task_tile_context_t::shared_memory of each invocation of the task
+ // closure.
+ iree_host_size_t shared_memory_size;
+
+ // Statistics storage used for aggregating counters across all slices.
+ iree_task_dispatch_statistics_t statistics;
+
+ // Shared state across all slices/shards/etc.
+ // Stored once per dispatch and then referenced by all subtasks.
+ union {
+ iree_task_dispatch_shard_state_t shard_state;
+ } shared;
+} iree_task_dispatch_t;
+
+void iree_task_dispatch_initialize(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_t* out_task);
+
+void iree_task_dispatch_initialize_indirect(iree_task_scope_t* scope,
+ iree_task_closure_t closure,
+ const uint32_t workgroup_size[3],
+ const uint32_t* workgroup_count_ptr,
+ iree_task_dispatch_t* out_task);
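+
+// Example (sketch; the sizes are arbitrary and |my_tile_fn|/|user_context| are
+// assumed to be defined by the caller):
+//
+//  const uint32_t workgroup_size[3] = {64, 1, 1};
+//  const uint32_t workgroup_count[3] = {16, 4, 1};
+//  iree_task_dispatch_t dispatch_task;
+//  iree_task_dispatch_initialize(
+//      scope, iree_task_make_closure(my_tile_fn, user_context),
+//      workgroup_size, workgroup_count, &dispatch_task);
+//  // my_tile_fn will be invoked once per workgroup (16*4*1 = 64 invocations),
+//  // potentially concurrently across workers. The _indirect variant instead
+//  // reads the workgroup count from a pointer immediately prior to issuing.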
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SLICE
+//==============================================================================
+
+// TODO(benvanik): per-region dependencies (allow slices to execute directly
+// across dispatches).
+
+// A slice of tiles within a larger dispatch grid.
+// These tasks are designed to be synthesized by the task system when processing
+// a dispatch task. Based on the workgroup count, affinity settings, and
+// available executor threads zero or more slices are enqueued, executed, and
+// retired as part of the complete dispatch task. The dispatch is only
+// considered completed and subsequent tasks readied once all slices are
+// complete.
+//
+// Slices aggregate statistics from all tiles within them and then upon
+// completion merge those into the shared dispatch statistics. As slices may
+// suspend and resume several times the dispatch-level statistics should only be
+// read once all slices have completed fully.
+//
+// In general slices represent a contiguous range of tiles along the most
+// rapidly changing dimension (x, then y, then z). This ensures that we at least
+// give the opportunity for cache locality to the tiles as they are processed.
+// If work stealing is enabled then slices may shed their trailing tiles to
+// other threads that have completed all of their work (at a cost of power vs.
+// potential latency savings).
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // NOTE: the following fields are mostly replicated from iree_task_dispatch_t.
+ // This removes the need for touching the dispatch struct when beginning a
+ // tile which would likely be a cache miss as we fan out to other cores.
+
+ // Function closure to call per tile (same as the closure in the dispatch).
+ iree_task_closure_t closure;
+
+ // Base workgroup ID for the slice range.
+ uint32_t workgroup_base[3];
+ // Inclusive ending workgroup ID of the slice range.
+ uint32_t workgroup_range[3];
+
+ // Workgroup size for each invocation.
+ uint32_t workgroup_size[3];
+ // Total workgroup count for the task. Can be used in conjunction with the
+ // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
+ uint32_t workgroup_count[3];
+
+ // Incoherent memory shared across all invocations of the task.
+ // Aligned to at least the natural pointer size of the machine. Functions must
+ // use atomic operations to ensure proper memory ordering.
+ iree_byte_span_t shared_memory;
+
+ // Shared statistics counters for the entire dispatch. References the storage
+ // held in the parent iree_task_dispatch_t.
+ iree_task_dispatch_statistics_t* dispatch_statistics;
+ // Statistics just for this single slice. The statistics will be added to the
+ // dispatch_statistics after the slice completes to prevent excessive
+ // contention on the shared dispatch statistics across multiple threads.
+ iree_task_dispatch_statistics_t slice_statistics;
+
+ // Per-tile initialized coroutine storage for all tiles in the range
+ // initialized as each tile begins execution.
+ // TODO(benvanik): coroutine storage as iree_task_tile_storage_t.
+} iree_task_dispatch_slice_t;
+
+// TODO(benvanik): document initialize() for slice pre-planning/embedding.
+// This would be useful to reduce latency when the number of slices is small
+// (~<5) as the dispatch wouldn't need to be issued. This can also be used to
+// implement per-region dependencies as direct slice->slice deps vs. fork/join
+// dispatch->dispatch deps. Show how IREE_TASK_FLAG_DISPATCH_RETIRE is set.
+void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
+ const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3],
+ const uint32_t workgroup_count[3],
+ iree_task_dispatch_slice_t* out_task);
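+
+// Example (sketch) of statically pre-planning a single slice as described by
+// IREE_TASK_FLAG_DISPATCH_RETIRE above; treat this as a rough illustration
+// until the TODO above is resolved:
+//
+//  const uint32_t base[3] = {0, 0, 0};
+//  const uint32_t range[3] = {7, 0, 0};  // inclusive end workgroup IDs
+//  const uint32_t count[3] = {8, 1, 1};
+//  iree_task_dispatch_slice_t slice_task;
+//  iree_task_dispatch_slice_initialize(&dispatch_task, base, range, count,
+//                                      &slice_task);
+//  // Prevent the executor from re-issuing the dispatch; it will instead wait
+//  // for the slice(s) to complete and then retire.
+//  dispatch_task.header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;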
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+typedef iree_alignas(iree_max_align_t) struct {
+ // Task header: implementation detail, do not use.
+ iree_task_t header;
+
+ // Active dispatch progress shared across all shards.
+ // Each shard will be read/modify/writing this and there's likely to be
+ // contention.
+ iree_task_dispatch_shard_state_t* shared_state;
+} iree_task_dispatch_shard_t;
+
+void iree_task_dispatch_shard_initialize(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_dispatch_shard_t* out_task);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TASK_H_
diff --git a/iree/task/task_impl.h b/iree/task/task_impl.h
new file mode 100644
index 0000000..6320998
--- /dev/null
+++ b/iree/task/task_impl.h
@@ -0,0 +1,158 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TASK_IMPL_H_
+#define IREE_TASK_TASK_IMPL_H_
+
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/post_batch.h"
+#include "iree/task/submission.h"
+#include "iree/task/task.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//==============================================================================
+// IREE_TASK_TYPE_NOP
+//==============================================================================
+
+//==============================================================================
+// IREE_TASK_TYPE_CALL
+//==============================================================================
+
+// Executes and retires a user call.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+// Returns the status of the user call.
+iree_status_t iree_task_call_execute(
+ iree_task_call_t* task, iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_BARRIER
+//==============================================================================
+
+// Retires a barrier task by notifying all dependent tasks.
+// May add zero or more tasks to the |pending_submission| if they are ready.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_barrier_retire(iree_task_barrier_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_FENCE
+//==============================================================================
+
+// Retires a fence task by updating the scope state.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_fence_retire(iree_task_fence_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_WAIT
+//==============================================================================
+
+// Returns true if the user-specified condition on the task is true.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+bool iree_task_wait_check_condition(iree_task_wait_t* task);
+
+// Retires a wait when it has completed waiting (successfully or not).
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_wait_retire(iree_task_wait_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH
+//==============================================================================
+
+// Schedules a dispatch by forking out to zero or more slices that will be
+// executed on workers. The slices are allocated from an executor-owned pool
+// and are generally not user-visible - they'll just see their dispatch begin
+// execution prior to the slices and end execution after the last slice
+// finishes.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
+ iree_task_pool_t* slice_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch);
+
+// Schedules a dispatch by forking out to zero or more shards that will be
+// executed on workers. The shards are allocated from an executor-owned pool
+// and are generally not user-visible - they'll just see their dispatch begin
+// execution prior to the shards and end execution after the last shard
+// finishes.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_issue_sharded(
+ iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch);
+
+// Retires a dispatch when all issued slices have completed executing.
+//
+// Only called during coordination and expects the coordinator lock to be held.
+void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SLICE
+//==============================================================================
+
+// Allocates a dispatch slice task from the shared executor task pool.
+// The slice will be released back to the pool when it has completed execution.
+iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
+ iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
+ const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
+ iree_task_pool_t* slice_task_pool);
+
+// Executes and retires a dispatch slice task.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+// Returns ok if all tiles were successfully executed and otherwise returns
+// an unspecified status (probably the first non-ok status hit).
+iree_status_t iree_task_dispatch_slice_execute(
+ iree_task_dispatch_slice_t* task,
+ iree_task_submission_t* pending_submission);
+
+//==============================================================================
+// IREE_TASK_TYPE_DISPATCH_SHARD
+//==============================================================================
+
+// Allocates a dispatch shard task from the shared executor task pool.
+// The shard will be released back to the pool when it has completed execution.
+iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
+ iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_state_t* shared_state,
+ iree_task_pool_t* shard_task_pool);
+
+// Executes and retires a dispatch shard task.
+// May block the caller for an indeterminate amount of time and should only be
+// called from threads owned by or donated to the executor.
+// Returns ok if all tiles processed in the shard successfully executed and
+// otherwise returns an unspecified status (probably the first non-ok status
+// hit).
+iree_status_t iree_task_dispatch_shard_execute(
+ iree_task_dispatch_shard_t* task,
+ iree_task_submission_t* pending_submission);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TASK_IMPL_H_
diff --git a/iree/task/testing/BUILD b/iree/task/testing/BUILD
new file mode 100644
index 0000000..5c32559
--- /dev/null
+++ b/iree/task/testing/BUILD
@@ -0,0 +1,29 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+ licenses = ["notice"], # Apache 2.0
+)
+
+cc_library(
+ name = "test_util",
+ testonly = 1,
+ hdrs = ["test_util.h"],
+ deps = [
+ "//iree/task",
+ "//iree/testing:gtest",
+ ],
+)
diff --git a/iree/task/testing/CMakeLists.txt b/iree/task/testing/CMakeLists.txt
new file mode 100644
index 0000000..5ba9ce9
--- /dev/null
+++ b/iree/task/testing/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+iree_add_all_subdirs()
+
+iree_cc_library(
+ NAME
+ test_util
+ HDRS
+ "test_util.h"
+ DEPS
+ iree::task
+ iree::testing::gtest
+ TESTONLY
+ PUBLIC
+)
diff --git a/iree/task/testing/test_util.h b/iree/task/testing/test_util.h
new file mode 100644
index 0000000..c7ef9a4
--- /dev/null
+++ b/iree/task/testing/test_util.h
@@ -0,0 +1,85 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// NOTE: the best kind of synchronization is no synchronization; always try to
+// design your algorithm so that you don't need anything from this file :)
+// See https://travisdowns.github.io/blog/2020/07/06/concurrency-costs.html
+
+#ifndef IREE_TASK_TESTING_TEST_UTIL_H_
+#define IREE_TASK_TESTING_TEST_UTIL_H_
+
+#include <memory>
+
+#include "iree/task/list.h"
+#include "iree/task/pool.h"
+#include "iree/task/scope.h"
+#include "iree/testing/status_matchers.h"
+
+using TaskPoolPtr =
+ std::unique_ptr<iree_task_pool_t, void (*)(iree_task_pool_t*)>;
+static inline TaskPoolPtr AllocateNopPool() {
+ iree_task_pool_t* pool = new iree_task_pool_t();
+ IREE_CHECK_OK(iree_task_pool_initialize(iree_allocator_system(),
+ sizeof(iree_task_nop_t), 1024, pool));
+ return {pool, [](iree_task_pool_t* pool) {
+ iree_task_pool_deinitialize(pool);
+ delete pool;
+ }};
+}
+
+using TaskScopePtr =
+ std::unique_ptr<iree_task_scope_t, void (*)(iree_task_scope_t*)>;
+static inline TaskScopePtr AllocateScope(const char* name) {
+ iree_task_scope_t* scope = new iree_task_scope_t();
+ iree_task_scope_initialize(iree_make_cstring_view(name), scope);
+ return {scope, [](iree_task_scope_t* scope) {
+ iree_task_scope_deinitialize(scope);
+ delete scope;
+ }};
+}
+
+static inline iree_task_t* AcquireNopTask(TaskPoolPtr& pool,
+ TaskScopePtr& scope, uint16_t value) {
+ iree_task_t* task = NULL;
+ IREE_CHECK_OK(iree_task_pool_acquire(pool.get(), &task));
+ iree_task_initialize(IREE_TASK_TYPE_NOP, scope.get(), task);
+ task->flags = value;
+ return task;
+}
+
+static inline bool CheckListOrderFIFO(iree_task_list_t* list) {
+  iree_task_t* p = list->head;
+  if (!p) return true;
+  uint16_t value = p->flags;
+  p = p->next_task;
+  while (p) {
+    // FIFO order: flags must be strictly increasing from head to tail, so
+    // compare each task against its predecessor rather than just the head.
+    if (p->flags <= value) return false;
+    value = p->flags;
+    p = p->next_task;
+  }
+  return true;
+}
+
+static inline bool CheckListOrderLIFO(iree_task_list_t* list) {
+  iree_task_t* p = list->head;
+  if (!p) return true;
+  uint16_t value = p->flags;
+  p = p->next_task;
+  while (p) {
+    // LIFO order: flags must be strictly decreasing from head to tail, so
+    // compare each task against its predecessor rather than just the head.
+    if (p->flags >= value) return false;
+    value = p->flags;
+    p = p->next_task;
+  }
+  return true;
+}
+
+#endif // IREE_TASK_TESTING_TEST_UTIL_H_
diff --git a/iree/task/topology.c b/iree/task/topology.c
new file mode 100644
index 0000000..ee0061f
--- /dev/null
+++ b/iree/task/topology.c
@@ -0,0 +1,387 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/topology.h"
+
+#include <cpuinfo.h>
+#include <stdio.h>
+
+#include "iree/base/math.h"
+#include "iree/base/tracing.h"
+
+struct iree_task_topology_s {
+ iree_allocator_t allocator;
+ iree_host_size_t group_capacity;
+ iree_host_size_t group_count;
+ iree_task_topology_group_t groups[0];
+};
+
+iree_status_t iree_task_topology_allocate(iree_host_size_t group_capacity,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_host_size_t topology_size =
+ sizeof(iree_task_topology_t) +
+ group_capacity * sizeof(iree_task_topology_group_t);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_IF_ERROR(
+ iree_allocator_malloc(allocator, topology_size, (void**)&topology));
+ topology->allocator = allocator;
+ topology->group_capacity = group_capacity;
+ topology->group_count = 0;
+
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+void iree_task_topology_free(iree_task_topology_t* topology) {
+ if (!topology) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+ iree_allocator_free(topology->allocator, topology);
+ IREE_TRACE_ZONE_END(z0);
+}
+
+iree_status_t iree_task_topology_parse(iree_string_view_t value,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ // TODO(benvanik): define a format that is generally useful alongside cpuinfo.
+ // Maybe colon-separated group-id values from thread affinities? Like:
+ // 0.0:0.2:0.4:0.8 to indicate cores 0,2,4,8 on group 0
+ // 0.0:0.1:1.0:1.1 to indicate cores 0,1 of both groups 0,1
+ // etc
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED);
+}
+
+bool iree_task_topology_format(const iree_task_topology_t* topology,
+ iree_host_size_t buffer_capacity, char* buffer,
+ iree_host_size_t* out_buffer_length) {
+ // TODO(benvanik): formatting to match parsing.
+ return false;
+}
+
+iree_host_size_t iree_task_topology_group_count(
+ const iree_task_topology_t* topology) {
+ return topology->group_count;
+}
+
+const iree_task_topology_group_t* iree_task_topology_get_group(
+ const iree_task_topology_t* topology, iree_host_size_t group_index) {
+ if (group_index >= topology->group_count) return NULL;
+ return &topology->groups[group_index];
+}
+
+iree_status_t iree_task_topology_push_group(
+ iree_task_topology_t* topology, const iree_task_topology_group_t* group) {
+ if (topology->group_count + 1 > topology->group_capacity) {
+ return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "group capacity exceeded");
+ }
+ iree_task_topology_group_t* dst_group =
+ &topology->groups[topology->group_count];
+ memcpy(dst_group, group, sizeof(*group));
+ dst_group->group_index = topology->group_count++;
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_from_group_count(
+ iree_host_size_t group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_allocate(group_count, allocator, &topology));
+
+ for (iree_host_size_t i = 0; i < group_count; ++i) {
+ iree_task_topology_group_t* group = &topology->groups[i];
+ group->group_index = i;
+ snprintf(group->name, IREE_ARRAYSIZE(group->name), "worker[%d]", (int)i);
+ iree_thread_affinity_set_any(&group->ideal_thread_affinity);
+ group->constructive_sharing_mask = IREE_TASK_TOPOLOGY_GROUP_MASK_ALL;
+ }
+ topology->group_count = group_count;
+
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+// Runs the cpuinfo initializer which caches its result on the first call.
+// Returns a failure if cpuinfo does not support the CPU/platform.
+static iree_status_t iree_task_topology_ensure_cpuinfo_available(void) {
+ if (!cpuinfo_initialize()) {
+ return iree_make_status(IREE_STATUS_UNAVAILABLE,
+ "cpuinfo failed to initialize");
+ }
+ return iree_ok_status();
+}
+
+// Returns the core of the calling thread or NULL if not supported.
+// We wrap this here because cpuinfo only returns non-NULL on Linux.
+static const struct cpuinfo_core* iree_task_topology_get_current_core() {
+ const struct cpuinfo_core* current_core = cpuinfo_get_current_core();
+#if defined(IREE_PLATFORM_WINDOWS)
+ // TODO(benvanik): upstream into cpuinfo.
+ if (current_core == NULL) {
+ PROCESSOR_NUMBER processor_number;
+ GetCurrentProcessorNumberEx(&processor_number);
+ uint32_t processor_id =
+ cpuinfo_get_package(processor_number.Group)->processor_start +
+ processor_number.Number;
+ current_core = cpuinfo_get_processor(processor_id)->core;
+ }
+#endif // IREE_PLATFORM_WINDOWS
+ return current_core;
+}
+
+// Returns |core_id| rotated by the calling base core ID.
+// On many systems the kernel will have already assigned a randomized starting
+// core for thread distribution and we can just reuse that.
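+// For example (illustrative): if the calling thread is running on core 3 of an
+// 8-core machine then core_id 0 maps to physical core 4, core_id 1 to core 5,
+// and so on, wrapping back around so that core_id 7 maps to core 3.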
+static uint32_t iree_task_topology_rotate_from_base_core(uint32_t core_id) {
+ const struct cpuinfo_core* current_core =
+ iree_task_topology_get_current_core();
+ if (!current_core) {
+ return core_id; // don't modify if we don't know
+ }
+ uint32_t next_core_id =
+ (current_core->core_id + 1) % cpuinfo_get_cores_count();
+ return (next_core_id + core_id) % cpuinfo_get_cores_count();
+}
+
+// Sets a platform-specific iree_thread_affinity_t based on the cpuinfo
+// processor.
+static void iree_task_topology_set_affinity_from_processor(
+ const struct cpuinfo_processor* processor,
+ iree_thread_affinity_t* out_affinity) {
+ memset(out_affinity, 0, sizeof(*out_affinity));
+ out_affinity->specified = 1;
+
+ // Special bit to indicate that (if required) we want the entire core.
+ if (processor->core->processor_count > 1) {
+ out_affinity->smt = 1;
+ }
+
+ // cpuinfo #ifdefs the fields we need to extract the right platform IDs.
+ // We purposefully use the same exact macros they do there so that we don't
+ // have to worry about skew.
+
+#if defined(__MACH__) && defined(__APPLE__)
+ // TODO(benvanik): run on darwin to see how the l2 caches map. We ideally want
+ // a unique affinity ID per L2 cache.
+ // For now, we just use some random pointer bytes. It's just a tag used by
+ // the kernel to distribute the threads so the exact bits don't matter as long
+ // as they are unique per group we want isolated.
+ out_affinity->id = (uint32_t)(uintptr_t)processor;
+#elif defined(__linux__)
+ out_affinity->id = processor->linux_id;
+#elif defined(_WIN32) || defined(__CYGWIN__)
+ out_affinity->group = processor->windows_group_id;
+ out_affinity->id = processor->windows_processor_id;
+#else
+  // WASM? Unused today.
+ out_affinity->specified = 0;
+#endif // cpuinfo-like platform field
+}
+
+// Returns a bitset with all *processors* that share the same |cache|.
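+// For example, an L2 cache with processor_start=4 and processor_count=4
+// (processors 4-7) produces the mask 0b11110000.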
+static uint64_t iree_task_topology_calculate_cache_bits(
+ const struct cpuinfo_cache* cache) {
+ if (!cache) return 0;
+ uint64_t mask = 0;
+ for (uint32_t processor_i = 0; processor_i < cache->processor_count;
+ ++processor_i) {
+ mask |= 1ull << (cache->processor_start + processor_i);
+ }
+ return mask;
+}
+
+// Constructs a constructive sharing mask for all *processors* that share the
+// same cache as the specified |processor|.
+static uint64_t iree_task_topology_calculate_constructive_sharing_mask(
+ const struct cpuinfo_processor* processor) {
+ uint64_t mask = 0;
+ mask |= iree_task_topology_calculate_cache_bits(processor->cache.l1i);
+ mask |= iree_task_topology_calculate_cache_bits(processor->cache.l1d);
+ mask |= iree_task_topology_calculate_cache_bits(processor->cache.l2);
+ // TODO(benvanik): include L3 here too (for systems that have it)? Or use L3
+ // info purely for distribution and focus the group mask on lower-latency
+ // caches?
+ return mask;
+}
+
+// Populates |out_group| with the information from |core|.
+static void iree_task_topology_group_initialize_from_core(
+ uint32_t group_index, const struct cpuinfo_core* core,
+ iree_task_topology_group_t* out_group) {
+ memset(out_group, 0, sizeof(*out_group));
+ out_group->group_index = group_index;
+ snprintf(out_group->name, IREE_ARRAYSIZE(out_group->name), "worker[%u]",
+ group_index);
+
+ // Guess: always pick the first processor in a core.
+ // When pinning to threads we'll take into account whether the core is SMT
+ // and use all threads anyway so this alignment is just helpful for debugging.
+ uint32_t processor_i = core->processor_start;
+ out_group->processor_index = processor_i;
+
+ const struct cpuinfo_processor* processor =
+ cpuinfo_get_processor(processor_i);
+ iree_task_topology_set_affinity_from_processor(
+ processor, &out_group->ideal_thread_affinity);
+}
+
+// Fixes constructive_sharing_mask values such that they represent other chosen
+// topology groups instead of processor indices. We do this so that code using
+// the topology groups doesn't need to know anything about which physical
+// processor IDs a particular group is mapped to.
+static void iree_task_topology_fixup_constructive_sharing_masks(
+ iree_task_topology_t* topology) {
+ // O(n^2), but n is always <= 64 (and often <= 8).
+ for (iree_host_size_t i = 0; i < topology->group_count; ++i) {
+ iree_task_topology_group_t* group = &topology->groups[i];
+
+ // Compute the processors that we can constructively share with.
+ uint64_t constructive_sharing_mask =
+ iree_task_topology_calculate_constructive_sharing_mask(
+ cpuinfo_get_processor(group->processor_index));
+
+ iree_task_topology_group_mask_t group_mask = 0;
+ for (iree_host_size_t j = 0; j < topology->group_count; ++j) {
+ if (i == j) continue;
+ const iree_task_topology_group_t* other_group = &topology->groups[j];
+ uint64_t group_processor_bits = 1ull << other_group->processor_index;
+ if (constructive_sharing_mask & group_processor_bits) {
+ group_mask |= 1ull << other_group->group_index;
+ }
+ }
+
+ group->constructive_sharing_mask = group_mask;
+ }
+}
+
+// Matches all cores.
+static bool iree_task_topology_core_filter_all(const struct cpuinfo_core* core,
+ uintptr_t user_data) {
+ return true;
+}
+
+iree_status_t iree_task_topology_from_physical_cores(
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ return iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_all, 0, max_core_count, allocator,
+ out_topology);
+}
+
+// Matches only cores with the uarch as specified in |user_data|.
+static bool iree_task_topology_core_filter_uarch(
+ const struct cpuinfo_core* core, uintptr_t user_data) {
+ return core->uarch == user_data;
+}
+
+iree_status_t iree_task_topology_from_physical_cores_with_uarch(
+ uint32_t cpuinfo_uarch, iree_host_size_t max_core_count,
+ iree_allocator_t allocator, iree_task_topology_t** out_topology) {
+ return iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_uarch, cpuinfo_uarch, max_core_count,
+ allocator, out_topology);
+}
+
+iree_status_t iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_t filter_fn, uintptr_t filter_fn_data,
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_ensure_cpuinfo_available());
+
+ // Count cores that match the filter.
+ iree_host_size_t core_count = 0;
+ for (uint32_t i = 0; i < cpuinfo_get_cores_count(); i++) {
+ const struct cpuinfo_core* core = cpuinfo_get_core(i);
+ if (filter_fn(core, filter_fn_data)) ++core_count;
+ }
+ core_count = iree_min(core_count, max_core_count);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_allocate(core_count, allocator, &topology));
+
+ // Build each core up to the max allowed.
+ // TODO(benvanik): if our group_count <= core_count/2 then distribute better;
+ // for now we just do a straight-line through (cores 0-N) when instead we may
+ // want to take advantage of L3 cache info (half of groups on one L3 cache,
+ // half of groups on another, etc).
+ topology->group_count = core_count;
+ for (uint32_t core_i = 0, group_i = 0; group_i < topology->group_count;
+ ++core_i) {
+ // Rotate the core ID so that we avoid setting the affinity to the calling
+ // thread which we assume is something the user has plans for and doesn't
+ // want to have our workers stealing their time.
+ const struct cpuinfo_core* core =
+ cpuinfo_get_core(iree_task_topology_rotate_from_base_core(core_i));
+ if (filter_fn(core, filter_fn_data)) {
+ iree_task_topology_group_initialize_from_core(group_i, core,
+ &topology->groups[group_i]);
+ ++group_i;
+ }
+ }
+
+ iree_task_topology_fixup_constructive_sharing_masks(topology);
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
+
+iree_status_t iree_task_topology_from_unique_l2_cache_groups(
+ iree_host_size_t max_group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_ensure_cpuinfo_available());
+
+ iree_host_size_t cache_count = cpuinfo_get_l2_caches_count();
+ cache_count = iree_min(cache_count, max_group_count);
+
+ iree_task_topology_t* topology = NULL;
+ IREE_RETURN_AND_END_ZONE_IF_ERROR(
+ z0, iree_task_topology_allocate(cache_count, allocator, &topology));
+
+ // TODO(benvanik): iree_task_topology_rotate_from_base_core to offset all of
+ // the selection here (while still preserving the cache groups). May need to
+ // rework this to instead walk the core list and skip until a new cache is
+ // found instead of starting with the cache list.
+
+ // TODO(benvanik): if our group_count <= cache_count/2 then distribute better;
+ // we could use l3 cache in addition to ensure we are selecting cores that do
+ // (or do not) share.
+ topology->group_count = cache_count;
+ for (uint32_t cache_i = 0, group_i = 0; group_i < topology->group_count;
+ ++cache_i) {
+ const struct cpuinfo_cache* cache = cpuinfo_get_l2_cache(cache_i);
+ const struct cpuinfo_core* core =
+ cpuinfo_get_processor(cache->processor_start)->core;
+ iree_task_topology_group_initialize_from_core(group_i, core,
+ &topology->groups[group_i]);
+ ++group_i;
+ }
+
+ iree_task_topology_fixup_constructive_sharing_masks(topology);
+ *out_topology = topology;
+ IREE_TRACE_ZONE_END(z0);
+ return iree_ok_status();
+}
diff --git a/iree/task/topology.h b/iree/task/topology.h
new file mode 100644
index 0000000..c07d68f
--- /dev/null
+++ b/iree/task/topology.h
@@ -0,0 +1,162 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TOPOLOGY_H_
+#define IREE_TASK_TOPOLOGY_H_
+
+#include <limits.h>
+
+#include "iree/base/api.h"
+#include "iree/base/threading.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct cpuinfo_core;
+
+// A bitmask indicating which other groups from 0 to N may constructively share
+// caches. For example, a value of 0b1100 indicates that groups 2 and 3 share.
+typedef uint64_t iree_task_topology_group_mask_t;
+
+#define IREE_TASK_TOPOLOGY_GROUP_MASK_ALL UINT64_MAX
+
+// Information about a particular group within the topology.
+// Groups may be of varying levels of granularity even within the same topology
+// based on how the topology is defined.
+typedef struct {
+ // Group index within the topology matching a particular bit in
+ // iree_task_topology_group_mask_t.
+ uint8_t group_index;
+
+ // A name assigned to executor workers used for logging/tracing.
+ char name[16];
+
+ // Processor index in the cpuinfo set.
+ uint32_t processor_index;
+
+ // Ideal thread affinity for threads within this group.
+ // All threads within the group share the same affinity and this is what
+ // allows us to model Simultaneous Multi-Threading (SMT) (aka hyperthreading).
+ iree_thread_affinity_t ideal_thread_affinity;
+
+ // A bitmask of other group indices that share some level of the cache
+ // hierarchy. Workers of this group are more likely to constructively share
+ // some cache levels higher up with these other groups. For example, if the
+ // workers in a group all share an L2 cache then the groups indicated here may
+ // all share the same L3 cache.
+ iree_task_topology_group_mask_t constructive_sharing_mask;
+} iree_task_topology_group_t;
+
+// Task system topology information used to define the workers within an
+// executor.
+//
+// Topologies are used to statically configure task executors by defining the
+// total number of workers in the worker pool and how those workers map to
+// hardware compute resources.
+//
+// Users can allocate topologies, populate them with zero or more groups, and
+// then pass them to the executor to construct the desired configuration. To
+// ease testing and debugging topologies can be formatted as string values and
+// round-tripped through flags, though the values of such encodings are
+// obviously machine-dependent.
+//
+// Several helper constructors are available that query the machine topology
+// and attempt to derive some (hopefully) useful task system topology from it.
+// We can add the more common heuristics over time to the core and leave the
+// edge cases for applications to construct.
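+//
+// Usage sketch (illustrative only; error handling is elided and the executor
+// construction call that would consume the topology lives in executor.h and is
+// not shown here):
+//   iree_task_topology_t* topology = NULL;
+//   IREE_RETURN_IF_ERROR(iree_task_topology_from_physical_cores(
+//       /*max_core_count=*/8, iree_allocator_system(), &topology));
+//   // ... pass |topology| to the executor factory, then ...
+//   iree_task_topology_free(topology);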
+typedef struct iree_task_topology_s iree_task_topology_t;
+
+// Allocates a task topology with at least |group_capacity|.
+iree_status_t iree_task_topology_allocate(iree_host_size_t group_capacity,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Frees a topology structure.
+void iree_task_topology_free(iree_task_topology_t* topology);
+
+// Parses a serialized topology in string form.
+iree_status_t iree_task_topology_parse(iree_string_view_t value,
+ iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Formats the topology as a string value that can be parsed with
+// iree_task_topology_parse.
+bool iree_task_topology_format(const iree_task_topology_t* topology,
+ iree_host_size_t buffer_capacity, char* buffer,
+ iree_host_size_t* out_buffer_length);
+
+// Returns the total group count defined by the topology.
+iree_host_size_t iree_task_topology_group_count(
+ const iree_task_topology_t* topology);
+
+// Returns the group information for the given group index.
+const iree_task_topology_group_t* iree_task_topology_get_group(
+ const iree_task_topology_t* topology, iree_host_size_t group_index);
+
+// Pushes a new group onto the topology set.
+// The provided group data will be copied into the topology structure.
+iree_status_t iree_task_topology_push_group(
+ iree_task_topology_t* topology, const iree_task_topology_group_t* group);
+
+// Allocates a topology with the specified number of groups.
+// 0 is a valid value, indicating that only donated threads will be used to
+// perform work. Groups will have no specific affinity and rely on the OS
+// scheduler to ensure they are distributed in a meaningful way; this generally
+// works out as threads created within a process are usually rotated across
+// preferred processors by default.
+iree_status_t iree_task_topology_from_group_count(
+ iree_host_size_t group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Allocates a topology with one group for each physical core in the machine.
+// If detailed cache information is not available this is a decent
+// approximation that can be used as a fallback.
+iree_status_t iree_task_topology_from_physical_cores(
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Allocates a topology with one group for each physical core in the machine
+// with the given microarchitecture specified as a cpuinfo_uarch value.
+iree_status_t iree_task_topology_from_physical_cores_with_uarch(
+ uint32_t cpuinfo_uarch, iree_host_size_t max_core_count,
+ iree_allocator_t allocator, iree_task_topology_t** out_topology);
+
+// Returns true if the given |core| passes the filter and should be included.
+// |user_data| is the value passed alongside the filter function.
+typedef bool (*iree_task_topology_core_filter_t)(
+ const struct cpuinfo_core* core, uintptr_t user_data);
+
+// Allocates a topology with one group for each core that matches |filter_fn|.
+iree_status_t iree_task_topology_from_physical_cores_with_filter(
+ iree_task_topology_core_filter_t filter_fn, uintptr_t filter_fn_data,
+ iree_host_size_t max_core_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// Allocates a topology with one group for each unique L2 cache group across
+// all available cores. This optimizes for temporal and spatial cache locality
+// but may suffer from oversubscription if there are other processes trying to
+// use the same cores.
+iree_status_t iree_task_topology_from_unique_l2_cache_groups(
+ iree_host_size_t max_group_count, iree_allocator_t allocator,
+ iree_task_topology_t** out_topology);
+
+// TODO(benvanik): more? or just make users implement as desired? Ideas:
+// - _from_unique_l2_cache_groups but with a min/max count (N% utilization)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TOPOLOGY_H_
diff --git a/iree/task/topology_test.cc b/iree/task/topology_test.cc
new file mode 100644
index 0000000..2b7a665
--- /dev/null
+++ b/iree/task/topology_test.cc
@@ -0,0 +1,26 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/topology.h"
+
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+
+namespace {
+
+TEST(TopologyTest, Any) {
+ // TODO(benvanik): tests.
+}
+
+} // namespace
diff --git a/iree/task/tuning.h b/iree/task/tuning.h
new file mode 100644
index 0000000..57fb434
--- /dev/null
+++ b/iree/task/tuning.h
@@ -0,0 +1,130 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_TUNING_H_
+#define IREE_TASK_TUNING_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Maximum number of workers that an executor can manage.
+// A 64 worker hard limit is based on us using uint64_t as a bitmask to select
+// workers. It's easy to go smaller (just use fewer bits) if it's known that
+// only <64 will ever be used (such as for devices with 2 cores).
+#define IREE_TASK_EXECUTOR_MAX_WORKER_COUNT (64)
+
+// Initial number of slice tasks that are allocated in the executor pool.
+// Increasing this number will decrease initial allocation storms in cases of
+// extremely wide fan-out (many dispatches with many thousands of slices) at the
+// cost of a higher minimum memory consumption.
+//
+// Set to zero if sliced dispatches will not be used or will be allocated by the
+// caller to avoid fixed overhead associated with the internal executor pool.
+#define IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER (0)
+
+// Initial number of shard tasks that are allocated in the executor pool.
+// Increasing this number will decrease initial allocation storms in cases of
+// extremely wide concurrency regions (many dispatches running at the same time)
+// at the cost of a higher minimum memory consumption.
+#define IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER (4)
+
+// Maximum number of simultaneous waits an executor may perform as part of a
+// wait-any operation. A larger value may enable better wake coalescing by the
+// kernel. This limit only applies to wait tasks that have been scheduled and
+// promoted to the root executor waiting list. There may be any number of
+// waits deeper in the pipeline so long as they don't all become ready
+// simultaneously.
+//
+// Realistically, though, if we have more than 64 outstanding **root** waits
+// it's hard to reason about if/when the executor queue could make forward
+// progress, and that indicates a possible error in task assignment.
+//
+// Also, the underlying iree_wait_set_t may not support more than 64 handles on
+// certain platforms without emulation. Trying to keep us on the fast-path
+// with a reasonable number seems fine for now until we have a need for more.
+//
+// NOTE: we reserve 1 wait handle for our own internal use. This allows us to
+// wake the coordination worker when new work is submitted from external
+// sources.
+#define IREE_TASK_EXECUTOR_MAX_OUTSTANDING_WAITS (64 - 1)
+
+// Divides the total number of attempts that a worker will make to steal tasks
+// from other workers. By default all other workers will be attempted; setting
+// this to 2, for example, will try only half of the available workers.
+#define IREE_TASK_EXECUTOR_MAX_THEFT_ATTEMPTS_DIVISOR (1)
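+// For example (illustrative): each worker computes its max theft attempts as
+// worker_count / divisor, so with 8 workers a divisor of 2 permits up to 4
+// steal attempts per idle pass.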
+
+// Maximum number of tasks that will be stolen in one go from another worker.
+//
+// Too few tasks will cause additional overhead as the worker repeatedly sips
+// away tasks, and when it does get tasks it may suffer spatial cache locality
+// issues as it is effectively walking backwards in memory to both touch the
+// tasks and - a much larger impact - run tasks that themselves are walking
+// orders of magnitude more memory backwards.
+//
+// Too many tasks will cause additional latency on workers that may interfere
+// with higher level scheduling; for example, if a worker runs out of tasks and
+// immediately steals 8000 of them from another worker it's going to take until
+// those 8000 complete before any work that arrives specifically for the worker
+// is able to start processing.
+//
+// In real-time systems erring toward fewer tasks is better (slightly more work
+// for much lower variance in execution) while in batch-mode systems erring
+// toward more tasks is better (as latencies don't matter so long as throughput
+// is maximized).
+#define IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT \
+ IREE_TASK_EXECUTOR_MAX_WORKER_COUNT
+
+// Number of tiles that will be batched into a single slice along each XYZ dim.
+//
+// Larger numbers reduce overhead and ensure that more tiles are executed
+// locally on the same worker (== shared caches) while also increasing potential
+// latency as work-stealing (always per-slice) is less effective.
+//
+// Numbers >1 on Y and Z can be used to cluster tiles together within the same
+// slice when spatial coherence outside of the X dimension is useful.
+//
+// The current usage of this is provisional; we may do all of this from the
+// compiler and want this behavior to be relatively fixed so that we can predict
+// it better. The only thing we want to be introducing here is flexibility for
+// when worker topology differs at runtime from what is knowable during offline
+// compilation.
+#define IREE_TASK_DISPATCH_TILES_PER_SLICE_X (8)
+#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Y (1)
+#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Z (1)
+
+// Number of tiles that will be batched into a single reservation from the grid.
+// This is a maximum; if there are fewer tiles that would otherwise allow for
+// maximum parallelism then this may be ignored.
+//
+// The more tiles reserved at a time the higher the chance for latency to
+// increase as many reserved tiles are held up on one worker while another may
+// have otherwise been able to steal them and help finish them sooner.
+//
+// The fewer tiles reserved at a time the higher the chance for cache-locality
+// destroying behavior where multiple workers all stomp on the same cache lines
+// (as say worker 0 and worker 1 both fight over sequential tiles adjacent in
+// memory).
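+//
+// For example (illustrative): a 1024-tile dispatch with the default
+// reservation size of 8 below is consumed in roughly 128 reservations spread
+// across however many workers are actively processing the dispatch.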
+#define IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION (8)
+
+// Whether to enable per-tile colors for each tile tracing zone based on the
+// tile grid xyz. Not cheap and can be disabled to reduce tracing overhead.
+// TODO(#4017): make per-tile color tracing fast enough to always have on.
+#define IREE_TASK_TRACING_PER_TILE_COLORS 1
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_TUNING_H_
diff --git a/iree/task/worker.c b/iree/task/worker.c
new file mode 100644
index 0000000..a2eecb2
--- /dev/null
+++ b/iree/task/worker.c
@@ -0,0 +1,359 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "iree/task/worker.h"
+
+#include "iree/base/debugging.h"
+#include "iree/base/math.h"
+#include "iree/task/executor_impl.h"
+#include "iree/task/task_impl.h"
+
+static int iree_task_worker_main(iree_task_worker_t* worker);
+
+iree_status_t iree_task_worker_initialize(
+ iree_task_executor_t* executor, iree_host_size_t worker_index,
+ const iree_task_topology_group_t* topology_group,
+ iree_prng_splitmix64_state_t* seed_prng, iree_task_worker_t* out_worker) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ out_worker->executor = executor;
+ out_worker->worker_bit = iree_task_affinity_for_worker(worker_index);
+ out_worker->ideal_thread_affinity = topology_group->ideal_thread_affinity;
+ out_worker->constructive_sharing_mask =
+ topology_group->constructive_sharing_mask;
+ out_worker->max_theft_attempts =
+ executor->worker_count / IREE_TASK_EXECUTOR_MAX_THEFT_ATTEMPTS_DIVISOR;
+ iree_prng_minilcg128_initialize(iree_prng_splitmix64_next(seed_prng),
+ &out_worker->theft_prng);
+
+ iree_task_worker_state_t initial_state = IREE_TASK_WORKER_STATE_RUNNING;
+ if (executor->scheduling_mode &
+ IREE_TASK_SCHEDULING_MODE_DEFER_WORKER_STARTUP) {
+ // User is favoring startup latency vs. initial scheduling latency. Our
+ // thread will be created suspended and not first scheduled until work
+ // arrives for it, (almost) ensuring no context switches and 10x+ lower
+ // blocking startup time.
+ initial_state = IREE_TASK_WORKER_STATE_SUSPENDED;
+ }
+ iree_atomic_store_int32(&out_worker->state, initial_state,
+ iree_memory_order_relaxed);
+
+ iree_notification_initialize(&out_worker->wake_notification);
+ iree_notification_initialize(&out_worker->state_notification);
+ iree_atomic_task_slist_initialize(&out_worker->mailbox_slist);
+ iree_task_queue_initialize(&out_worker->local_task_queue);
+
+ iree_thread_create_params_t thread_params;
+ memset(&thread_params, 0, sizeof(thread_params));
+ thread_params.name = iree_make_cstring_view(topology_group->name);
+ thread_params.create_suspended =
+ initial_state == IREE_TASK_WORKER_STATE_SUSPENDED;
+ thread_params.priority_class = IREE_THREAD_PRIORITY_CLASS_NORMAL;
+ thread_params.initial_affinity = out_worker->ideal_thread_affinity;
+
+ // NOTE: if the thread creation fails we'll bail here and let the caller
+ // cleanup by calling deinitialize (which is safe because we zero init
+ // everything).
+ iree_status_t status = iree_thread_create(
+ (iree_thread_entry_t)iree_task_worker_main, out_worker, thread_params,
+ executor->allocator, &out_worker->thread);
+
+ IREE_TRACE_ZONE_END(z0);
+ return status;
+}
+
+// Returns true if the worker is in the zombie state (exited and awaiting
+// teardown).
+static bool iree_task_worker_is_zombie(iree_task_worker_t* worker) {
+ return iree_atomic_load_int32(&worker->state, iree_memory_order_seq_cst) ==
+ IREE_TASK_WORKER_STATE_ZOMBIE;
+}
+
+void iree_task_worker_deinitialize(iree_task_worker_t* worker) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Wait for the thread to enter the zombie state indicating it has exited our
+ // main function - it may still be live in the OS, but it'll not be touching
+ // any of our data structures again so it's fine to blast away.
+ if (worker->thread) {
+ iree_notification_await(&worker->state_notification,
+ (iree_condition_fn_t)iree_task_worker_is_zombie,
+ worker);
+ }
+ iree_thread_release(worker->thread);
+
+ // Release unfinished tasks by flushing the mailbox (which if we're here can't
+ // get anything more posted to it) and then discarding everything we still
+ // have a reference to.
+ iree_atomic_task_slist_discard(&worker->mailbox_slist);
+ iree_task_list_discard(&worker->local_task_queue.list);
+
+ iree_notification_deinitialize(&worker->wake_notification);
+ iree_notification_deinitialize(&worker->state_notification);
+ iree_atomic_task_slist_deinitialize(&worker->mailbox_slist);
+ iree_task_queue_deinitialize(&worker->local_task_queue);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_worker_request_exit(iree_task_worker_t* worker) {
+ if (!worker->thread) return;
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // If the thread is already in the exiting/zombie state we don't need to do
+ // anything.
+ iree_task_worker_state_t prev_state =
+ (iree_task_worker_state_t)iree_atomic_exchange_int32(
+ &worker->state, IREE_TASK_WORKER_STATE_EXITING,
+ iree_memory_order_acq_rel);
+ switch (prev_state) {
+ case IREE_TASK_WORKER_STATE_SUSPENDED:
+ // Worker was suspended; resume it so that it can exit itself.
+ iree_thread_resume(worker->thread);
+ break;
+ case IREE_TASK_WORKER_STATE_ZOMBIE:
+ // Worker already exited; reset state to ZOMBIE.
+ iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_ZOMBIE,
+ iree_memory_order_relaxed);
+ break;
+ default:
+ // Worker now set to EXITING and should exit soon.
+ break;
+ }
+
+ // Kick the worker in case it is waiting for work.
+ iree_notification_post(&worker->wake_notification, 1);
+
+ IREE_TRACE_ZONE_END(z0);
+}
+
+void iree_task_worker_post_tasks(iree_task_worker_t* worker,
+ iree_task_list_t* list) {
+ // Move the list into the mailbox. Note that the mailbox is LIFO and this list
+ // is concatenated with its current order preserved (which should be LIFO).
+ iree_atomic_task_slist_concat(&worker->mailbox_slist, list->head, list->tail);
+ memset(list, 0, sizeof(*list));
+}
+
+iree_task_t* iree_task_worker_try_steal_task(iree_task_worker_t* worker,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks) {
+ // Try to grab tasks from the worker; if more than one task is stolen then the
+ // first will be returned and the remaining will be added to the target queue.
+  iree_task_t* task = iree_task_queue_try_steal(&worker->local_task_queue,
+                                                target_queue, max_tasks);
+ if (task) return task;
+
+ // If we still didn't steal any tasks then let's try the slist instead.
+ task = iree_atomic_task_slist_pop(&worker->mailbox_slist);
+ if (task) return task;
+
+ return NULL;
+}
+
+// Executes a task on a worker.
+// Only task types that are scheduled to workers are handled; all others must be
+// handled by the coordinator during scheduling.
+static iree_status_t iree_task_worker_execute(iree_task_worker_t* worker,
+ iree_task_t* task) {
+ // Execute the task and resolve the task and gather any tasks that are now
+ // ready for submission to the executor. They'll be scheduled the next time
+ // the coordinator runs.
+ //
+ // TODO(benvanik): think a bit more about this timing; this ensures we have
+ // BFS behavior at the cost of the additional merge overhead - it's probably
+ // worth it?
+ // TODO(benvanik): handle partial tasks and re-queuing.
+ iree_task_submission_t pending_submission;
+ iree_task_submission_initialize(&pending_submission);
+ switch (task->type) {
+ case IREE_TASK_TYPE_CALL: {
+ IREE_RETURN_IF_ERROR(
+ iree_task_call_execute((iree_task_call_t*)task, &pending_submission));
+ break;
+ }
+ case IREE_TASK_TYPE_DISPATCH_SLICE: {
+ IREE_RETURN_IF_ERROR(iree_task_dispatch_slice_execute(
+ (iree_task_dispatch_slice_t*)task, &pending_submission));
+ break;
+ }
+ case IREE_TASK_TYPE_DISPATCH_SHARD: {
+ IREE_RETURN_IF_ERROR(iree_task_dispatch_shard_execute(
+ (iree_task_dispatch_shard_t*)task, &pending_submission));
+ break;
+ }
+ default:
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "incorrect task type for worker execution");
+ }
+
+ // NOTE: task is invalidated here!
+ task = NULL;
+
+ if (!iree_task_submission_is_empty(&pending_submission)) {
+ iree_task_executor_merge_submission(worker->executor, &pending_submission);
+ }
+ return iree_ok_status();
+}
+
+// Pumps the worker thread once, processing a single task.
+// Returns true if pumping should continue as there are more tasks remaining or
+// false if the caller should wait for more tasks to be posted.
+static bool iree_task_worker_pump_once(iree_task_worker_t* worker) {
+ IREE_TRACE_ZONE_BEGIN(z0);
+
+ // Check the local work queue for any work we know we should start
+ // processing immediately. Other workers may try to steal some of this work
+ // if we take too long.
+ iree_task_t* task = iree_task_queue_pop_front(&worker->local_task_queue);
+
+ // Check the mailbox to see if we have incoming work that has been posted.
+ // We try to greedily move it to our local work list so that we can work
+ // with the full thread-local pending task list.
+ if (!task) {
+ // NOTE: there's a potential for theft pessimization if the queue runs too
+ // low and there's nothing there when a thief goes to grab some tasks. A
+ // standout there would indicate that we weren't scheduling very well in the
+ // first place (large uneven workloads for various workers, bad distribution
+    // in the face of heterogeneous multi-core architectures where some workers
+ // complete tasks faster than others, etc).
+ task = iree_task_queue_append_from_lifo_slist(&worker->local_task_queue,
+ &worker->mailbox_slist);
+ }
+
+ // If we ran out of work assigned to this specific worker try to steal some
+ // from other workers that we hopefully share some of the cache hierarchy
+ // with. Their tasks will be moved from their local queue into ours and the
+  // first task in the queue is popped off and returned.
+ if (!task) {
+ task = iree_task_executor_try_steal_task(
+ worker->executor, worker->constructive_sharing_mask,
+ worker->max_theft_attempts, &worker->theft_prng,
+ &worker->local_task_queue);
+ }
+
+ // No tasks to run; let the caller know we want to wait for more.
+ if (!task) {
+ IREE_TRACE_ZONE_END(z0);
+ return false;
+ }
+
+ // Execute the task (may call out to arbitrary user code and may submit more
+ // tasks for execution).
+ iree_status_t status = iree_task_worker_execute(worker, task);
+
+ // TODO(#4026): propagate failure to task scope.
+ // We currently drop the error on the floor here; that's because the error
+ // should have already been propagated to the scope and everyone should be
+ // checking that before running things anyway.
+ //
+ // Since we can host work from multiple scopes and want to ensure an error
+ // in one doesn't bring down the whole system we pretend we executed
+ // something here by falling through.
+ IREE_ASSERT_TRUE(iree_status_is_ok(status));
+ iree_status_ignore(status);
+
+ IREE_TRACE_ZONE_END(z0);
+ return true; // try again
+}
+
+// Alternates between pumping ready tasks in the worker queue and waiting
+// for more tasks to arrive. Only returns when the worker has been asked by
+// the executor to exit.
+static void iree_task_worker_pump_until_exit(iree_task_worker_t* worker) {
+ // Pump the thread loop to process more tasks.
+ while (true) {
+ // Check state to see if we've been asked to exit.
+ if (iree_atomic_load_int32(&worker->state, iree_memory_order_relaxed) ==
+ IREE_TASK_WORKER_STATE_EXITING) {
+ // Thread exit requested - cancel pumping.
+ // TODO(benvanik): complete tasks before exiting?
+ break;
+ }
+
+ // If we fail to find any work to do we'll wait at the end of this loop.
+    // In order not to miss any work that is enqueued after we've already
+    // checked a particular source we use an interruptible wait token that
+ // will prevent the wait from happening if anyone touches the data
+ // structures we use.
+ iree_wait_token_t wait_token =
+ iree_notification_prepare_wait(&worker->wake_notification);
+ iree_atomic_task_affinity_set_fetch_and(&worker->executor->worker_idle_mask,
+ ~worker->worker_bit,
+ iree_memory_order_relaxed);
+
+ while (iree_task_worker_pump_once(worker)) {
+ // All work done ^, which will return false when the worker should wait.
+ }
+
+ // When we encounter a complete lack of work we can self-nominate to check
+ // the global work queue and distribute work to other threads. Only one
+ // coordinator can be running at a time so we also ensure that if another
+ // is doing its work we gracefully wait for it. It's fine to block in here
+ // as the next thing we'd have done is go idle anyway.
+
+ // First self-nominate; this *may* do something or just be ignored (if
+ // another worker is already coordinating).
+ iree_task_executor_coordinate(worker->executor, worker,
+ /*speculative=*/true);
+
+ // If nothing has been enqueued since we started this loop (so even
+ // coordination didn't find anything) we go idle. Otherwise we fall
+ // through and try the loop again.
+ if (!iree_task_queue_is_empty(&worker->local_task_queue)) {
+ // Have more work to do; loop around to try another pump.
+ iree_notification_cancel_wait(&worker->wake_notification);
+ } else {
+ IREE_TRACE_ZONE_BEGIN_NAMED(z_wait,
+ "iree_task_worker_main_pump_wake_wait");
+ iree_atomic_task_affinity_set_fetch_or(
+ &worker->executor->worker_idle_mask, worker->worker_bit,
+ iree_memory_order_relaxed);
+ iree_notification_commit_wait(&worker->wake_notification, wait_token);
+ IREE_TRACE_ZONE_END(z_wait);
+ }
+
+ // Wait completed.
+ // Jump back up and try pumping any tasks that arrived.
+ continue;
+ }
+}
+
+// Thread entry point for each worker.
+static int iree_task_worker_main(iree_task_worker_t* worker) {
+ IREE_TRACE_ZONE_BEGIN(thread_zone);
+
+ // Reset affinity (as it can change over time).
+ // TODO(benvanik): call this after waking in case CPU hotplugging happens.
+ iree_thread_request_affinity(worker->thread, worker->ideal_thread_affinity);
+
+ // Enter the running state immediately. Note that we could have been requested
+ // to exit while suspended/still starting up, so check that here before we
+ // mess with any data structures.
+ bool should_run =
+ iree_atomic_exchange_int32(&worker->state, IREE_TASK_WORKER_STATE_RUNNING,
+ iree_memory_order_seq_cst) !=
+ IREE_TASK_WORKER_STATE_EXITING;
+ if (IREE_LIKELY(should_run)) {
+ // << work happens here >>
+ iree_task_worker_pump_until_exit(worker);
+ }
+
+ IREE_TRACE_ZONE_END(thread_zone);
+ iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_ZOMBIE,
+ iree_memory_order_release);
+ iree_notification_post(&worker->state_notification, IREE_ALL_WAITERS);
+ return 0;
+}
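
The ZOMBIE hand-off at the end of iree_task_worker_main is one half of a join protocol; the other half (owned by the executor via iree_task_worker_request_exit/iree_task_worker_deinitialize, declared in worker.h below) has to flip the state, wake the worker, and then wait for the ZOMBIE publish. The following is only a sketch of what that owner side could look like under those assumptions; it ignores the SUSPENDED startup case that the real code must also handle.

#include "iree/task/worker.h"

// Sketch only: an owner-side join built from the same state/notification
// protocol used by iree_task_worker_main above. The real logic lives in
// iree_task_worker_request_exit/deinitialize; this ignores the SUSPENDED
// case (a suspended worker must first be resumed so it can observe EXITING).
static void example_join_worker(iree_task_worker_t* worker) {
  // Request the exit and wake the worker in case it is idle in its wait.
  iree_atomic_store_int32(&worker->state, IREE_TASK_WORKER_STATE_EXITING,
                          iree_memory_order_release);
  iree_notification_post(&worker->wake_notification, IREE_ALL_WAITERS);

  // Wait for the worker to publish ZOMBIE on its state_notification.
  while (iree_atomic_load_int32(&worker->state, iree_memory_order_acquire) !=
         IREE_TASK_WORKER_STATE_ZOMBIE) {
    iree_wait_token_t wait_token =
        iree_notification_prepare_wait(&worker->state_notification);
    if (iree_atomic_load_int32(&worker->state, iree_memory_order_acquire) ==
        IREE_TASK_WORKER_STATE_ZOMBIE) {
      iree_notification_cancel_wait(&worker->state_notification);
      break;
    }
    iree_notification_commit_wait(&worker->state_notification, wait_token);
  }
  // The thread handle is still valid here and can now be released safely.
}
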
diff --git a/iree/task/worker.h b/iree/task/worker.h
new file mode 100644
index 0000000..1d45a5d
--- /dev/null
+++ b/iree/task/worker.h
@@ -0,0 +1,179 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IREE_TASK_WORKER_H_
+#define IREE_TASK_WORKER_H_
+
+#include "iree/base/synchronization.h"
+#include "iree/base/threading.h"
+#include "iree/base/tracing.h"
+#include "iree/task/affinity_set.h"
+#include "iree/task/executor.h"
+#include "iree/task/list.h"
+#include "iree/task/queue.h"
+#include "iree/task/tuning.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// Indicates the current state of a worker or, in the case of EXITING, the state
+// the worker should transition to.
+//
+// Transition graph:
+// SUSPENDED -> RUNNING (IDLE<->PROCESSING) -> EXITING -> ZOMBIE
+//
+// NOTE: state values are ordered such that </> comparisons can be used; ensure
+// that for example all states after resuming are > SUSPENDED and all states
+// before exiting are < EXITING.
+enum iree_task_worker_state_e {
+ // Worker has been created in a suspended state and must be resumed to wake.
+ IREE_TASK_WORKER_STATE_SUSPENDED = 0u,
+ // Worker is idle or actively processing tasks (either its own or others).
+ IREE_TASK_WORKER_STATE_RUNNING = 1u,
+  // Worker should exit (or is exiting) and will soon enter the zombie state.
+  // Coordinators can request workers to exit by setting their state to this
+  // and then waking them.
+ IREE_TASK_WORKER_STATE_EXITING = 2u,
+ // Worker has exited and entered a 🧟 state (waiting for join).
+ // The thread handle is still valid and must be destroyed.
+ IREE_TASK_WORKER_STATE_ZOMBIE = 3u,
+};
+typedef int32_t iree_task_worker_state_t;
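
Because the values are deliberately ordered, callers can use range checks rather than enumerating individual states; for example a caller might write the following (hypothetical helpers, not part of the patch):

#include <stdbool.h>

#include "iree/task/worker.h"

// Hypothetical helpers relying on the ordering guaranteed by the NOTE above.
static inline bool example_worker_state_is_running(
    iree_task_worker_state_t state) {
  return state >= IREE_TASK_WORKER_STATE_RUNNING &&
         state < IREE_TASK_WORKER_STATE_EXITING;
}
static inline bool example_worker_state_is_shutting_down(
    iree_task_worker_state_t state) {
  return state >= IREE_TASK_WORKER_STATE_EXITING;
}
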
+
+// A worker within the executor pool.
+//
+// NOTE: fields in here are touched from multiple threads with lock-free
+// techniques. The alignment of the entire iree_task_worker_t as well as the
+// alignment and padding between particular fields is carefully (though perhaps
+// not yet correctly) selected; see the 'LAYOUT' comments below.
+typedef struct iree_task_worker_s {
+ // A LIFO mailbox used by coordinators to post tasks to this worker.
+ // As workers self-nominate to be coordinators and fan out dispatch slices
+ // they can directly emplace those slices into the workers that should execute
+ // them based on the work distribution policy. When workers go to look for
+ // more work after their local queue empties they will flush this list and
+ // move all of the tasks into their local queue and restart processing.
+ // LAYOUT: must be 64b away from local_task_queue.
+ iree_atomic_task_slist_t mailbox_slist;
+
+ // Current state of the worker (iree_task_worker_state_t).
+ // LAYOUT: frequent access; next to wake_notification as they are always
+ // accessed together.
+ iree_atomic_int32_t state;
+
+ // Notification signaled when the worker should wake (if it is idle).
+ // LAYOUT: next to state for similar access patterns; when posting other
+ // threads will touch mailbox_slist and then send a wake
+ // notification.
+ iree_notification_t wake_notification;
+
+ // Notification signaled when the worker changes any state.
+ iree_notification_t state_notification;
+
+ // Parent executor that can be used to access the global work queue or task
+ // pool. Executors always outlive the workers they own.
+ iree_task_executor_t* executor;
+
+ // Bit the worker represents in the various worker bitsets.
+ iree_task_affinity_set_t worker_bit;
+
+ // Ideal thread affinity for the worker thread.
+ iree_thread_affinity_t ideal_thread_affinity;
+
+ // A bitmask of other group indices that share some level of the cache
+ // hierarchy. Workers of this group are more likely to constructively share
+ // some cache levels higher up with these other groups. For example, if the
+ // workers in a group all share an L2 cache then the groups indicated here may
+ // all share the same L3 cache.
+ iree_task_affinity_set_t constructive_sharing_mask;
+
+ // Maximum number of attempts to make when trying to steal tasks from other
+ // workers. This could be 64 (try stealing from all workers) or just a handful
+ // (try stealing from these 3 other cores that share your L3 cache).
+ uint32_t max_theft_attempts;
+
+  // PRNG state used to select work-stealing victims (ensures we don't always
+  // favor one victim). Only ever touched by the worker thread as it steals work.
+ iree_prng_minilcg128_state_t theft_prng;
+
+ // Thread handle of the worker. If the thread has exited the handle will
+ // remain valid so that the executor can query its state.
+ iree_thread_t* thread;
+
+ // Destructive interference padding between the mailbox and local task queue
+ // to ensure that the worker - who is pounding on local_task_queue - doesn't
+ // contend with submissions or coordinators dropping new tasks in the mailbox.
+ //
+ // TODO(benvanik): test on 32-bit platforms; I'm pretty sure we'll always be
+ // past the iree_hardware_constructive_interference_size given the bulk of
+ // stuff above, but it'd be nice to guarantee it.
+ uint8_t _padding[8];
+
+ // Worker-local FIFO queue containing the slices that will be processed by the
+ // worker. This queue supports work-stealing by other workers if they run out
+ // of work of their own.
+ // LAYOUT: must be 64b away from mailbox_slist.
+ iree_task_queue_t local_task_queue;
+} iree_task_worker_t;
+static_assert(offsetof(iree_task_worker_t, mailbox_slist) +
+ sizeof(iree_atomic_task_slist_t) <
+ iree_hardware_constructive_interference_size,
+ "mailbox_slist must be in the first cache line");
+static_assert(offsetof(iree_task_worker_t, local_task_queue) >=
+ iree_hardware_constructive_interference_size,
+ "local_task_queue must be separated from mailbox_slist by "
+ "at least a cache line");
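
The LAYOUT rules and asserts above reduce to one constraint: a field hammered by other threads (mailbox_slist) must not share a cache line with the field the owning worker hammers (local_task_queue). Below is a reduced sketch of the same false-sharing guard in isolation, with 64 standing in for the cache-line size that the real code takes from the iree_hardware_* constants; it is an illustration only.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Sketch only: keep a producer-written field and a consumer-written field on
// different cache lines so writes to one never invalidate the other's line.
// 64 is an assumed line size; the patch uses the iree_hardware_* constants.
typedef struct example_padded_s {
  int32_t producer_hot;                    // written by other threads
  uint8_t _padding[64 - sizeof(int32_t)];  // pads out the first cache line
  int32_t consumer_hot;                    // written only by the owning thread
} example_padded_t;
static_assert(offsetof(example_padded_t, consumer_hot) >= 64,
              "consumer_hot must start on its own cache line");
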
+
+// Initializes a worker by creating its thread and configuring it for receiving
+// tasks. Where supported the worker will be created in a suspended state so
+// that we aren't creating a thundering herd on startup:
+// https://en.wikipedia.org/wiki/Thundering_herd_problem
+iree_status_t iree_task_worker_initialize(
+ iree_task_executor_t* executor, iree_host_size_t worker_index,
+ const iree_task_topology_group_t* topology_group,
+ iree_prng_splitmix64_state_t* seed_prng, iree_task_worker_t* out_worker);
+
+// Deinitializes a worker that has successfully exited. The worker must be in
+// the IREE_TASK_WORKER_STATE_ZOMBIE state.
+void iree_task_worker_deinitialize(iree_task_worker_t* worker);
+
+// Requests that the worker begin exiting (if it hasn't already).
+// If the worker is actively processing tasks it will wait until it has
+// completed all it can and is about to go idle prior to exiting.
+//
+// May be called from any thread (including the worker thread).
+void iree_task_worker_request_exit(iree_task_worker_t* worker);
+
+// Posts a FIFO list of tasks to the worker mailbox. The target worker takes
+// ownership of the tasks and will be woken if it is currently idle.
+//
+// May be called from any thread (including the worker thread).
+void iree_task_worker_post_tasks(iree_task_worker_t* worker,
+ iree_task_list_t* list);
+
+// Tries to steal up to |max_tasks| tasks from the back of the worker queue.
+// Returns NULL if no tasks are available. Otherwise up to |max_tasks| tasks
+// from the tail of the worker FIFO are moved to |target_queue| and the first
+// of the stolen tasks is returned. While tasks from the FIFO are preferred
+// this may also steal tasks from the mailbox.
+iree_task_t* iree_task_worker_try_steal_task(iree_task_worker_t* worker,
+ iree_task_queue_t* target_queue,
+ iree_host_size_t max_tasks);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // IREE_TASK_WORKER_H_
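
Taken together the declarations above describe the lifecycle the executor drives for each worker: initialize (possibly suspended), post tasks, request exit, and deinitialize once the worker has reached ZOMBIE. The following is a hedged sketch of that call order; the executor/topology/PRNG arguments are assumed to come from the surrounding executor code, and how the caller waits for the ZOMBIE transition is elided. It is not the real executor implementation.

#include "iree/base/api.h"
#include "iree/task/worker.h"

// Sketch only: the call order an executor might use for a single worker.
// `executor`, `group`, `seed_prng`, and `ready_tasks` are assumed to come
// from the surrounding executor code; not part of the patch.
static iree_status_t example_worker_lifecycle(
    iree_task_executor_t* executor, const iree_task_topology_group_t* group,
    iree_prng_splitmix64_state_t* seed_prng, iree_task_list_t* ready_tasks) {
  iree_task_worker_t worker;
  IREE_RETURN_IF_ERROR(iree_task_worker_initialize(
      executor, /*worker_index=*/0, group, seed_prng, &worker));

  // Hand the worker a batch of ready tasks; it is woken if currently idle.
  iree_task_worker_post_tasks(&worker, ready_tasks);

  // ... the worker pumps, coordinates, and steals as needed ...

  // Begin shutdown and tear the worker down once it has exited.
  iree_task_worker_request_exit(&worker);
  // ... wait for IREE_TASK_WORKER_STATE_ZOMBIE, e.g. via state_notification
  //     (see the join sketch after worker.c above) ...
  iree_task_worker_deinitialize(&worker);
  return iree_ok_status();
}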