Removing sliced dispatch tasks.
Sharding is better and it wasn't worth keeping the slicing code around
and allocating the extra memory at runtime in case it was used.
This drops executor memory consumption to ~1-4KB + ~512B/worker.
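
For context, a minimal standalone sketch of the shard model this change
consolidates on (not the actual IREE code): each shard atomically reserves a
batch of tiles from a counter shared with its sibling shards and loops until
the grid is exhausted. Plain C11 atomics stand in for the iree_atomic_*
wrappers, and the grid_t/run_shard names are purely illustrative.

  // Illustrative only: simplified shard reservation loop with hypothetical names.
  #include <stdatomic.h>
  #include <stdint.h>
  #include <stdio.h>

  typedef struct {
    atomic_uint_fast32_t tile_index;  // next tile to hand out, shared across shards
    uint32_t tile_count;              // total tiles in the flattened XYZ grid
    uint32_t tiles_per_reservation;   // batch size each shard grabs per fetch
  } grid_t;

  // One shard's inner loop: reserve a batch, process it, and repeat until the
  // shared counter runs past the end of the grid.
  static void run_shard(grid_t* grid, int worker_id) {
    uint32_t base = (uint32_t)atomic_fetch_add_explicit(
        &grid->tile_index, grid->tiles_per_reservation, memory_order_relaxed);
    while (base < grid->tile_count) {
      uint32_t end = base + grid->tiles_per_reservation;
      if (end > grid->tile_count) end = grid->tile_count;
      for (uint32_t i = base; i < end; ++i) {
        printf("worker %d: tile %u\n", worker_id, i);  // stand-in for the tile closure
      }
      base = (uint32_t)atomic_fetch_add_explicit(
          &grid->tile_index, grid->tiles_per_reservation, memory_order_relaxed);
    }
  }

  int main(void) {
    grid_t grid = {.tile_count = 16, .tiles_per_reservation = 4};
    atomic_init(&grid.tile_index, 0);
    run_shard(&grid, 0);  // in the executor each shard runs on its own worker
    run_shard(&grid, 1);  // thread and pulls from the same shared counter
    return 0;
  }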
diff --git a/iree/task/executor.c b/iree/task/executor.c
index 1f21615..13d6b27 100644
--- a/iree/task/executor.c
+++ b/iree/task/executor.c
@@ -105,22 +105,15 @@
allocator, &executor->wait_set);
}
- // Pool used for all dispatch->slice fanout tasks. These only live within the
- // executor and since we know the precise lifetime of them we can keep them
- // entirely within the system here.
- if (iree_status_is_ok(status)) {
- status = iree_task_pool_initialize(allocator, sizeof(iree_task_fence_t), 8,
- &executor->fence_task_pool);
- }
+ // Pool used for all fanout tasks. These only live within the executor and
+ // since we know the precise lifetime of them we can keep them entirely within
+ // the system here.
if (iree_status_is_ok(status)) {
status = iree_task_pool_initialize(
allocator,
- iree_max(sizeof(iree_task_dispatch_shard_t),
- sizeof(iree_task_dispatch_slice_t)),
- worker_count *
- iree_max(IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,
- IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER),
- &executor->dispatch_task_pool);
+ iree_max(sizeof(iree_task_fence_t), sizeof(iree_task_dispatch_shard_t)),
+ worker_count * IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,
+ &executor->transient_task_pool);
}
// Bring up the workers; the threads will be created here but be suspended
@@ -201,8 +194,7 @@
iree_slim_mutex_deinitialize(&executor->coordinator_mutex);
iree_atomic_task_slist_deinitialize(&executor->incoming_ready_slist);
iree_atomic_task_slist_deinitialize(&executor->incoming_waiting_slist);
- iree_task_pool_deinitialize(&executor->fence_task_pool);
- iree_task_pool_deinitialize(&executor->dispatch_task_pool);
+ iree_task_pool_deinitialize(&executor->transient_task_pool);
iree_allocator_free(executor->allocator, executor);
IREE_TRACE_ZONE_END(z0);
@@ -226,7 +218,7 @@
// guarantee. We'd need some global executor lock that we did here and
// on submit - or rework pools to not have this limitation.
// iree_task_pool_trim(&executor->fence_task_pool);
- // iree_task_pool_trim(&executor->dispatch_task_pool);
+ // iree_task_pool_trim(&executor->transient_task_pool);
}
iree_event_pool_t* iree_task_executor_event_pool(
@@ -239,10 +231,10 @@
iree_task_fence_t** out_fence) {
*out_fence = NULL;
iree_task_fence_t* fence = NULL;
- IREE_RETURN_IF_ERROR(iree_task_pool_acquire(&executor->fence_task_pool,
+ IREE_RETURN_IF_ERROR(iree_task_pool_acquire(&executor->transient_task_pool,
(iree_task_t**)&fence));
iree_task_fence_initialize(scope, fence);
- fence->header.pool = &executor->fence_task_pool;
+ fence->header.pool = &executor->transient_task_pool;
*out_fence = fence;
return iree_ok_status();
}
@@ -283,8 +275,7 @@
// Doesn't do anything; just retire and continue on to any dependents.
iree_task_nop_retire((iree_task_nop_t*)task, pending_submission);
break;
- case IREE_TASK_TYPE_CALL:
- case IREE_TASK_TYPE_DISPATCH_SLICE: {
+ case IREE_TASK_TYPE_CALL: {
// Generic routing to workers for tasks that should always run there.
iree_task_executor_relay_to_worker(executor, post_batch, task);
break;
@@ -319,15 +310,9 @@
iree_task_dispatch_retire((iree_task_dispatch_t*)task,
pending_submission);
} else {
- if (task->flags & IREE_TASK_FLAG_DISPATCH_SLICED) {
- iree_task_dispatch_issue_sliced((iree_task_dispatch_t*)task,
- &executor->dispatch_task_pool,
- pending_submission, post_batch);
- } else {
- iree_task_dispatch_issue_sharded((iree_task_dispatch_t*)task,
- &executor->dispatch_task_pool,
- pending_submission, post_batch);
- }
+ iree_task_dispatch_issue((iree_task_dispatch_t*)task,
+ &executor->transient_task_pool,
+ pending_submission, post_batch);
}
break;
}
@@ -598,7 +583,7 @@
// any changes then.
//
// As we schedule tasks we may spawn new ones (like a dispatch -> many
- // dispatch slices) and we keep track of those here. By doing a pass through
+ // dispatch shards) and we keep track of those here. By doing a pass through
// all ready tasks and only then merging in the new submission we get
// breadth-first traversal of task graphs even if they originate from
// various places and have no relation - hopefully leading to better average
diff --git a/iree/task/executor.h b/iree/task/executor.h
index c82db3f..4a8cdf9 100644
--- a/iree/task/executor.h
+++ b/iree/task/executor.h
@@ -67,7 +67,7 @@
// - latency prioritization by partitioning workloads by priority
// - scheduling overhead tradeoffs by varying:
// - coordination/flush frequency to reduce cross-thread communication
-// - by statically inserting dispatch slices to avoid dynamic fan-out
+// - by statically inserting dispatch shards to avoid dynamic fan-out
// - thread donation to avoid likely context switches upon submit+wait
// - multi-wait across all users by sharing a wait set
// - per-worker work-stealing specification of victim workers in the topology
diff --git a/iree/task/executor_impl.h b/iree/task/executor_impl.h
index 3256f77..13f66ae 100644
--- a/iree/task/executor_impl.h
+++ b/iree/task/executor_impl.h
@@ -43,8 +43,12 @@
// Pools of transient dispatch tasks shared across all workers.
// Depending on configuration the task pool may allocate after creation using
// the allocator provided upon executor creation.
- iree_task_pool_t fence_task_pool;
- iree_task_pool_t dispatch_task_pool;
+ //
+ // Sized to be able to fit at least:
+ // iree_task_fence_t
+ // iree_task_dispatch_shard_t
+ // Increasing the size beyond these will waste memory.
+ iree_task_pool_t transient_task_pool;
// A list of incoming tasks that are ready to execute immediately.
// The list is LIFO and we require that task lists are reversed by the
diff --git a/iree/task/executor_test.cc b/iree/task/executor_test.cc
index fac31e9..4c2a0a1 100644
--- a/iree/task/executor_test.cc
+++ b/iree/task/executor_test.cc
@@ -57,9 +57,10 @@
iree_task_executor_t* executor = NULL;
iree_task_scheduling_mode_t scheduling_mode =
IREE_TASK_SCHEDULING_MODE_RESERVED;
- IREE_CHECK_OK(iree_task_executor_create(
- scheduling_mode, &topology,
- /*worker_local_memory_size=*/(64 * 1024), allocator, &executor));
+ iree_host_size_t worker_local_memory_size = 0; // 64 * 1024;
+ IREE_CHECK_OK(iree_task_executor_create(scheduling_mode, &topology,
+ worker_local_memory_size, allocator,
+ &executor));
iree_task_topology_deinitialize(&topology);
//
@@ -96,7 +97,6 @@
},
0),
workgroup_size_0, workgroup_count_0, &dispatch0);
- // dispatch0.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
const uint32_t workgroup_size_1[3] = {128, 1, 1};
const uint32_t workgroup_count_1[3] = {16, 2, 1};
@@ -115,7 +115,6 @@
},
0),
workgroup_size_1, workgroup_count_1, &dispatch1);
- dispatch1.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
//
iree_task_call_t call1;
diff --git a/iree/task/task.c b/iree/task/task.c
index 01b9196..5d79e20 100644
--- a/iree/task/task.c
+++ b/iree/task/task.c
@@ -122,7 +122,6 @@
}
case IREE_TASK_TYPE_WAIT:
case IREE_TASK_TYPE_DISPATCH:
- case IREE_TASK_TYPE_DISPATCH_SLICE:
break;
}
@@ -449,6 +448,12 @@
out_task->local_memory_size = 0;
iree_atomic_store_intptr(&out_task->status, 0, iree_memory_order_release);
memset(&out_task->statistics, 0, sizeof(out_task->statistics));
+
+ IREE_TRACE({
+ static iree_atomic_int64_t next_dispatch_id = IREE_ATOMIC_VAR_INIT(0);
+ out_task->dispatch_id = iree_atomic_fetch_add_int64(
+ &next_dispatch_id, 1ll, iree_memory_order_acq_rel);
+ });
}
void iree_task_dispatch_initialize(iree_task_scope_t* scope,
@@ -470,131 +475,18 @@
out_task->workgroup_count.ptr = workgroup_count_ptr;
}
-void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
- iree_task_pool_t* slice_task_pool,
- iree_task_submission_t* pending_submission,
- iree_task_post_batch_t* post_batch) {
+void iree_task_dispatch_issue(iree_task_dispatch_t* dispatch_task,
+ iree_task_pool_t* shard_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch) {
IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
// Mark the dispatch as having been issued; the next time it retires it'll be
// because all work has completed.
dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
// Fetch the workgroup count (directly or indirectly).
- // By the task being ready to execute we know any dependencies on the
- // indirection buffer have been satisfied and its safe to read.
- uint32_t workgroup_count[3];
- if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
- memcpy(workgroup_count, dispatch_task->workgroup_count.ptr,
- sizeof(workgroup_count));
- } else {
- memcpy(workgroup_count, dispatch_task->workgroup_count.value,
- sizeof(workgroup_count));
- }
- uint32_t total_workgroup_count =
- workgroup_count[0] * workgroup_count[1] * workgroup_count[2];
- if (total_workgroup_count == 0) {
- // No workgroups to execute - bail early.
- iree_task_dispatch_retire(dispatch_task, pending_submission);
- IREE_TRACE_ZONE_END(z0);
- return;
- }
-
-#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
- char xyz_string[32];
- int xyz_string_length =
- snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
- workgroup_count[0], workgroup_count[1], workgroup_count[2]);
- IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
-#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
-
- // Divide up all tiles into slices, our finest-granularity scheduling task.
- const uint32_t tiles_per_slice_x = IREE_TASK_DISPATCH_TILES_PER_SLICE_X;
- const uint32_t tiles_per_slice_y = IREE_TASK_DISPATCH_TILES_PER_SLICE_Y;
- const uint32_t tiles_per_slice_z = IREE_TASK_DISPATCH_TILES_PER_SLICE_Z;
- uint32_t slice_count_x = iree_max(1, workgroup_count[0] / tiles_per_slice_x);
- uint32_t slice_count_y = iree_max(1, workgroup_count[1] / tiles_per_slice_y);
- uint32_t slice_count_z = iree_max(1, workgroup_count[2] / tiles_per_slice_z);
-
- // Compute how many slices each worker will process.
- uint32_t slice_count = slice_count_x * slice_count_y * slice_count_z;
- iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
- uint32_t slices_per_worker = iree_max(1, slice_count / worker_count);
-
- // Randomize starting worker.
- iree_host_size_t worker_offset = iree_task_post_batch_select_worker(
- post_batch, dispatch_task->header.affinity_set);
- iree_host_size_t worker_index = worker_offset;
-
- // TODO(benvanik): rework this with some science. For now we just iteratively
- // divide up the space from outer->inner scheduling dimension, but ideally
- // we'd use some fun cray-style torus scheduling or hilbert curve magic to
- // try to ensure better locality using worker constructive sharing masks.
- // TODO(benvanik): observe affinity_set here when dividing ranges.
- iree_host_size_t worker_slice_count = 0;
- for (uint32_t slice_z = 0; slice_z < slice_count_z; ++slice_z) {
- for (uint32_t slice_y = 0; slice_y < slice_count_y; ++slice_y) {
- for (uint32_t slice_x = 0; slice_x < slice_count_x; ++slice_x) {
- uint32_t workgroup_base[3];
- workgroup_base[0] = slice_x * tiles_per_slice_x;
- workgroup_base[1] = slice_y * tiles_per_slice_y;
- workgroup_base[2] = slice_z * tiles_per_slice_z;
- uint32_t workgroup_range[3];
- workgroup_range[0] = iree_min(workgroup_count[0],
- workgroup_base[0] + tiles_per_slice_x) -
- 1;
- workgroup_range[1] = iree_min(workgroup_count[1],
- workgroup_base[1] + tiles_per_slice_y) -
- 1;
- workgroup_range[2] = iree_min(workgroup_count[2],
- workgroup_base[2] + tiles_per_slice_z) -
- 1;
-
- // Allocate and initialize the slice.
- iree_task_dispatch_slice_t* slice_task =
- iree_task_dispatch_slice_allocate(dispatch_task, workgroup_base,
- workgroup_range, workgroup_count,
- slice_task_pool);
-
- // Enqueue on the worker selected for the task.
- iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
- &slice_task->header);
- if (++worker_slice_count >= slices_per_worker) {
- ++worker_index;
- worker_slice_count = 0;
- }
- }
- }
- }
-
- // NOTE: the dispatch is not retired until all slices complete. Upon the last
- // slice completing the lucky worker will retire the task inline and
- // potentially queue up more ready tasks that follow.
- //
- // The gotcha here is that it's possible for there to be zero slices within
- // a dispatch (if, for example, and indirect dispatch had its workgroup counts
- // set to zero to prevent it from running). We check for that here.
- if (slice_count == 0) {
- iree_task_dispatch_retire(dispatch_task, pending_submission);
- }
-
- IREE_TRACE_ZONE_END(z0);
-}
-
-void iree_task_dispatch_issue_sharded(
- iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
- iree_task_submission_t* pending_submission,
- iree_task_post_batch_t* post_batch) {
- IREE_TRACE_ZONE_BEGIN(z0);
-
- // Mark the dispatch as having been issued; the next time it retires it'll be
- // because all work has completed.
- dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
-
- iree_task_dispatch_shard_state_t* shared_state =
- &dispatch_task->shared.shard_state;
-
- // Fetch the workgroup count (directly or indirectly).
if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
// By the task being ready to execute we know any dependencies on the
// indirection buffer have been satisfied and its safe to read. We perform
@@ -609,35 +501,35 @@
}
const uint32_t* workgroup_count = dispatch_task->workgroup_count.value;
-#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
- char xyz_string[32];
- int xyz_string_length =
- snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
- workgroup_count[0], workgroup_count[1], workgroup_count[2]);
- IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
-#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+ IREE_TRACE({
+ char xyz_string[32];
+ int xyz_string_length =
+ snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
+ workgroup_count[0], workgroup_count[1], workgroup_count[2]);
+ IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+ });
// Setup the iteration space for shards to pull work from the complete grid.
- iree_atomic_store_int32(&shared_state->tile_index, 0,
+ iree_atomic_store_int32(&dispatch_task->tile_index, 0,
iree_memory_order_relaxed);
- shared_state->tile_count =
+ dispatch_task->tile_count =
workgroup_count[0] * workgroup_count[1] * workgroup_count[2];
// Compute shard count - almost always worker_count unless we are a very small
// dispatch (1x1x1, etc).
iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
iree_host_size_t shard_count =
- iree_min(shared_state->tile_count, worker_count);
+ iree_min(dispatch_task->tile_count, worker_count);
// Compute how many tiles we want each shard to reserve at a time from the
// larger grid. A higher number reduces overhead and improves locality while
// a lower number reduces maximum worst-case latency (coarser work stealing).
- if (shared_state->tile_count <
+ if (dispatch_task->tile_count <
worker_count * IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION) {
// Grid is small - allow it to be eagerly sliced up.
- shared_state->tiles_per_reservation = 1;
+ dispatch_task->tiles_per_reservation = 1;
} else {
- shared_state->tiles_per_reservation =
+ dispatch_task->tiles_per_reservation =
IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION;
}
@@ -648,8 +540,8 @@
for (iree_host_size_t i = 0; i < shard_count; ++i) {
// Allocate and initialize the shard.
- iree_task_dispatch_shard_t* shard_task = iree_task_dispatch_shard_allocate(
- dispatch_task, shared_state, shard_task_pool);
+ iree_task_dispatch_shard_t* shard_task =
+ iree_task_dispatch_shard_allocate(dispatch_task, shard_task_pool);
// Enqueue on the worker selected for the task.
iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
@@ -674,6 +566,7 @@
void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
iree_task_submission_t* pending_submission) {
IREE_TRACE_ZONE_BEGIN(z0);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
// TODO(benvanik): attach statistics to the tracy zone.
@@ -684,14 +577,14 @@
&dispatch_task->header.scope->dispatch_statistics);
// Consume the status of the dispatch that may have been set from a workgroup
- // and notify the scope. We need to do this here so that each slice/shard
- // retires before we discard any subsequent tasks: otherwise a failure of
- // one shard would discard the shared dispatch task (and potentially
- // everything) while other shards were still running. We also want to avoid
- // fine-grained synchronization across slices/shards that would occur by each
- // checking to see if any other has hit an error; failure in a dispatch should
- // be so exceedingly rare that allowing some shards to complete after one
- // encounters an error is not a problem.
+ // and notify the scope. We need to do this here so that each shard retires
+ // before we discard any subsequent tasks: otherwise a failure of one shard
+ // would discard the shared dispatch task (and potentially everything) while
+ // other shards were still running. We also want to avoid fine-grained
+ // synchronization across shards that would occur by each checking to see if
+ // any other has hit an error; failure in a dispatch should be so exceedingly
+ // rare that allowing some shards to complete after one encounters an error is
+ // not a problem.
iree_status_t status = (iree_status_t)iree_atomic_exchange_intptr(
&dispatch_task->status, 0, iree_memory_order_seq_cst);
@@ -700,168 +593,23 @@
}
//==============================================================================
-// IREE_TASK_TYPE_DISPATCH_SLICE
-//==============================================================================
-
-void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
- const uint32_t workgroup_base[3],
- const uint32_t workgroup_range[3],
- const uint32_t workgroup_count[3],
- iree_task_dispatch_slice_t* out_task) {
- iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SLICE,
- dispatch_task->header.scope, &out_task->header);
- iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
- out_task->closure = dispatch_task->closure;
- out_task->dispatch_status = &dispatch_task->status;
-
- memcpy(out_task->workgroup_base, workgroup_base,
- sizeof(out_task->workgroup_base));
- memcpy(out_task->workgroup_range, workgroup_range,
- sizeof(out_task->workgroup_range));
- memcpy(out_task->workgroup_size, dispatch_task->workgroup_size,
- sizeof(out_task->workgroup_size));
- memcpy(out_task->workgroup_count, workgroup_count,
- sizeof(out_task->workgroup_count));
-
- // Each slice requires at most this amount of memory from the worker-local
- // pool.
- out_task->local_memory_size = dispatch_task->local_memory_size;
-
- // Wire up dispatch statistics; we'll track on the slice while we run and
- // then the per-slice statistics will roll up into the dispatch statistics.
- out_task->dispatch_statistics = &dispatch_task->statistics;
- memset(&out_task->slice_statistics, 0, sizeof(out_task->slice_statistics));
-}
-
-iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
- iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
- const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
- iree_task_pool_t* slice_task_pool) {
- iree_task_dispatch_slice_t* slice_task = NULL;
- iree_status_t status =
- iree_task_pool_acquire(slice_task_pool, (iree_task_t**)&slice_task);
- if (!iree_status_is_ok(status)) {
- iree_status_ignore(status);
- return NULL;
- }
- iree_task_dispatch_slice_initialize(dispatch_task, workgroup_base,
- workgroup_range, workgroup_count,
- slice_task);
- slice_task->header.pool = slice_task_pool;
- return slice_task;
-}
-
-void iree_task_dispatch_slice_execute(
- iree_task_dispatch_slice_t* task, iree_byte_span_t local_memory,
- iree_task_submission_t* pending_submission) {
- IREE_TRACE_ZONE_BEGIN(z0);
- IREE_TRACE_ZONE_SET_COLOR(z0,
- iree_math_ptr_to_xrgb(task->closure.user_context));
-
- // TODO(benvanik): coroutine support. Ideally this function can be called
- // multiple times for the same slice, and we'll have a way to ready up the
- // slices on the same workers (some per-worker suspended list?).
-
- // Prepare context shared for all tiles in the slice.
- iree_task_tile_context_t tile_context;
- memcpy(&tile_context.workgroup_size, task->workgroup_size,
- sizeof(tile_context.workgroup_size));
- memcpy(&tile_context.workgroup_count, task->workgroup_count,
- sizeof(tile_context.workgroup_count));
- tile_context.statistics = &task->slice_statistics;
-
- // Map only the requested amount of worker local memory into the tile context.
- // This ensures that how much memory is used by some executions does not
- // inadvertently leak over into other executions.
- if (IREE_UNLIKELY(task->local_memory_size > local_memory.data_length)) {
- iree_task_retire(
- &task->header, pending_submission,
- iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
- "dispatch requires %ub of local memory but only "
- "%zub is available per-worker",
- task->local_memory_size, local_memory.data_length));
- IREE_TRACE_ZONE_END(z0);
- return;
- }
- tile_context.local_memory =
- iree_make_byte_span(local_memory.data, task->local_memory_size);
-
- iree_status_t status = iree_ok_status();
- const uint32_t base_x = task->workgroup_base[0];
- const uint32_t base_y = task->workgroup_base[1];
- const uint32_t base_z = task->workgroup_base[2];
- const uint32_t range_x = task->workgroup_range[0];
- const uint32_t range_y = task->workgroup_range[1];
- const uint32_t range_z = task->workgroup_range[2];
- for (uint32_t z = base_z; z <= range_z; ++z) {
- tile_context.workgroup_xyz[2] = z;
- for (uint32_t y = base_y; y <= range_y; ++y) {
- tile_context.workgroup_xyz[1] = y;
- for (uint32_t x = base_x; x <= range_x; ++x) {
- tile_context.workgroup_xyz[0] = x;
- IREE_TRACE_ZONE_BEGIN_NAMED(z_tile,
- "iree_task_dispatch_slice_execute_tile");
- IREE_TRACE_ZONE_SET_COLOR(z_tile,
- iree_task_tile_to_color(&tile_context));
-
- // NOTE: these are useful for debugging but dramatically increase our
- // cost here; only enable if needed for tracking work distribution:
- IREE_TRACE_ZONE_APPEND_VALUE(z_tile, x);
- IREE_TRACE_ZONE_APPEND_VALUE(z_tile, y);
- IREE_TRACE_ZONE_APPEND_VALUE(z_tile, z);
- // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
-
- status = task->closure.fn(task->closure.user_context, &tile_context,
- pending_submission);
-
- IREE_TRACE_ZONE_END(z_tile);
-
- // If any tile fails we bail early from the loop. This doesn't match
- // what an accelerator would do but saves some unneeded work.
- // Note that other slices may have completed execution, be executing
- // concurrently with this one, or still be pending - this does not
- // have any influence on them and they may continue to execute even
- // after we bail from here.
- if (!iree_status_is_ok(status)) goto abort_slice;
- }
- }
- }
-abort_slice:
-
- // Push aggregate statistics up to the dispatch.
- if (task->dispatch_statistics) {
- iree_task_dispatch_statistics_merge(&task->slice_statistics,
- task->dispatch_statistics);
- }
-
- // Propagate failures to the dispatch task.
- if (!iree_status_is_ok(status)) {
- iree_task_try_set_status(task->dispatch_status, status);
- }
-
- iree_task_retire(&task->header, pending_submission, iree_ok_status());
- IREE_TRACE_ZONE_END(z0);
-}
-
-//==============================================================================
// IREE_TASK_TYPE_DISPATCH_SHARD
//==============================================================================
-void iree_task_dispatch_shard_initialize(
- iree_task_dispatch_t* dispatch_task,
- iree_task_dispatch_shard_state_t* shared_state,
- iree_task_dispatch_shard_t* out_task) {
+static inline iree_task_dispatch_t* iree_task_dispatch_shard_parent(
+ iree_task_dispatch_shard_t* task) {
+ return (iree_task_dispatch_t*)task->header.completion_task;
+}
+
+void iree_task_dispatch_shard_initialize(iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_t* out_task) {
iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SHARD,
dispatch_task->header.scope, &out_task->header);
iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
- out_task->dispatch_task = dispatch_task;
- out_task->shared_state = shared_state;
}
iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
- iree_task_dispatch_t* dispatch_task,
- iree_task_dispatch_shard_state_t* shared_state,
- iree_task_pool_t* shard_task_pool) {
+ iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool) {
iree_task_dispatch_shard_t* shard_task = NULL;
iree_status_t status =
iree_task_pool_acquire(shard_task_pool, (iree_task_t**)&shard_task);
@@ -869,22 +617,41 @@
iree_status_ignore(status);
return NULL;
}
- iree_task_dispatch_shard_initialize(dispatch_task, shared_state, shard_task);
+ iree_task_dispatch_shard_initialize(dispatch_task, shard_task);
shard_task->header.pool = shard_task_pool;
return shard_task;
}
void iree_task_dispatch_shard_execute(
- iree_task_dispatch_shard_t* task, iree_byte_span_t local_memory,
+ iree_task_dispatch_shard_t* task, iree_byte_span_t worker_local_memory,
iree_task_submission_t* pending_submission) {
IREE_TRACE_ZONE_BEGIN(z0);
- iree_task_dispatch_t* dispatch_task = task->dispatch_task;
+ iree_task_dispatch_t* dispatch_task = iree_task_dispatch_shard_parent(task);
+ IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
IREE_TRACE_ZONE_SET_COLOR(
z0, iree_math_ptr_to_xrgb(dispatch_task->closure.user_context));
+ // Map only the requested amount of worker local memory into the tile context.
+ // This ensures that how much memory is used by some executions does not
+ // inadvertently leak over into other executions.
+ if (IREE_UNLIKELY(dispatch_task->local_memory_size >
+ worker_local_memory.data_length)) {
+ iree_task_try_set_status(
+ &dispatch_task->status,
+ iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+ "dispatch requires %ub of local memory but only "
+ "%zub is available per-worker",
+ dispatch_task->local_memory_size,
+ worker_local_memory.data_length));
+ iree_task_retire(&task->header, pending_submission, iree_ok_status());
+ IREE_TRACE_ZONE_END(z0);
+ return;
+ }
+ iree_byte_span_t local_memory = iree_make_byte_span(
+ worker_local_memory.data, dispatch_task->local_memory_size);
+
// Prepare context shared for all tiles in the shard.
- iree_task_dispatch_shard_state_t* shared_state = task->shared_state;
iree_task_tile_context_t tile_context;
memcpy(&tile_context.workgroup_size, dispatch_task->workgroup_size,
sizeof(tile_context.workgroup_size));
@@ -892,24 +659,7 @@
sizeof(tile_context.workgroup_count));
uint32_t workgroup_count_x = tile_context.workgroup_count[0];
uint32_t workgroup_count_y = tile_context.workgroup_count[1];
-
- // Map only the requested amount of worker local memory into the tile context.
- // This ensures that how much memory is used by some executions does not
- // inadvertently leak over into other executions.
- if (IREE_UNLIKELY(dispatch_task->local_memory_size >
- local_memory.data_length)) {
- iree_task_retire(
- &task->header, pending_submission,
- iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
- "dispatch requires %ub of local memory but only "
- "%zub is available per-worker",
- dispatch_task->local_memory_size,
- local_memory.data_length));
- IREE_TRACE_ZONE_END(z0);
- return;
- }
- tile_context.local_memory =
- iree_make_byte_span(local_memory.data, dispatch_task->local_memory_size);
+ tile_context.local_memory = local_memory;
// We perform all our shard statistics work locally here and only push back to
// the dispatch at the end; this avoids contention from each shard trying to
@@ -919,10 +669,9 @@
tile_context.statistics = &shard_statistics;
// Loop over all tiles until they are all processed.
- iree_status_t status = iree_ok_status();
- const uint32_t tile_count = shared_state->tile_count;
- const uint32_t tiles_per_reservation = shared_state->tiles_per_reservation;
- uint32_t tile_base = iree_atomic_fetch_add_int32(&shared_state->tile_index,
+ const uint32_t tile_count = dispatch_task->tile_count;
+ const uint32_t tiles_per_reservation = dispatch_task->tiles_per_reservation;
+ uint32_t tile_base = iree_atomic_fetch_add_int32(&dispatch_task->tile_index,
tiles_per_reservation,
iree_memory_order_relaxed);
while (tile_base < tile_count) {
@@ -950,35 +699,40 @@
IREE_TRACE_ZONE_APPEND_VALUE(z_tile, tile_context.workgroup_xyz[2]);
// IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
- status = dispatch_task->closure.fn(dispatch_task->closure.user_context,
- &tile_context, pending_submission);
+ iree_status_t status =
+ dispatch_task->closure.fn(dispatch_task->closure.user_context,
+ &tile_context, pending_submission);
IREE_TRACE_ZONE_END(z_tile);
// If any tile fails we bail early from the loop. This doesn't match
// what an accelerator would do but saves some unneeded work.
- // Note that other slices may have completed execution, be executing
+ // Note that other shards may have completed execution, be executing
// concurrently with this one, or still be pending - this does not
// have any influence on them and they may continue to execute even
// after we bail from here.
- if (!iree_status_is_ok(status)) goto abort_shard;
+ if (!iree_status_is_ok(status)) {
+ // Propagate failures to the dispatch task.
+ iree_task_try_set_status(&dispatch_task->status, status);
+ goto abort_shard; // out of the while-for nest
+ }
}
- tile_base = iree_atomic_fetch_add_int32(&shared_state->tile_index,
+ // Try to grab the next batch of tiles.
+ tile_base = iree_atomic_fetch_add_int32(&dispatch_task->tile_index,
tiles_per_reservation,
iree_memory_order_relaxed);
}
abort_shard:
// Push aggregate statistics up to the dispatch.
+ // Note that we may have partial information here if we errored out of the
+ // loop but that's still useful to know.
iree_task_dispatch_statistics_merge(&shard_statistics,
&dispatch_task->statistics);
- // Propagate failures to the dispatch task.
- if (!iree_status_is_ok(status)) {
- iree_task_try_set_status(&dispatch_task->status, status);
- }
-
+ // NOTE: even if an error was hit we retire OK - the error has already been
+ // propagated to the dispatch and it'll clean up after all shards are joined.
iree_task_retire(&task->header, pending_submission, iree_ok_status());
IREE_TRACE_ZONE_END(z0);
}
diff --git a/iree/task/task.h b/iree/task/task.h
index a4a9626..235a8d9 100644
--- a/iree/task/task.h
+++ b/iree/task/task.h
@@ -54,9 +54,8 @@
IREE_TASK_TYPE_WAIT = 4u,
// Task is a 3D grid dispatch of zero or more tiles.
- // Dispatches are issued when ready by either being split into zero or more
- // slices with one or more tiles each based on the workgroup count or one
- // shard per worker that should process the dispatch.
+ // Dispatches are issued when ready by being split into one shard per worker
+ // that should process the dispatch.
//
// If IREE_TASK_FLAG_DISPATCH_INDIRECT is set then the dispatch reads the
// workgroup count from a buffer immediately prior to fan-out instead of using
@@ -64,26 +63,21 @@
//
// After a dispatch has been issued the IREE_TASK_FLAG_DISPATCH_RETIRE flag is
// set to indicate that when the dispatch becomes ready again it will be after
- // all slices have completed.
+ // all shards have completed.
IREE_TASK_TYPE_DISPATCH = 5u,
- // Task is a slice of a larger contiguous dispatch tile range. The full
- // dispatch will be sliced into zero or more slices and each slice will be
- // posted to a particular worker for execution. If work progresses unevenly
- // then entire slices will be stolen across workers to balance out the timing.
- // Slices retire once they have completed the tiles assigned to them.
- IREE_TASK_TYPE_DISPATCH_SLICE = 6u,
-
// Task is one of potentially many shards processing a larger dispatch grid.
// Each shard may have a preference as to which parts of grid it will focus
// on but is able to otherwise steal any available region directly from the
// shared dispatch coordination state. Shards retire once there are no more
// tiles remaining in the dispatch grid.
- IREE_TASK_TYPE_DISPATCH_SHARD = 7u,
+ IREE_TASK_TYPE_DISPATCH_SHARD = 6u,
};
typedef uint8_t iree_task_type_t;
enum iree_task_flag_bits_t {
+ IREE_TASK_FLAG_NONE = 0u,
+
// The wait handle the task is specified to wait on has resolved and the task
// can now be considered complete.
IREE_TASK_FLAG_WAIT_COMPLETED = 1u << 0,
@@ -95,23 +89,17 @@
// issued.
IREE_TASK_FLAG_DISPATCH_INDIRECT = 1u << 1,
- // The dispatch should be sliced across workers via the low-contention
- // IREE_TASK_TYPE_DISPATCH_SLICE mechanism. This moves the dispatch overhead
- // to the time when the grid is sliced for a savings during when the grid is
- // executed.
- IREE_TASK_FLAG_DISPATCH_SLICED = 1u << 2,
-
// The dispatch has been issued and the task is waiting for one or more
- // slices to complete. After they complete the dispatch will be readied and
+ // shards to complete. After they complete the dispatch will be readied and
// can be retired.
//
// Though added by the executor after issuing a dispatch users can also set
- // this to indicate that all dispatch slices for a particular dispatch have
+ // this to indicate that all dispatch shards for a particular dispatch have
// been statically scheduled. Executors will then skip issuing the dispatch
- // and instead wait until all slices complete, enabling IREE_TASK_TYPE_BARRIER
+ // and instead wait until all shards complete, enabling IREE_TASK_TYPE_BARRIER
// behavior but without an additional task as dispatches are still required
- // to store information for slices.
- IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 3,
+ // to store information for shards.
+ IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 2,
// An error occurred at or before the task and it has been aborted.
// Aborted tasks may continue to execute if they're already in-flight but must
@@ -120,7 +108,7 @@
// The actual error that occurred is routed to the parent task scope as it
// happens and may be available for querying before all tasks have been
// cleaned up.
- IREE_TASK_FLAG_ABORTED = 1u << 4,
+ IREE_TASK_FLAG_ABORTED = 1u << 3,
};
typedef uint16_t iree_task_flags_t;
@@ -187,6 +175,9 @@
};
static_assert(offsetof(iree_task_t, next_task) == 0,
"next_task intrusive pointer must be at offset 0");
+static_assert(sizeof(iree_task_t) <= 64,
+ "the task header greatly influences pool sizes due to alignment "
+ "requirements and should be kept tiny");
// Initializes a task header with the given type.
// Must be called on all tasks to ensure proper dependency tracking and list
@@ -458,7 +449,7 @@
// Contents are (today) undefined upon entry.
iree_byte_span_t local_memory;
- // Shared statistics counters for the dispatch slice.
+ // Shared statistics counters for the dispatch shard.
iree_task_dispatch_statistics_t* statistics;
// TODO(benvanik): cpuid uarch.
@@ -467,20 +458,6 @@
typedef struct iree_task_dispatch_t iree_task_dispatch_t;
-// Shared state for all shards processing a dispatch.
-typedef iree_alignas(iree_max_align_t) struct {
- // The tail tile index; the next reservation will start from here.
- iree_atomic_int32_t tile_index;
-
- // The total number of tiles in the dispatch bounding tile_index.
- uint32_t tile_count;
-
- // Maximum number of tiles to fetch per tile reservation from the grid.
- // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
- // reasonable number chosen based on the tile and shard counts.
- uint32_t tiles_per_reservation;
-} iree_task_dispatch_shard_state_t;
-
//==============================================================================
// Dispatch function closures
//==============================================================================
@@ -516,10 +493,10 @@
//==============================================================================
// An execution request across a tiled grid.
-// Dispatches are fork points where zero or more dispatch slice tasks are
+// Dispatches are fork points where zero or more dispatch shard tasks are
// spawned and processed prior to joining again on the dispatch completion task.
//
-// The total workgroup count indicates the [x,y,z] extents of the dispatch grid.
+// The total workgroup count defines the [x,y,z] extents of the dispatch grid.
// The count may either be embedded directly into the dispatch or provided as a
// pointer to the workgroup_count[3] that will be read immediately prior to
// forking. If any dimension of the workgroup count is zero then the dispatch is
@@ -527,11 +504,11 @@
//
// Example:
// dispatch([5, 1, 1])
-// forked into slices based on affinity/scheduling parameters:
-// -> dispatch_slice([0-1, 1, 1])
-// -> dispatch_slice([2-3, 1, 1])
-// -> dispatch_slice([4-5, 1, 1])
-// completion_task run after all slices complete
+// forked into shards based on affinity/scheduling parameters:
+// -> dispatch_shard for core 0, processes [0-1, 1, 1]
+// -> dispatch_shard for core 1, processes [2-3, 1, 1]
+// -> dispatch_shard for core 2, processes [4, 1, 1]
+// completion_task runs after all shards complete
typedef iree_alignas(iree_max_align_t) struct iree_task_dispatch_t {
// Task header: implementation detail, do not use.
iree_task_t header;
@@ -560,20 +537,32 @@
uint32_t local_memory_size;
// Resulting status from the dispatch available once all workgroups have
- // completed (or would have completed). If multiple shards/slices processing
- // the workgroups hit an error the first will be taken and the result ignored.
- // A dispatch with a non-ok status will mark the parent task scope as failing
+ // completed (or would have completed). If multiple shards processing the
+ // workgroups hit an error the first will be taken and the result ignored. A
+ // dispatch with a non-ok status will mark the parent task scope as failing
// when it retires.
iree_atomic_intptr_t status;
- // Statistics storage used for aggregating counters across all slices.
+ // Statistics storage used for aggregating counters across all shards.
iree_task_dispatch_statistics_t statistics;
- // Shared state across all slices/shards/etc.
- // Stored once per dispatch and then referenced by all subtasks.
- union {
- iree_task_dispatch_shard_state_t shard_state;
- } shared;
+ // The total number of tiles in the dispatch bounding tile_index.
+ uint32_t tile_count;
+
+ // Maximum number of tiles to fetch per tile reservation from the grid.
+ // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
+ // reasonable number chosen based on the tile and shard counts.
+ uint32_t tiles_per_reservation;
+
+ // The tail tile index; the next reservation will start from here.
+ // This is used by shards to slice off the work to perform in their inner
+ // loop. Ideally we'd have no destructive interference with other shared data
+ // in this structure but the shared parts (status/statistics) are updated
+ // only once per shard and are less of a concern.
+ iree_atomic_int32_t tile_index;
+
+ // Incrementing process-lifetime dispatch identifier.
+ IREE_TRACE(int64_t dispatch_id;)
} iree_task_dispatch_t;
void iree_task_dispatch_initialize(iree_task_scope_t* scope,
@@ -588,86 +577,6 @@
iree_task_dispatch_t* out_task);
//==============================================================================
-// IREE_TASK_TYPE_DISPATCH_SLICE
-//==============================================================================
-
-// TODO(benvanik): per-region dependencies (allow slices to execute directly
-// across dispatches).
-
-// A slice of tiles within a larger dispatch grid.
-// These tasks are designed to be synthesized by the task system when processing
-// a dispatch task. Based on the workgroup count, affinity settings, and
-// available executor threads zero or more slices are enqueued, executed, and
-// retired as part of the complete dispatch task. The dispatch is only
-// considered completed and subsquent tasks readied once all slices are
-// complete.
-//
-// Slices aggregate statistics from all tiles within them and then upon
-// completion merge those into the shared dispatch statistics. As slices may
-// suspend and resume several times the dispatch-level statistics should only be
-// read once all slices have completed fully.
-//
-// In general slices represent a contiguous range of tiles along the most
-// rapidly changing dimension (x, then y, then z). This ensures that we at least
-// give the opportunity for cache locality to the tiles as they are processed.
-// If work stealing is enabled then slices may shed their trailing tiles to
-// other threads that have completed all of their work (at a cost of power vs.
-// potential latency savings).
-typedef iree_alignas(iree_max_align_t) struct {
- // Task header: implementation detail, do not use.
- iree_task_t header;
-
- // NOTE: the following fields are mostly replicated from iree_task_dispatch_t.
- // This removes the need for touching the dispatch struct when beginning a
- // tile which would likely be a cache miss as we fan out to other cores.
-
- // Status of the dispatch aggregating failues from all slices.
- iree_atomic_intptr_t* dispatch_status;
-
- // Function closure to call per tile (same as the closure in the dispatch).
- iree_task_dispatch_closure_t closure;
-
- // Base workgroup ID for the slice range.
- uint32_t workgroup_base[3];
- // Total count of tiles within the slice range.
- uint32_t workgroup_range[3];
-
- // Workgroup size for each invocation.
- uint32_t workgroup_size[3];
- // Total workgroup count for the task. Can be used in conjunction with the
- // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
- uint32_t workgroup_count[3];
-
- // Optional transient shared memory size in bytes to allocate and pass into
- // the iree_task_tile_context_t::local_memory of each invocation of the
- // dispatch closure.
- uint32_t local_memory_size;
-
- // Shared statistics counters for the entire dispatch. References the storage
- // held in the parent iree_task_dispatch_t.
- iree_task_dispatch_statistics_t* dispatch_statistics;
- // Statistics just for this single slice. The statistics will be added to the
- // dispatch_statistics after the slice completes to prevent excessive
- // contention on the shared dispatch statistics across multiple threads.
- iree_task_dispatch_statistics_t slice_statistics;
-
- // Per-tile initialized coroutine storage for all tiles in the range
- // initialized as each tile begins execution.
- // TODO(benvanik): coroutine storage as iree_task_tile_storage_t.
-} iree_task_dispatch_slice_t;
-
-// TODO(benvanik): document initialize() for slice pre-planning/embeddeding.
-// This would be useful to reduce latency when the number of slices is small
-// (~<5) as the dispatch wouldn't need to be issued. This can also be used to
-// implement per-region dependencies as direct slice->slice deps vs. fork/join
-// dispatch->dispatch deps. Show how IREE_TASK_FLAG_DISPATCH_RETIRE is set.
-void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
- const uint32_t workgroup_base[3],
- const uint32_t workgroup_range[3],
- const uint32_t workgroup_count[3],
- iree_task_dispatch_slice_t* out_task);
-
-//==============================================================================
// IREE_TASK_TYPE_DISPATCH_SHARD
//==============================================================================
@@ -675,19 +584,12 @@
// Task header: implementation detail, do not use.
iree_task_t header;
- // The root dispatch task that this shard is a part of.
- iree_task_dispatch_t* dispatch_task;
-
- // Active dispatch progress shared across all shards.
- // Each shard will be read/modify/writing this and there's likely to be
- // contention.
- iree_task_dispatch_shard_state_t* shared_state;
+ // NOTE: the parent dispatch task this shard is applied to is in the
+ // header.completion_task field.
} iree_task_dispatch_shard_t;
-void iree_task_dispatch_shard_initialize(
- iree_task_dispatch_t* dispatch_task,
- iree_task_dispatch_shard_state_t* shared_state,
- iree_task_dispatch_shard_t* out_task);
+void iree_task_dispatch_shard_initialize(iree_task_dispatch_t* dispatch_task,
+ iree_task_dispatch_shard_t* out_task);
#ifdef __cplusplus
} // extern "C"
diff --git a/iree/task/task_impl.h b/iree/task/task_impl.h
index 817f245..11639d9 100644
--- a/iree/task/task_impl.h
+++ b/iree/task/task_impl.h
@@ -79,82 +79,44 @@
// IREE_TASK_TYPE_DISPATCH
//==============================================================================
-// Schedules a dispatch by forking out to zero or more slices that will be
-// executed on workers. The slices are allocated from an executor-owned pool
-// and are generally not user-visible - they'll just see their dispatch begin
-// execution prior to the slices and end execution after the last slice
-// finishes.
-//
-// Only called during coordination and expects the coordinator lock to be held.
-void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
- iree_task_pool_t* slice_task_pool,
- iree_task_submission_t* pending_submission,
- iree_task_post_batch_t* post_batch);
-
// Schedules a dispatch by forking out to zero or more shards that will be
// executed on workers. The shards are allocated from an executor-owned pool
// and are generally not user-visible - they'll just see their dispatch begin
-// execution prior to the slices and end execution after the last shard
+// execution prior to the shards and end execution after the last shard
// finishes.
//
// Only called during coordination and expects the coordinator lock to be held.
-void iree_task_dispatch_issue_sharded(
- iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
- iree_task_submission_t* pending_submission,
- iree_task_post_batch_t* post_batch);
+void iree_task_dispatch_issue(iree_task_dispatch_t* dispatch_task,
+ iree_task_pool_t* shard_task_pool,
+ iree_task_submission_t* pending_submission,
+ iree_task_post_batch_t* post_batch);
-// Retires a dispatch when all issued slices have completed executing.
+// Retires a dispatch when all issued shards have completed executing.
//
// Only called during coordination and expects the coordinator lock to be held.
void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
iree_task_submission_t* pending_submission);
//==============================================================================
-// IREE_TASK_TYPE_DISPATCH_SLICE
-//==============================================================================
-
-// Allocates a dispatch slice task from the shared executor task pool.
-// The slice will be released back to the pool when it has completed execution.
-iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
- iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
- const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
- iree_task_pool_t* slice_task_pool);
-
-// Executes and retires a dispatch slice task.
-// May block the caller for an indeterminate amount of time and should only be
-// called from threads owned by or donated to the executor.
-//
-// |local_memory| is a block of memory exclusively available to the slice
-// during execution. Contents are undefined both before and after execution.
-//
-// Returns ok if all tiles were successfully executed and otherwise returns
-// an unspecified status (probably the first non-ok status hit).
-void iree_task_dispatch_slice_execute(
- iree_task_dispatch_slice_t* task, iree_byte_span_t local_memory,
- iree_task_submission_t* pending_submission);
-
-//==============================================================================
// IREE_TASK_TYPE_DISPATCH_SHARD
//==============================================================================
// Allocates a dispatch shard task from the shared executor task pool.
// The shard will be released back to the pool when it has completed execution.
iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
- iree_task_dispatch_t* dispatch_task,
- iree_task_dispatch_shard_state_t* shared_state,
- iree_task_pool_t* shard_task_pool);
+ iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool);
// Executes and retires a dispatch shard task.
// May block the caller for an indeterminate amount of time and should only be
// called from threads owned by or donated to the executor.
//
-// |local_memory| is a block of memory exclusively available to the shard
+// |worker_local_memory| is a block of memory exclusively available to the shard
// during execution. Contents are undefined both before and after execution.
//
// Errors are propagated to the parent scope and the dispatch will fail once
// all shards have completed.
void iree_task_dispatch_shard_execute(
- iree_task_dispatch_shard_t* task, iree_byte_span_t local_memory,
+ iree_task_dispatch_shard_t* task, iree_byte_span_t worker_local_memory,
iree_task_submission_t* pending_submission);
#ifdef __cplusplus
diff --git a/iree/task/task_test_dispatch.cc b/iree/task/task_test_dispatch.cc
index 74e51c0..19ff45c 100644
--- a/iree/task/task_test_dispatch.cc
+++ b/iree/task/task_test_dispatch.cc
@@ -84,56 +84,28 @@
}
};
-TEST_F(TaskDispatchTest, Issue000Sharded) {
+TEST_F(TaskDispatchTest, Issue000) {
const uint32_t kWorkgroupSize[3] = {1, 1, 1};
const uint32_t kWorkgroupCount[3] = {0, 0, 0};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
+ DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
}
-TEST_F(TaskDispatchTest, Issue000Sliced) {
- const uint32_t kWorkgroupSize[3] = {1, 1, 1};
- const uint32_t kWorkgroupCount[3] = {0, 0, 0};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
- IREE_TASK_FLAG_DISPATCH_SLICED);
-}
-
-TEST_F(TaskDispatchTest, Issue120Sharded) {
+TEST_F(TaskDispatchTest, Issue120) {
const uint32_t kWorkgroupSize[3] = {1, 1, 1};
const uint32_t kWorkgroupCount[3] = {1, 2, 0};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
+ DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
}
-TEST_F(TaskDispatchTest, Issue120Sliced) {
- const uint32_t kWorkgroupSize[3] = {1, 1, 1};
- const uint32_t kWorkgroupCount[3] = {1, 2, 0};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
- IREE_TASK_FLAG_DISPATCH_SLICED);
-}
-
-TEST_F(TaskDispatchTest, Issue111Sharded) {
+TEST_F(TaskDispatchTest, Issue111) {
const uint32_t kWorkgroupSize[3] = {1, 1, 1};
const uint32_t kWorkgroupCount[3] = {1, 1, 1};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
+ DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
}
-TEST_F(TaskDispatchTest, Issue111Sliced) {
- const uint32_t kWorkgroupSize[3] = {1, 1, 1};
- const uint32_t kWorkgroupCount[3] = {1, 1, 1};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
- IREE_TASK_FLAG_DISPATCH_SLICED);
-}
-
-TEST_F(TaskDispatchTest, Issue345Sharded) {
+TEST_F(TaskDispatchTest, Issue345) {
const uint32_t kWorkgroupSize[3] = {1, 1, 1};
const uint32_t kWorkgroupCount[3] = {3, 4, 5};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
-}
-
-TEST_F(TaskDispatchTest, Issue345Sliced) {
- const uint32_t kWorkgroupSize[3] = {1, 1, 1};
- const uint32_t kWorkgroupCount[3] = {3, 4, 5};
- DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
- IREE_TASK_FLAG_DISPATCH_SLICED);
+ DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
}
TEST_F(TaskDispatchTest, IssueIndirect) {
diff --git a/iree/task/tuning.h b/iree/task/tuning.h
index 0f3a220..015e567 100644
--- a/iree/task/tuning.h
+++ b/iree/task/tuning.h
@@ -17,15 +17,6 @@
// only <64 will ever be used (such as for devices with 2 cores).
#define IREE_TASK_EXECUTOR_MAX_WORKER_COUNT (64)
-// Initial number of slice tasks that are allocated in the executor pool.
-// Increasing this number will decrease initial allocation storms in cases of
-// extremely wide fan-out (many dispatches with many thousands of slices) at the
-// cost of a higher minimum memory consumption.
-//
-// Set to zero if sliced dispatches will not be used or will be allocated by the
-// caller to avoid fixed overhead associated with the internal executor pool.
-#define IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER (0)
-
// Initial number of shard tasks that are allocated in the executor pool.
// Increasing this number will decrease initial allocation storms in cases of
// extremely wide concurrency regions (many dispatches running at the same time)
@@ -81,24 +72,6 @@
#define IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT \
IREE_TASK_EXECUTOR_MAX_WORKER_COUNT
-// Number of tiles that will be batched into a single slice along each XYZ dim.
-//
-// Larger numbers reduce overhead and ensure that more tiles are executed
-// locally on the same worker (== shared caches) while also increasing potential
-// latency as work-stealing (always per-slice) is less effective.
-//
-// Numbers >1 on Y and Z can be used to cluster tiles together within the same
-// slice when spatial coherence outside of the X dimension is useful.
-//
-// The current usage of this is provisional; we may do all of this from the
-// compiler and want this behavior to be relatively fixed so that we can predict
-// it better. The only thing we want to be introducing here is flexibility for
-// when worker topology differs at runtime from what is knowable during offline
-// compilation.
-#define IREE_TASK_DISPATCH_TILES_PER_SLICE_X (8)
-#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Y (1)
-#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Z (1)
-
// Number of tiles that will be batched into a single reservation from the grid.
// This is a maximum; if there are fewer tiles that would otherwise allow for
// maximum parallelism then this may be ignored.
diff --git a/iree/task/worker.c b/iree/task/worker.c
index ca5b2d9..ee64bbe 100644
--- a/iree/task/worker.c
+++ b/iree/task/worker.c
@@ -183,12 +183,6 @@
iree_task_call_execute((iree_task_call_t*)task, pending_submission);
break;
}
- case IREE_TASK_TYPE_DISPATCH_SLICE: {
- iree_task_dispatch_slice_execute((iree_task_dispatch_slice_t*)task,
- worker->local_memory,
- pending_submission);
- break;
- }
case IREE_TASK_TYPE_DISPATCH_SHARD: {
iree_task_dispatch_shard_execute((iree_task_dispatch_shard_t*)task,
worker->local_memory,
diff --git a/iree/task/worker.h b/iree/task/worker.h
index 9342727..1cc2d0f 100644
--- a/iree/task/worker.h
+++ b/iree/task/worker.h
@@ -58,8 +58,8 @@
// not yet correctly) selected; see the 'LAYOUT' comments below.
typedef struct iree_task_worker_t {
// A LIFO mailbox used by coordinators to post tasks to this worker.
- // As workers self-nominate to be coordinators and fan out dispatch slices
- // they can directly emplace those slices into the workers that should execute
+ // As workers self-nominate to be coordinators and fan out dispatch shards
+ // they can directly emplace those shards into the workers that should execute
// them based on the work distribution policy. When workers go to look for
// more work after their local queue empties they will flush this list and
// move all of the tasks into their local queue and restart processing.
@@ -127,7 +127,7 @@
// workers.
iree_byte_span_t local_memory;
- // Worker-local FIFO queue containing the slices that will be processed by the
+ // Worker-local FIFO queue containing the tasks that will be processed by the
// worker. This queue supports work-stealing by other workers if they run out
// of work of their own.
// LAYOUT: must be 64b away from mailbox_slist.