Merge pull request #8219 from google/benvanik-task-errors

Propagating errors through the task system.
This allows errors to properly fail task scopes and ensures subsequent work is discarded correctly; errors now propagate up to their parent scope and can be retrieved via HAL semaphores or directly by users for graceful handling. Miscellaneous cleanup and simplification was done as part of this, and new tests were added to verify the behavior.

Progress on #4026. Waits still need work but will be redesigned in future changes.
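
Below is a minimal usage sketch (not part of this diff) of how a user of the task system might observe a propagated failure after submitting work. It assumes a `scope` that already has work submitted against it and uses only the `iree_task_scope_has_failed` / `iree_task_scope_consume_status` APIs touched in this change; all executor/submission setup is elided.

```c
#include <stdio.h>

#include "iree/base/api.h"
#include "iree/task/scope.h"

// Polls a scope and, if any task within it failed, takes ownership of the
// propagated failure status. Subsequent consume_status calls will only be
// able to report the status code.
static void log_scope_failure(iree_task_scope_t* scope) {
  if (!iree_task_scope_has_failed(scope)) return;
  iree_status_t status = iree_task_scope_consume_status(scope);
  fprintf(stderr, "task scope failed: %s\n",
          iree_status_code_string(iree_status_code(status)));
  iree_status_ignore(status);  // releases any allocated status storage
}
```
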
diff --git a/iree/base/alignment.h b/iree/base/alignment.h
index 5229d97..6f32ffe 100644
--- a/iree/base/alignment.h
+++ b/iree/base/alignment.h
@@ -59,7 +59,7 @@
 
 // Returns the size of a struct padded out to iree_max_align_t.
 // This must be used when performing manual trailing allocation packing to
-// ensure the alignment requirements of the trailing data are satisified.
+// ensure the alignment requirements of the trailing data are satisfied.
 //
 // NOTE: do not use this if using VLAs (`struct { int trailing[]; }`) - those
 // must precisely follow the normal sizeof(t) as the compiler does the padding
diff --git a/iree/base/allocator.h b/iree/base/allocator.h
index ea2acf6..6b71d54 100644
--- a/iree/base/allocator.h
+++ b/iree/base/allocator.h
@@ -213,7 +213,7 @@
 // Function pointer for an iree_allocator_t control function.
 // |command| provides the operation to perform. Optionally some commands may use
 // |params| to pass additional operation-specific parameters. |inout_ptr| usage
-// is defined by each operation but is general a pointer to the pointer to
+// is defined by each operation but is generally a pointer to the pointer to
 // set to the newly allocated memory or a pointer to the pointer to free.
 typedef iree_status_t(IREE_API_PTR* iree_allocator_ctl_fn_t)(
     void* self, iree_allocator_command_t command, const void* params,
@@ -223,7 +223,7 @@
 // IREE will attempt to use this in place of the system malloc and free.
 // Pass the iree_allocator_system() macro to use the system allocator.
 typedef struct iree_allocator_t {
-  // User-defined pointer passed to all functions.
+  // Control function data.
   void* self;
   // ioctl-style control function servicing all allocator-related commands.
   // See iree_allocator_command_t for more information.
diff --git a/iree/base/assert.h b/iree/base/assert.h
index 450bf9d..930baab 100644
--- a/iree/base/assert.h
+++ b/iree/base/assert.h
@@ -61,6 +61,8 @@
 #define IREE_ASSERT_TRUE(expr, ...) IREE_ASSERT(!!(expr), __VA_ARGS__)
 #define IREE_ASSERT_FALSE(expr, ...) IREE_ASSERT(!(expr), __VA_ARGS__)
 
+#define IREE_ASSERT_UNREACHABLE(...) IREE_ASSERT(false, __VA_ARGS__)
+
 #define IREE_ASSERT_EQ(x, y, ...) _IREE_ASSERT_CMP(x, ==, y, __VA_ARGS__)
 #define IREE_ASSERT_NE(x, y, ...) _IREE_ASSERT_CMP(x, !=, y, __VA_ARGS__)
 #define IREE_ASSERT_LE(x, y, ...) _IREE_ASSERT_CMP(x, <=, y, __VA_ARGS__)
diff --git a/iree/base/internal/BUILD b/iree/base/internal/BUILD
index bf0ce3c..1bea880 100644
--- a/iree/base/internal/BUILD
+++ b/iree/base/internal/BUILD
@@ -318,6 +318,20 @@
 )
 
 cc_library(
+    name = "event_pool",
+    srcs = ["event_pool.c"],
+    hdrs = ["event_pool.h"],
+    deps = [
+        ":internal",
+        ":synchronization",
+        ":wait_handle",
+        "//iree/base",
+        "//iree/base:core_headers",
+        "//iree/base:tracing",
+    ],
+)
+
+cc_library(
     name = "threading",
     srcs = [
         "threading.c",
diff --git a/iree/base/internal/CMakeLists.txt b/iree/base/internal/CMakeLists.txt
index bb4a0a1..99a54fa 100644
--- a/iree/base/internal/CMakeLists.txt
+++ b/iree/base/internal/CMakeLists.txt
@@ -326,6 +326,23 @@
 
 iree_cc_library(
   NAME
+    event_pool
+  HDRS
+    "event_pool.h"
+  SRCS
+    "event_pool.c"
+  DEPS
+    ::internal
+    ::synchronization
+    ::wait_handle
+    iree::base
+    iree::base::core_headers
+    iree::base::tracing
+  PUBLIC
+)
+
+iree_cc_library(
+  NAME
     threading
   HDRS
     "threading.h"
diff --git a/iree/base/internal/debugging.h b/iree/base/internal/debugging.h
index d37ebcc..0bf232c 100644
--- a/iree/base/internal/debugging.h
+++ b/iree/base/internal/debugging.h
@@ -63,7 +63,7 @@
 // picked up. In addition, specific uses of memory like arenas can thwart tools
 // like ASAN that try to detect accesses to freed memory because we are never
 // actually malloc()'ing and free()'ing and need to tell ASAN when blocks of
-// memory come into/outof the pool.
+// memory come into/out of the pool.
 //
 // The documentation on these interfaces is pretty sparse but it's possible to
 // find usage examples of the hooks in the compiler-provided hooks themselves.
diff --git a/iree/hal/local/event_pool.c b/iree/base/internal/event_pool.c
similarity index 84%
rename from iree/hal/local/event_pool.c
rename to iree/base/internal/event_pool.c
index 40a2905..2cc93d4 100644
--- a/iree/hal/local/event_pool.c
+++ b/iree/base/internal/event_pool.c
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/hal/local/event_pool.h"
+#include "iree/base/internal/event_pool.h"
 
 #include <stdbool.h>
 #include <stddef.h>
@@ -13,7 +13,7 @@
 #include "iree/base/internal/synchronization.h"
 #include "iree/base/tracing.h"
 
-struct iree_hal_local_event_pool_t {
+struct iree_event_pool_t {
   // Allocator used to create the event pool.
   iree_allocator_t host_allocator;
   // Guards the pool. Since this pool is used to get operating system-level
@@ -31,14 +31,14 @@
   iree_event_t available_list[];
 };
 
-iree_status_t iree_hal_local_event_pool_allocate(
-    iree_host_size_t available_capacity, iree_allocator_t host_allocator,
-    iree_hal_local_event_pool_t** out_event_pool) {
+iree_status_t iree_event_pool_allocate(iree_host_size_t available_capacity,
+                                       iree_allocator_t host_allocator,
+                                       iree_event_pool_t** out_event_pool) {
   IREE_ASSERT_ARGUMENT(out_event_pool);
   *out_event_pool = NULL;
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_hal_local_event_pool_t* event_pool = NULL;
+  iree_event_pool_t* event_pool = NULL;
   iree_host_size_t total_size =
       sizeof(*event_pool) +
       available_capacity * sizeof(event_pool->available_list[0]);
@@ -60,13 +60,13 @@
   if (iree_status_is_ok(status)) {
     *out_event_pool = event_pool;
   } else {
-    iree_hal_local_event_pool_free(event_pool);
+    iree_event_pool_free(event_pool);
   }
   IREE_TRACE_ZONE_END(z0);
   return status;
 }
 
-void iree_hal_local_event_pool_free(iree_hal_local_event_pool_t* event_pool) {
+void iree_event_pool_free(iree_event_pool_t* event_pool) {
   iree_allocator_t host_allocator = event_pool->host_allocator;
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -79,9 +79,9 @@
   IREE_TRACE_ZONE_END(z0);
 }
 
-iree_status_t iree_hal_local_event_pool_acquire(
-    iree_hal_local_event_pool_t* event_pool, iree_host_size_t event_count,
-    iree_event_t* out_events) {
+iree_status_t iree_event_pool_acquire(iree_event_pool_t* event_pool,
+                                      iree_host_size_t event_count,
+                                      iree_event_t* out_events) {
   IREE_ASSERT_ARGUMENT(event_pool);
   if (!event_count) return iree_ok_status();
   IREE_ASSERT_ARGUMENT(out_events);
@@ -113,8 +113,7 @@
                                      &out_events[from_pool_count + i]);
       if (!iree_status_is_ok(status)) {
         // Must release all events we've acquired so far.
-        iree_hal_local_event_pool_release(event_pool, from_pool_count + i,
-                                          out_events);
+        iree_event_pool_release(event_pool, from_pool_count + i, out_events);
         IREE_TRACE_ZONE_END(z0);
         return status;
       }
@@ -125,9 +124,9 @@
   return iree_ok_status();
 }
 
-void iree_hal_local_event_pool_release(iree_hal_local_event_pool_t* event_pool,
-                                       iree_host_size_t event_count,
-                                       iree_event_t* events) {
+void iree_event_pool_release(iree_event_pool_t* event_pool,
+                             iree_host_size_t event_count,
+                             iree_event_t* events) {
   IREE_ASSERT_ARGUMENT(event_pool);
   if (!event_count) return;
   IREE_ASSERT_ARGUMENT(events);
diff --git a/iree/base/internal/event_pool.h b/iree/base/internal/event_pool.h
new file mode 100644
index 0000000..7ac56cb
--- /dev/null
+++ b/iree/base/internal/event_pool.h
@@ -0,0 +1,49 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_BASE_INTERNAL_EVENT_POOL_H_
+#define IREE_BASE_INTERNAL_EVENT_POOL_H_
+
+#include "iree/base/api.h"
+#include "iree/base/internal/wait_handle.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// A simple pool of iree_event_ts to recycle.
+//
+// Thread-safe; multiple threads may acquire and release events from the pool.
+typedef struct iree_event_pool_t iree_event_pool_t;
+
+// Allocates a new event pool with up to |available_capacity| events.
+iree_status_t iree_event_pool_allocate(iree_host_size_t available_capacity,
+                                       iree_allocator_t host_allocator,
+                                       iree_event_pool_t** out_event_pool);
+
+// Deallocates an event pool and destroys all events.
+// All events that were acquired from the pool must have already been released
+// back to it prior to deallocation.
+void iree_event_pool_free(iree_event_pool_t* event_pool);
+
+// Acquires one or more events from the event pool.
+// The returned events will be unsignaled and ready for use. Callers may set and
+// reset the events as much as they want prior to releasing them back to the
+// pool with iree_event_pool_release.
+iree_status_t iree_event_pool_acquire(iree_event_pool_t* event_pool,
+                                      iree_host_size_t event_count,
+                                      iree_event_t* out_events);
+
+// Releases one or more events back to the event pool.
+void iree_event_pool_release(iree_event_pool_t* event_pool,
+                             iree_host_size_t event_count,
+                             iree_event_t* events);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_BASE_INTERNAL_EVENT_POOL_H_
diff --git a/iree/base/internal/synchronization.h b/iree/base/internal/synchronization.h
index df41241..230dca2 100644
--- a/iree/base/internal/synchronization.h
+++ b/iree/base/internal/synchronization.h
@@ -173,7 +173,7 @@
 // Though these locks support spinning they always have a fallback path that
 // ends up calling into the kernel to properly wait the thread. This is critical
 // to avoid pathological cases under contention and allowing for thread priority
-// inheritence when there are multiple threads competing that may otherwise be
+// inheritance when there are multiple threads competing that may otherwise be
 // scheduled in a potentially livelocking order.
 //
 // The "unfair" here comes from the fact that it's possible on certain platforms
diff --git a/iree/base/internal/threading.h b/iree/base/internal/threading.h
index 8b97793..1518fd0 100644
--- a/iree/base/internal/threading.h
+++ b/iree/base/internal/threading.h
@@ -59,7 +59,7 @@
 //
 // Linux/Android:
 //   sched_setaffinity is used to pin the thread to the core with the given ID.
-//   There are, naturally, issues on Android where if the governer has turned
+//   There are, naturally, issues on Android where if the governor has turned
 //   off some cores (such as powering down big cores in an ARM big.LITTLE
 //   configuration) the affinity request will be dropped on the floor even if
 //   the cores are later enabled. This is one of the reasons why we note in
@@ -150,7 +150,7 @@
 
 // Updates the thread affinity of the given |thread|.
 // Affinities are not sticky and may need to be refreshed over time as CPUs are
-// enabled/disabled by the OS (such as power mode changes, governer adjustments,
+// enabled/disabled by the OS (such as power mode changes, governor adjustments,
 // etc). Users wanting to ensure threads have specific affinities may want to
 // request updates whenever new large amounts of work are about to be performed.
 //
diff --git a/iree/base/time.h b/iree/base/time.h
index 3df4393..63eefce 100644
--- a/iree/base/time.h
+++ b/iree/base/time.h
@@ -18,7 +18,7 @@
 #endif  // __cplusplus
 
 // A point in time represented as nanoseconds since unix epoch.
-// TODO(benvanik): pick something easy to get into/outof time_t/etc.
+// TODO(benvanik): pick something easy to get into/out of time_t/etc.
 typedef int64_t iree_time_t;
 
 // A time in the infinite past used to indicate "already happened".
@@ -157,6 +157,14 @@
              : iree_relative_timeout_to_deadline_ns(timeout.nanos);
 }
 
+// Returns the earliest timeout between |lhs| and |rhs|.
+static inline iree_timeout_t iree_timeout_min(iree_timeout_t lhs,
+                                              iree_timeout_t rhs) {
+  iree_convert_timeout_to_absolute(&lhs);
+  iree_convert_timeout_to_absolute(&rhs);
+  return iree_make_deadline(lhs.nanos < rhs.nanos ? lhs.nanos : rhs.nanos);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/iree/hal/local/BUILD b/iree/hal/local/BUILD
index 0dd87f4..26bcff7 100644
--- a/iree/hal/local/BUILD
+++ b/iree/hal/local/BUILD
@@ -122,22 +122,6 @@
     inline = True,
 )
 
-# TODO(benvanik): move into base/? may be useful for other backends or for other
-# parts of the system (like modules handling IO/RPC).
-cc_library(
-    name = "event_pool",
-    srcs = ["event_pool.c"],
-    hdrs = ["event_pool.h"],
-    deps = [
-        "//iree/base",
-        "//iree/base:core_headers",
-        "//iree/base:tracing",
-        "//iree/base/internal",
-        "//iree/base/internal:synchronization",
-        "//iree/base/internal:wait_handle",
-    ],
-)
-
 cc_library(
     name = "task_driver",
     srcs = [
@@ -159,7 +143,6 @@
         "task_semaphore.h",
     ],
     deps = [
-        ":event_pool",
         ":executable_library",
         ":local",
         "//iree/base",
@@ -167,6 +150,7 @@
         "//iree/base:tracing",
         "//iree/base/internal",
         "//iree/base/internal:arena",
+        "//iree/base/internal:event_pool",
         "//iree/base/internal:synchronization",
         "//iree/base/internal:wait_handle",
         "//iree/hal",
diff --git a/iree/hal/local/CMakeLists.txt b/iree/hal/local/CMakeLists.txt
index 3756a96..b28fcd8 100644
--- a/iree/hal/local/CMakeLists.txt
+++ b/iree/hal/local/CMakeLists.txt
@@ -111,23 +111,6 @@
 
 iree_cc_library(
   NAME
-    event_pool
-  HDRS
-    "event_pool.h"
-  SRCS
-    "event_pool.c"
-  DEPS
-    iree::base
-    iree::base::core_headers
-    iree::base::internal
-    iree::base::internal::synchronization
-    iree::base::internal::wait_handle
-    iree::base::tracing
-  PUBLIC
-)
-
-iree_cc_library(
-  NAME
     task_driver
   HDRS
     "task_command_buffer.h"
@@ -146,13 +129,13 @@
     "task_queue_state.c"
     "task_semaphore.c"
   DEPS
-    ::event_pool
     ::executable_library
     ::local
     iree::base
     iree::base::core_headers
     iree::base::internal
     iree::base::internal::arena
+    iree::base::internal::event_pool
     iree::base::internal::synchronization
     iree::base::internal::wait_handle
     iree::base::tracing
diff --git a/iree/hal/local/event_pool.h b/iree/hal/local/event_pool.h
deleted file mode 100644
index 596ae14..0000000
--- a/iree/hal/local/event_pool.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2021 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_HAL_LOCAL_EVENT_POOL_H_
-#define IREE_HAL_LOCAL_EVENT_POOL_H_
-
-#include "iree/base/api.h"
-#include "iree/base/internal/wait_handle.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-// A simple pool of iree_event_ts to recycle.
-//
-// Thread-safe; multiple threads may acquire and release events from the pool.
-typedef struct iree_hal_local_event_pool_t iree_hal_local_event_pool_t;
-
-// Allocates a new event pool with up to |available_capacity| events.
-iree_status_t iree_hal_local_event_pool_allocate(
-    iree_host_size_t available_capacity, iree_allocator_t host_allocator,
-    iree_hal_local_event_pool_t** out_event_pool);
-
-// Deallocates an event pool and destroys all events.
-// All events that were acquired from the pool must have already been released
-// back to it prior to deallocation.
-void iree_hal_local_event_pool_free(iree_hal_local_event_pool_t* event_pool);
-
-// Acquires one or more events from the event pool.
-// The returned events will be unsignaled and ready for use. Callers may set and
-// reset the events as much as they want prior to releasing them back to the
-// pool with iree_hal_local_event_pool_release.
-iree_status_t iree_hal_local_event_pool_acquire(
-    iree_hal_local_event_pool_t* event_pool, iree_host_size_t event_count,
-    iree_event_t* out_events);
-
-// Releases one or more events back to the block pool.
-void iree_hal_local_event_pool_release(iree_hal_local_event_pool_t* event_pool,
-                                       iree_host_size_t event_count,
-                                       iree_event_t* events);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-
-#endif  // IREE_HAL_LOCAL_EVENT_POOL_H_
diff --git a/iree/hal/local/task_command_buffer.c b/iree/hal/local/task_command_buffer.c
index ed5cb96..010a480 100644
--- a/iree/hal/local/task_command_buffer.c
+++ b/iree/hal/local/task_command_buffer.c
@@ -501,7 +501,7 @@
 } iree_hal_cmd_fill_buffer_t;
 
 static iree_status_t iree_hal_cmd_fill_tile(
-    uintptr_t user_context, const iree_task_tile_context_t* tile_context,
+    void* user_context, const iree_task_tile_context_t* tile_context,
     iree_task_submission_t* pending_submission) {
   const iree_hal_cmd_fill_buffer_t* cmd =
       (const iree_hal_cmd_fill_buffer_t*)user_context;
@@ -550,7 +550,7 @@
   };
   iree_task_dispatch_initialize(
       command_buffer->scope,
-      iree_task_make_dispatch_closure(iree_hal_cmd_fill_tile, (uintptr_t)cmd),
+      iree_task_make_dispatch_closure(iree_hal_cmd_fill_tile, (void*)cmd),
       workgroup_size, workgroup_count, &cmd->task);
   cmd->target_buffer = target_buffer;
   cmd->target_offset = target_offset;
@@ -575,7 +575,7 @@
 } iree_hal_cmd_update_buffer_t;
 
 static iree_status_t iree_hal_cmd_update_buffer(
-    uintptr_t user_context, iree_task_t* task,
+    void* user_context, iree_task_t* task,
     iree_task_submission_t* pending_submission) {
   const iree_hal_cmd_update_buffer_t* cmd =
       (const iree_hal_cmd_update_buffer_t*)user_context;
@@ -605,7 +605,7 @@
 
   iree_task_call_initialize(
       command_buffer->scope,
-      iree_task_make_call_closure(iree_hal_cmd_update_buffer, (uintptr_t)cmd),
+      iree_task_make_call_closure(iree_hal_cmd_update_buffer, (void*)cmd),
       &cmd->task);
   cmd->target_buffer = target_buffer;
   cmd->target_offset = target_offset;
@@ -640,7 +640,7 @@
 } iree_hal_cmd_copy_buffer_t;
 
 static iree_status_t iree_hal_cmd_copy_tile(
-    uintptr_t user_context, const iree_task_tile_context_t* tile_context,
+    void* user_context, const iree_task_tile_context_t* tile_context,
     iree_task_submission_t* pending_submission) {
   const iree_hal_cmd_copy_buffer_t* cmd =
       (const iree_hal_cmd_copy_buffer_t*)user_context;
@@ -690,7 +690,7 @@
   };
   iree_task_dispatch_initialize(
       command_buffer->scope,
-      iree_task_make_dispatch_closure(iree_hal_cmd_copy_tile, (uintptr_t)cmd),
+      iree_task_make_dispatch_closure(iree_hal_cmd_copy_tile, (void*)cmd),
       workgroup_size, workgroup_count, &cmd->task);
   cmd->source_buffer = source_buffer;
   cmd->source_offset = source_offset;
@@ -813,7 +813,7 @@
 } iree_hal_cmd_dispatch_t;
 
 static iree_status_t iree_hal_cmd_dispatch_tile(
-    uintptr_t user_context, const iree_task_tile_context_t* tile_context,
+    void* user_context, const iree_task_tile_context_t* tile_context,
     iree_task_submission_t* pending_submission) {
   const iree_hal_cmd_dispatch_t* cmd =
       (const iree_hal_cmd_dispatch_t*)user_context;
@@ -893,10 +893,10 @@
   const uint32_t workgroup_count[3] = {workgroup_x, workgroup_y, workgroup_z};
   // TODO(benvanik): expose on API or keep fixed on executable.
   const uint32_t workgroup_size[3] = {1, 1, 1};
-  iree_task_dispatch_initialize(command_buffer->scope,
-                                iree_task_make_dispatch_closure(
-                                    iree_hal_cmd_dispatch_tile, (uintptr_t)cmd),
-                                workgroup_size, workgroup_count, &cmd->task);
+  iree_task_dispatch_initialize(
+      command_buffer->scope,
+      iree_task_make_dispatch_closure(iree_hal_cmd_dispatch_tile, (void*)cmd),
+      workgroup_size, workgroup_count, &cmd->task);
 
   // Tell the task system how much workgroup local memory is required for the
   // dispatch; each invocation of the entry point will have at least as much
diff --git a/iree/hal/local/task_device.c b/iree/hal/local/task_device.c
index 4b0e1ed..25ef6a9 100644
--- a/iree/hal/local/task_device.c
+++ b/iree/hal/local/task_device.c
@@ -12,7 +12,6 @@
 
 #include "iree/base/internal/arena.h"
 #include "iree/base/tracing.h"
-#include "iree/hal/local/event_pool.h"
 #include "iree/hal/local/local_descriptor_set.h"
 #include "iree/hal/local/local_descriptor_set_layout.h"
 #include "iree/hal/local/local_executable_cache.h"
@@ -23,8 +22,6 @@
 #include "iree/hal/local/task_semaphore.h"
 #include "iree/hal/utils/buffer_transfer.h"
 
-#define IREE_HAL_LOCAL_TASK_EVENT_POOL_CAPACITY 32
-
 typedef struct iree_hal_task_device_t {
   iree_hal_resource_t resource;
   iree_string_view_t identifier;
@@ -36,9 +33,6 @@
   // buffers can contain inlined data uploads).
   iree_arena_block_pool_t large_block_pool;
 
-  // iree_event_t pool for semaphore wait operations.
-  iree_hal_local_event_pool_t* event_pool;
-
   iree_task_executor_t* executor;
 
   iree_host_size_t loader_count;
@@ -115,7 +109,6 @@
                                      &device->small_block_pool);
     iree_arena_block_pool_initialize(params->arena_block_size, host_allocator,
                                      &device->large_block_pool);
-    device->event_pool = NULL;
 
     device->executor = executor;
     iree_task_executor_retain(device->executor);
@@ -140,12 +133,6 @@
   }
 
   if (iree_status_is_ok(status)) {
-    status = iree_hal_local_event_pool_allocate(
-        IREE_HAL_LOCAL_TASK_EVENT_POOL_CAPACITY, host_allocator,
-        &device->event_pool);
-  }
-
-  if (iree_status_is_ok(status)) {
     *out_device = (iree_hal_device_t*)device;
   } else {
     iree_hal_device_release((iree_hal_device_t*)device);
@@ -166,7 +153,6 @@
     iree_hal_executable_loader_release(device->loaders[i]);
   }
   iree_task_executor_release(device->executor);
-  iree_hal_local_event_pool_free(device->event_pool);
   iree_arena_block_pool_deinitialize(&device->large_block_pool);
   iree_arena_block_pool_deinitialize(&device->small_block_pool);
   iree_hal_allocator_release(device->device_allocator);
@@ -301,8 +287,9 @@
     iree_hal_device_t* base_device, uint64_t initial_value,
     iree_hal_semaphore_t** out_semaphore) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
-  return iree_hal_task_semaphore_create(device->event_pool, initial_value,
-                                        device->host_allocator, out_semaphore);
+  return iree_hal_task_semaphore_create(
+      iree_task_executor_event_pool(device->executor), initial_value,
+      device->host_allocator, out_semaphore);
 }
 
 static iree_status_t iree_hal_task_device_queue_submit(
@@ -336,9 +323,10 @@
     iree_hal_device_t* base_device, iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout) {
   iree_hal_task_device_t* device = iree_hal_task_device_cast(base_device);
-  return iree_hal_task_semaphore_multi_wait(wait_mode, semaphore_list, timeout,
-                                            device->event_pool,
-                                            &device->large_block_pool);
+  return iree_hal_task_semaphore_multi_wait(
+      wait_mode, semaphore_list, timeout,
+      iree_task_executor_event_pool(device->executor),
+      &device->large_block_pool);
 }
 
 static iree_status_t iree_hal_task_device_wait_idle(
diff --git a/iree/hal/local/task_queue.c b/iree/hal/local/task_queue.c
index 5090e56..23fcb43 100644
--- a/iree/hal/local/task_queue.c
+++ b/iree/hal/local/task_queue.c
@@ -115,7 +115,7 @@
 
 // Forks out multiple wait tasks prior to issuing the commands.
 static iree_status_t iree_hal_task_queue_wait_cmd(
-    uintptr_t user_context, iree_task_t* task,
+    void* user_context, iree_task_t* task,
     iree_task_submission_t* pending_submission) {
   iree_hal_task_queue_wait_cmd_t* cmd = (iree_hal_task_queue_wait_cmd_t*)task;
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -135,8 +135,8 @@
 
 // Cleanup for iree_hal_task_queue_wait_cmd_t that releases the retained
 // semaphores.
-static void iree_hal_task_queue_wait_cmd_cleanup(iree_task_t* task,
-                                                 iree_status_t status) {
+static void iree_hal_task_queue_wait_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
   iree_hal_task_queue_wait_cmd_t* cmd = (iree_hal_task_queue_wait_cmd_t*)task;
   iree_hal_semaphore_list_release(&cmd->wait_semaphores);
 }
@@ -189,7 +189,7 @@
 
 // Issues a set of command buffers without waiting for them to complete.
 static iree_status_t iree_hal_task_queue_issue_cmd(
-    uintptr_t user_context, iree_task_t* task,
+    void* user_context, iree_task_t* task,
     iree_task_submission_t* pending_submission) {
   iree_hal_task_queue_issue_cmd_t* cmd = (iree_hal_task_queue_issue_cmd_t*)task;
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -219,8 +219,8 @@
 
 // Cleanup for iree_hal_task_queue_issue_cmd_t that resets the queue state
 // tracking the last in-flight issue.
-static void iree_hal_task_queue_issue_cmd_cleanup(iree_task_t* task,
-                                                  iree_status_t status) {
+static void iree_hal_task_queue_issue_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
   iree_hal_task_queue_issue_cmd_t* cmd = (iree_hal_task_queue_issue_cmd_t*)task;
 
   // Reset queue tail issue task if it was us.
@@ -283,7 +283,7 @@
 // Retires a submission by signaling semaphores to their desired value and
 // disposing of the temporary arena memory used for the submission.
 static iree_status_t iree_hal_task_queue_retire_cmd(
-    uintptr_t user_context, iree_task_t* task,
+    void* user_context, iree_task_t* task,
     iree_task_submission_t* pending_submission) {
   iree_hal_task_queue_retire_cmd_t* cmd =
       (iree_hal_task_queue_retire_cmd_t*)task;
@@ -307,17 +307,17 @@
 // Cleanup for iree_hal_task_queue_retire_cmd_t that ensures that the arena
 // holding the submission is properly disposed and that semaphores are signaled
 // (or signaled to failure if the command failed).
-static void iree_hal_task_queue_retire_cmd_cleanup(iree_task_t* task,
-                                                   iree_status_t status) {
+static void iree_hal_task_queue_retire_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
   iree_hal_task_queue_retire_cmd_t* cmd =
       (iree_hal_task_queue_retire_cmd_t*)task;
 
   // If the command failed then fail all semaphores to ensure future
   // submissions fail as well (including those on other queues).
-  if (!iree_status_is_ok(status)) {
+  if (IREE_UNLIKELY(status_code != IREE_STATUS_OK)) {
     for (iree_host_size_t i = 0; i < cmd->signal_semaphores.count; ++i) {
       iree_hal_semaphore_fail(cmd->signal_semaphores.semaphores[i],
-                              iree_status_clone(status));
+                              iree_status_from_code(status_code));
     }
   }
 
diff --git a/iree/hal/local/task_semaphore.c b/iree/hal/local/task_semaphore.c
index 6aad400..59c0327 100644
--- a/iree/hal/local/task_semaphore.c
+++ b/iree/hal/local/task_semaphore.c
@@ -133,7 +133,7 @@
 typedef struct iree_hal_task_semaphore_t {
   iree_hal_resource_t resource;
   iree_allocator_t host_allocator;
-  iree_hal_local_event_pool_t* event_pool;
+  iree_event_pool_t* event_pool;
 
   // Guards all mutable fields. We expect low contention on semaphores and since
   // iree_slim_mutex_t is (effectively) just a CAS this keeps things simpler
@@ -167,7 +167,7 @@
 }
 
 iree_status_t iree_hal_task_semaphore_create(
-    iree_hal_local_event_pool_t* event_pool, uint64_t initial_value,
+    iree_event_pool_t* event_pool, uint64_t initial_value,
     iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore) {
   IREE_ASSERT_ARGUMENT(event_pool);
   IREE_ASSERT_ARGUMENT(out_semaphore);
@@ -306,8 +306,8 @@
     iree_hal_task_timepoint_t* out_timepoint) {
   memset(out_timepoint, 0, sizeof(*out_timepoint));
   out_timepoint->payload_value = minimum_value;
-  IREE_RETURN_IF_ERROR(iree_hal_local_event_pool_acquire(
-      semaphore->event_pool, 1, &out_timepoint->event));
+  IREE_RETURN_IF_ERROR(
+      iree_event_pool_acquire(semaphore->event_pool, 1, &out_timepoint->event));
   iree_hal_task_timepoint_list_append(&semaphore->timepoint_list,
                                       out_timepoint);
   return iree_ok_status();
@@ -321,13 +321,12 @@
 
 // Cleans up a wait task by returning the event used to the pool and - if the
 // task failed - ensuring we scrub it from the timepoint list.
-static void iree_hal_task_semaphore_wait_cmd_cleanup(iree_task_t* task,
-                                                     iree_status_t status) {
+static void iree_hal_task_semaphore_wait_cmd_cleanup(
+    iree_task_t* task, iree_status_code_t status_code) {
   iree_hal_task_semaphore_wait_cmd_t* cmd =
       (iree_hal_task_semaphore_wait_cmd_t*)task;
-  iree_hal_local_event_pool_release(cmd->semaphore->event_pool, 1,
-                                    &cmd->timepoint.event);
-  if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
+  iree_event_pool_release(cmd->semaphore->event_pool, 1, &cmd->timepoint.event);
+  if (IREE_UNLIKELY(status_code != IREE_STATUS_OK)) {
     // Abort the timepoint. Note that this is not designed to be fast as
     // semaphore failure is an exceptional case.
     iree_slim_mutex_lock(&cmd->semaphore->mutex);
@@ -413,15 +412,14 @@
     iree_hal_task_timepoint_list_erase(&semaphore->timepoint_list, &timepoint);
     iree_slim_mutex_unlock(&semaphore->mutex);
   }
-  iree_hal_local_event_pool_release(semaphore->event_pool, 1, &timepoint.event);
+  iree_event_pool_release(semaphore->event_pool, 1, &timepoint.event);
   return status;
 }
 
 iree_status_t iree_hal_task_semaphore_multi_wait(
     iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
-    iree_hal_local_event_pool_t* event_pool,
-    iree_arena_block_pool_t* block_pool) {
+    iree_event_pool_t* event_pool, iree_arena_block_pool_t* block_pool) {
   IREE_ASSERT_ARGUMENT(semaphore_list);
   if (semaphore_list->count == 0) {
     return iree_ok_status();
@@ -487,7 +485,7 @@
     // TODO(benvanik): if we flip the API to multi-acquire events from the pool
     // above then we can multi-release here too.
     for (iree_host_size_t i = 0; i < timepoint_count; ++i) {
-      iree_hal_local_event_pool_release(event_pool, 1, &timepoints[i].event);
+      iree_event_pool_release(event_pool, 1, &timepoints[i].event);
     }
   }
   iree_wait_set_free(wait_set);
diff --git a/iree/hal/local/task_semaphore.h b/iree/hal/local/task_semaphore.h
index 452e5f0..f3a1060 100644
--- a/iree/hal/local/task_semaphore.h
+++ b/iree/hal/local/task_semaphore.h
@@ -11,8 +11,8 @@
 
 #include "iree/base/api.h"
 #include "iree/base/internal/arena.h"
+#include "iree/base/internal/event_pool.h"
 #include "iree/hal/api.h"
-#include "iree/hal/local/event_pool.h"
 #include "iree/task/submission.h"
 #include "iree/task/task.h"
 
@@ -23,7 +23,7 @@
 // Creates a semaphore that integrates with the task system to allow for
 // pipelined wait and signal operations.
 iree_status_t iree_hal_task_semaphore_create(
-    iree_hal_local_event_pool_t* event_pool, uint64_t initial_value,
+    iree_event_pool_t* event_pool, uint64_t initial_value,
     iree_allocator_t host_allocator, iree_hal_semaphore_t** out_semaphore);
 
 // Reserves a new timepoint in the timeline for the given minimum payload value.
@@ -42,8 +42,7 @@
 iree_status_t iree_hal_task_semaphore_multi_wait(
     iree_hal_wait_mode_t wait_mode,
     const iree_hal_semaphore_list_t* semaphore_list, iree_timeout_t timeout,
-    iree_hal_local_event_pool_t* event_pool,
-    iree_arena_block_pool_t* block_pool);
+    iree_event_pool_t* event_pool, iree_arena_block_pool_t* block_pool);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/iree/task/BUILD b/iree/task/BUILD
index e66dc30..455db67 100644
--- a/iree/task/BUILD
+++ b/iree/task/BUILD
@@ -71,6 +71,7 @@
         "//iree/base:tracing",
         "//iree/base/internal",
         "//iree/base/internal:atomic_slist",
+        "//iree/base/internal:event_pool",
         "//iree/base/internal:fpu_state",
         "//iree/base/internal:prng",
         "//iree/base/internal:synchronization",
diff --git a/iree/task/CMakeLists.txt b/iree/task/CMakeLists.txt
index 6863140..1b4fe06 100644
--- a/iree/task/CMakeLists.txt
+++ b/iree/task/CMakeLists.txt
@@ -66,6 +66,7 @@
     iree::base::core_headers
     iree::base::internal
     iree::base::internal::atomic_slist
+    iree::base::internal::event_pool
     iree::base::internal::fpu_state
     iree::base::internal::prng
     iree::base::internal::synchronization
diff --git a/iree/task/executor.c b/iree/task/executor.c
index 70c494f..267b681 100644
--- a/iree/task/executor.c
+++ b/iree/task/executor.c
@@ -89,6 +89,14 @@
 
   iree_status_t status = iree_ok_status();
 
+  // Pool used for system events; exposed to users of the task system to ensure
+  // we minimize the number of live events and reduce overheads in
+  // high-frequency transient parking operations.
+  if (iree_status_is_ok(status)) {
+    status = iree_event_pool_allocate(IREE_TASK_EXECUTOR_EVENT_POOL_CAPACITY,
+                                      allocator, &executor->event_pool);
+  }
+
   // Wait set used to batch syscalls for polling/waiting on wait handles.
   // This is currently limited to a relatively small max to make bad behavior
   // clearer with nice RESOURCE_EXHAUSTED errors.
@@ -97,22 +105,15 @@
                                     allocator, &executor->wait_set);
   }
 
-  // Pool used for all dispatch->slice fanout tasks. These only live within the
-  // executor and since we know the precise lifetime of them we can keep them
-  // entirely within the system here.
-  if (iree_status_is_ok(status)) {
-    status = iree_task_pool_initialize(allocator, sizeof(iree_task_fence_t), 8,
-                                       &executor->fence_task_pool);
-  }
+  // Pool used for all fanout tasks. These only live within the executor and
+  // since we know the precise lifetime of them we can keep them entirely within
+  // the system here.
   if (iree_status_is_ok(status)) {
     status = iree_task_pool_initialize(
         allocator,
-        iree_max(sizeof(iree_task_dispatch_shard_t),
-                 sizeof(iree_task_dispatch_slice_t)),
-        worker_count *
-            iree_max(IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,
-                     IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER),
-        &executor->dispatch_task_pool);
+        iree_max(sizeof(iree_task_fence_t), sizeof(iree_task_dispatch_shard_t)),
+        worker_count * IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,
+        &executor->transient_task_pool);
   }
 
   // Bring up the workers; the threads will be created here but be suspended
@@ -188,12 +189,12 @@
   }
 
   iree_wait_set_free(executor->wait_set);
+  iree_event_pool_free(executor->event_pool);
   iree_slim_mutex_deinitialize(&executor->wait_mutex);
   iree_slim_mutex_deinitialize(&executor->coordinator_mutex);
   iree_atomic_task_slist_deinitialize(&executor->incoming_ready_slist);
   iree_atomic_task_slist_deinitialize(&executor->incoming_waiting_slist);
-  iree_task_pool_deinitialize(&executor->fence_task_pool);
-  iree_task_pool_deinitialize(&executor->dispatch_task_pool);
+  iree_task_pool_deinitialize(&executor->transient_task_pool);
   iree_allocator_free(executor->allocator, executor);
 
   IREE_TRACE_ZONE_END(z0);
@@ -217,7 +218,12 @@
   // guarantee. We'd need some global executor lock that we did here and
   // on submit - or rework pools to not have this limitation.
   // iree_task_pool_trim(&executor->fence_task_pool);
-  // iree_task_pool_trim(&executor->dispatch_task_pool);
+  // iree_task_pool_trim(&executor->transient_task_pool);
+}
+
+iree_event_pool_t* iree_task_executor_event_pool(
+    iree_task_executor_t* executor) {
+  return executor->event_pool;
 }
 
 iree_status_t iree_task_executor_acquire_fence(iree_task_executor_t* executor,
@@ -225,10 +231,10 @@
                                                iree_task_fence_t** out_fence) {
   *out_fence = NULL;
   iree_task_fence_t* fence = NULL;
-  IREE_RETURN_IF_ERROR(iree_task_pool_acquire(&executor->fence_task_pool,
+  IREE_RETURN_IF_ERROR(iree_task_pool_acquire(&executor->transient_task_pool,
                                               (iree_task_t**)&fence));
   iree_task_fence_initialize(scope, fence);
-  fence->header.pool = &executor->fence_task_pool;
+  fence->header.pool = &executor->transient_task_pool;
   *out_fence = fence;
   return iree_ok_status();
 }
@@ -264,13 +270,23 @@
   IREE_TRACE_ZONE_BEGIN(z0);
   iree_task_t* task = NULL;
   while ((task = iree_task_list_pop_front(&pending_submission->ready_list))) {
+    // If the scope has been marked as failing then we abort the task.
+    // This needs to happen as a poll here because one or more of the tasks we
+    // are joining may have failed.
+    if (IREE_UNLIKELY(iree_task_scope_has_failed(task->scope))) {
+      iree_task_list_t discard_worklist;
+      iree_task_list_initialize(&discard_worklist);
+      iree_task_discard(task, &discard_worklist);
+      iree_task_list_discard(&discard_worklist);
+      continue;
+    }
+
     switch (task->type) {
       case IREE_TASK_TYPE_NOP:
         // Doesn't do anything; just retire and continue on to any dependents.
         iree_task_nop_retire((iree_task_nop_t*)task, pending_submission);
         break;
-      case IREE_TASK_TYPE_CALL:
-      case IREE_TASK_TYPE_DISPATCH_SLICE: {
+      case IREE_TASK_TYPE_CALL: {
         // Generic routing to workers for tasks that should always run there.
         iree_task_executor_relay_to_worker(executor, post_batch, task);
         break;
@@ -305,15 +321,9 @@
           iree_task_dispatch_retire((iree_task_dispatch_t*)task,
                                     pending_submission);
         } else {
-          if (task->flags & IREE_TASK_FLAG_DISPATCH_SLICED) {
-            iree_task_dispatch_issue_sliced((iree_task_dispatch_t*)task,
-                                            &executor->dispatch_task_pool,
-                                            pending_submission, post_batch);
-          } else {
-            iree_task_dispatch_issue_sharded((iree_task_dispatch_t*)task,
-                                             &executor->dispatch_task_pool,
-                                             pending_submission, post_batch);
-          }
+          iree_task_dispatch_issue((iree_task_dispatch_t*)task,
+                                   &executor->transient_task_pool,
+                                   pending_submission, post_batch);
         }
         break;
       }
@@ -584,7 +594,7 @@
     // any changes then.
     //
     // As we schedule tasks we may spawn new ones (like a dispatch -> many
-    // dispatch slices) and we keep track of those here. By doing a pass through
+    // dispatch shards) and we keep track of those here. By doing a pass through
     // all ready tasks and only then merging in the new submission we get
     // breadth-first traversal of task graphs even if they originate from
     // various places and have no relation - hopefully leading to better average
diff --git a/iree/task/executor.h b/iree/task/executor.h
index 1c06d9a..4a8cdf9 100644
--- a/iree/task/executor.h
+++ b/iree/task/executor.h
@@ -11,6 +11,7 @@
 
 #include "iree/base/api.h"
 #include "iree/base/internal/atomics.h"
+#include "iree/base/internal/event_pool.h"
 #include "iree/base/internal/wait_handle.h"
 #include "iree/task/scope.h"
 #include "iree/task/submission.h"
@@ -66,7 +67,7 @@
 //   - latency prioritization by partitioning workloads by priority
 // - scheduling overhead tradeoffs by varying:
 //   - coordination/flush frequency to reduce cross-thread communication
-//   - by statically inserting dispatch slices to avoid dynamic fan-out
+//   - by statically inserting dispatch shards to avoid dynamic fan-out
 //   - thread donation to avoid likely context switches upon submit+wait
 //   - multi-wait across all users by sharing a wait set
 //   - per-worker work-stealing specification of victim workers in the topology
@@ -324,6 +325,13 @@
 // Trims pools and caches used by the executor and its workers.
 void iree_task_executor_trim(iree_task_executor_t* executor);
 
+// Returns an iree_event_t pool managed by the executor.
+// Users of the task system should acquire their transient events from this.
+// Long-lived events should be allocated on their own in order to avoid
+// exhausting the pool and harming high-frequency event acquisition.
+iree_event_pool_t* iree_task_executor_event_pool(
+    iree_task_executor_t* executor);
+
 // Acquires a fence for the given |scope| from the executor fence pool.
 iree_status_t iree_task_executor_acquire_fence(iree_task_executor_t* executor,
                                                iree_task_scope_t* scope,
diff --git a/iree/task/executor_impl.h b/iree/task/executor_impl.h
index f15ed56..13f66ae 100644
--- a/iree/task/executor_impl.h
+++ b/iree/task/executor_impl.h
@@ -43,8 +43,12 @@
   // Pools of transient dispatch tasks shared across all workers.
   // Depending on configuration the task pool may allocate after creation using
   // the allocator provided upon executor creation.
-  iree_task_pool_t fence_task_pool;
-  iree_task_pool_t dispatch_task_pool;
+  //
+  // Sized to be able to fit at least:
+  //   iree_task_fence_t
+  //   iree_task_dispatch_shard_t
+  // Sizing the pool larger than these will waste memory.
+  iree_task_pool_t transient_task_pool;
 
   // A list of incoming tasks that are ready to execute immediately.
   // The list is LIFO and we require that task lists are reversed by the
@@ -65,6 +69,13 @@
   // really matter here as all tasks will be waited on simultaneously.
   iree_atomic_task_slist_t incoming_waiting_slist;
 
+  // iree_event_t pool used to acquire system wait handles.
+  // Many subsystems interacting with the executor will need events to park
+  // their work in the wait set and sharing the pool across all of them ensures
+  // we limit the number we have outstanding and avoid syscalls to allocate
+  // them.
+  iree_event_pool_t* event_pool;
+
   // Guards coordination logic; only one thread at a time may be acting as the
   // coordinator.
   iree_slim_mutex_t coordinator_mutex;
diff --git a/iree/task/executor_test.cc b/iree/task/executor_test.cc
index 796d221..4c2a0a1 100644
--- a/iree/task/executor_test.cc
+++ b/iree/task/executor_test.cc
@@ -57,9 +57,10 @@
   iree_task_executor_t* executor = NULL;
   iree_task_scheduling_mode_t scheduling_mode =
       IREE_TASK_SCHEDULING_MODE_RESERVED;
-  IREE_CHECK_OK(iree_task_executor_create(
-      scheduling_mode, &topology,
-      /*worker_local_memory_size=*/(64 * 1024), allocator, &executor));
+  iree_host_size_t worker_local_memory_size = 0;  // 64 * 1024;
+  IREE_CHECK_OK(iree_task_executor_create(scheduling_mode, &topology,
+                                          worker_local_memory_size, allocator,
+                                          &executor));
   iree_task_topology_deinitialize(&topology);
 
   //
@@ -70,7 +71,7 @@
   iree_task_call_t call0;
   iree_task_call_initialize(&scope_a,
                             iree_task_make_call_closure(
-                                [](uintptr_t user_context, iree_task_t* task,
+                                [](void* user_context, iree_task_t* task,
                                    iree_task_submission_t* pending_submission) {
                                   IREE_TRACE_SCOPE0("call0");
                                   EXPECT_EQ(0, user_context);
@@ -85,8 +86,7 @@
   iree_task_dispatch_initialize(
       &scope_a,
       iree_task_make_dispatch_closure(
-          [](uintptr_t user_context,
-             const iree_task_tile_context_t* tile_context,
+          [](void* user_context, const iree_task_tile_context_t* tile_context,
              iree_task_submission_t* pending_submission) {
             IREE_TRACE_SCOPE0("tile0");
             EXPECT_EQ(0, user_context);
@@ -97,7 +97,6 @@
           },
           0),
       workgroup_size_0, workgroup_count_0, &dispatch0);
-  // dispatch0.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
 
   const uint32_t workgroup_size_1[3] = {128, 1, 1};
   const uint32_t workgroup_count_1[3] = {16, 2, 1};
@@ -105,8 +104,7 @@
   iree_task_dispatch_initialize(
       &scope_a,
       iree_task_make_dispatch_closure(
-          [](uintptr_t user_context,
-             const iree_task_tile_context_t* tile_context,
+          [](void* user_context, const iree_task_tile_context_t* tile_context,
              iree_task_submission_t* pending_submission) {
             IREE_TRACE_SCOPE0("tile1");
             EXPECT_EQ(0, user_context);
@@ -117,19 +115,18 @@
           },
           0),
       workgroup_size_1, workgroup_count_1, &dispatch1);
-  dispatch1.header.flags |= IREE_TASK_FLAG_DISPATCH_SLICED;
 
   //
   iree_task_call_t call1;
   iree_task_call_initialize(&scope_a,
                             iree_task_make_call_closure(
-                                [](uintptr_t user_context, iree_task_t* task,
+                                [](void* user_context, iree_task_t* task,
                                    iree_task_submission_t* pending_submission) {
                                   IREE_TRACE_SCOPE0("call1");
-                                  EXPECT_EQ(1, user_context);
+                                  EXPECT_EQ((void*)1, user_context);
                                   return iree_ok_status();
                                 },
-                                1),
+                                (void*)1),
                             &call1);
 
 #if 1
diff --git a/iree/task/list.c b/iree/task/list.c
index 984dd50..607ab82 100644
--- a/iree/task/list.c
+++ b/iree/task/list.c
@@ -47,6 +47,7 @@
   while (!iree_task_list_is_empty(list)) {
     iree_task_t* task = iree_task_list_pop_front(list);
     iree_task_discard(task, list);
+    task = NULL;  // invalidated during discard
   }
 }
 
diff --git a/iree/task/list_test.cc b/iree/task/list_test.cc
index 6eef318..c5cb5b2 100644
--- a/iree/task/list_test.cc
+++ b/iree/task/list_test.cc
@@ -108,6 +108,62 @@
   // IMPLICIT: if the tasks were not released back to the pool we'll leak.
 }
 
+TEST(TaskListTest, DiscardSequence) {
+  auto pool = AllocateNopPool();
+  auto scope = AllocateScope("a");
+
+  iree_task_list_t list;
+  iree_task_list_initialize(&list);
+  EXPECT_TRUE(iree_task_list_is_empty(&list));
+
+  auto task0 = AcquireNopTask(pool, scope, 0);
+  auto task1 = AcquireNopTask(pool, scope, 1);
+  auto task2 = AcquireNopTask(pool, scope, 2);
+  auto task3 = AcquireNopTask(pool, scope, 3);
+  iree_task_set_completion_task(task0, task1);
+  iree_task_set_completion_task(task1, task2);
+  iree_task_set_completion_task(task2, task3);
+  iree_task_list_push_back(&list, task0);
+  iree_task_list_push_back(&list, task1);
+  iree_task_list_push_back(&list, task2);
+  iree_task_list_push_back(&list, task3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+  EXPECT_TRUE(CheckListOrderFIFO(&list));
+
+  iree_task_list_discard(&list);
+  EXPECT_TRUE(iree_task_list_is_empty(&list));
+
+  // IMPLICIT: if the tasks were not released back to the pool we'll leak.
+}
+
+TEST(TaskListTest, DiscardJoin) {
+  auto pool = AllocateNopPool();
+  auto scope = AllocateScope("a");
+
+  iree_task_list_t list;
+  iree_task_list_initialize(&list);
+  EXPECT_TRUE(iree_task_list_is_empty(&list));
+
+  auto task0 = AcquireNopTask(pool, scope, 0);
+  auto task1 = AcquireNopTask(pool, scope, 1);
+  auto task2 = AcquireNopTask(pool, scope, 2);
+  auto task3 = AcquireNopTask(pool, scope, 3);
+  iree_task_set_completion_task(task0, task3);
+  iree_task_set_completion_task(task1, task3);
+  iree_task_set_completion_task(task2, task3);
+  iree_task_list_push_back(&list, task0);
+  iree_task_list_push_back(&list, task1);
+  iree_task_list_push_back(&list, task2);
+  iree_task_list_push_back(&list, task3);
+  EXPECT_EQ(4, iree_task_list_calculate_size(&list));
+  EXPECT_TRUE(CheckListOrderFIFO(&list));
+
+  iree_task_list_discard(&list);
+  EXPECT_TRUE(iree_task_list_is_empty(&list));
+
+  // IMPLICIT: if the tasks were not released back to the pool we'll leak.
+}
+
 TEST(TaskListTest, PushFront) {
   auto pool = AllocateNopPool();
   auto scope = AllocateScope("a");
diff --git a/iree/task/scope.c b/iree/task/scope.c
index 37aaf5e..723a07b 100644
--- a/iree/task/scope.c
+++ b/iree/task/scope.c
@@ -66,6 +66,11 @@
   return result;
 }
 
+bool iree_task_scope_has_failed(iree_task_scope_t* scope) {
+  return iree_atomic_load_intptr(&scope->permanent_status,
+                                 iree_memory_order_seq_cst) != 0;
+}
+
 iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope) {
   iree_status_t old_status = iree_ok_status();
   iree_status_t new_status = iree_ok_status();
@@ -107,7 +112,6 @@
 
 void iree_task_scope_fail(iree_task_scope_t* scope, iree_task_t* task,
                           iree_status_t status) {
-  // TODO(benvanik): logging/tracing based on task.
   iree_task_scope_try_set_status(scope, status);
 }
 
diff --git a/iree/task/scope.h b/iree/task/scope.h
index 6e00d28..e13ef0b 100644
--- a/iree/task/scope.h
+++ b/iree/task/scope.h
@@ -51,7 +51,7 @@
   IREE_TRACE(uint32_t task_trace_color;)
 
   // A permanent status code set when a task within the scope fails. All pending
-  // tasks will be cancelled, though any in-flight tasks may continue executing
+  // tasks will be aborted, though any in-flight tasks may continue executing
   // to completion.
   iree_atomic_intptr_t permanent_status;
 
@@ -104,6 +104,11 @@
 iree_task_dispatch_statistics_t iree_task_scope_consume_statistics(
     iree_task_scope_t* scope);
 
+// Returns true if the scope has failed.
+// iree_task_scope_consume_status can be used once to get the full status
+// describing the failure; subsequent calls will return only the status code.
+bool iree_task_scope_has_failed(iree_task_scope_t* scope);
+
 // Returns the permanent scope failure status to the caller (transfering
 // ownership). The scope will remain in a failed state with the status code.
 iree_status_t iree_task_scope_consume_status(iree_task_scope_t* scope);
diff --git a/iree/task/task.c b/iree/task/task.c
index 1a7b0ef..537fc77 100644
--- a/iree/task/task.c
+++ b/iree/task/task.c
@@ -53,12 +53,33 @@
   return true;
 }
 
-static void iree_task_cleanup(iree_task_t* task, iree_status_t status) {
+static void iree_task_try_set_status(iree_atomic_intptr_t* permanent_status,
+                                     iree_status_t new_status) {
+  if (IREE_UNLIKELY(iree_status_is_ok(new_status))) return;
+
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_TEXT(z0, "failed: ");
+  IREE_TRACE_ZONE_APPEND_TEXT(
+      z0, iree_status_code_string(iree_status_code(new_status)));
+
+  iree_status_t old_status = iree_ok_status();
+  if (!iree_atomic_compare_exchange_strong_intptr(
+          permanent_status, (intptr_t*)&old_status, (intptr_t)new_status,
+          iree_memory_order_seq_cst, iree_memory_order_seq_cst)) {
+    // Previous status was not OK; drop our new status.
+    IREE_IGNORE_ERROR(new_status);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static void iree_task_cleanup(iree_task_t* task,
+                              iree_status_code_t status_code) {
   // Call the (optional) cleanup function.
   // NOTE: this may free the memory of the task itself!
   iree_task_pool_t* pool = task->pool;
   if (task->cleanup_fn) {
-    task->cleanup_fn(task, iree_ok_status());
+    task->cleanup_fn(task, status_code);
   }
 
   // Return the task to the pool it was allocated from.
@@ -69,17 +90,31 @@
   }
 }
 
+static void iree_task_barrier_discard(iree_task_barrier_t* task,
+                                      iree_task_list_t* discard_worklist);
+static void iree_task_fence_discard(iree_task_fence_t* task,
+                                    iree_task_list_t* discard_worklist);
+
 void iree_task_discard(iree_task_t* task, iree_task_list_t* discard_worklist) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  // NOTE: we always try adding to the head of the discard_worklist so that
-  // we hopefully get some locality benefits. This models a DFS discard in
-  // our non-recursive approach.
+  // This models a BFS discard in our non-recursive approach.
+  // We must ensure that we only discard each task once and that we discard the
+  // tasks in the appropriate order: if we had a DAG of A -> B, C -> D we must
+  // discard respecting the same topological ordering.
+
+  IREE_ASSERT_EQ(0, iree_atomic_load_int32(&task->pending_dependency_count,
+                                           iree_memory_order_acquire));
 
   // Almost all tasks will have a completion task; some may have additional
   // dependent tasks (like barriers) that will be handled below.
-  if (task->completion_task) {
-    iree_task_list_push_front(discard_worklist, task->completion_task);
+  const bool completion_task_ready =
+      task->completion_task &&
+      iree_atomic_fetch_sub_int32(
+          &task->completion_task->pending_dependency_count, 1,
+          iree_memory_order_acq_rel) == 1;
+  if (completion_task_ready) {
+    iree_task_list_push_back(discard_worklist, task->completion_task);
   }
 
   switch (task->type) {
@@ -87,49 +122,70 @@
     case IREE_TASK_TYPE_NOP:
     case IREE_TASK_TYPE_CALL:
       break;
-    case IREE_TASK_TYPE_BARRIER: {
-      iree_task_barrier_t* barrier_task = (iree_task_barrier_t*)task;
-      for (uint32_t i = 0; i < barrier_task->dependent_task_count; ++i) {
-        iree_task_list_push_front(discard_worklist,
-                                  barrier_task->dependent_tasks[i]);
-      }
+    case IREE_TASK_TYPE_BARRIER:
+      iree_task_barrier_discard((iree_task_barrier_t*)task, discard_worklist);
       break;
-    }
-    case IREE_TASK_TYPE_FENCE: {
-      // TODO(benvanik): signal as error.
-      // iree_task_fence_t* fence_task = (iree_task_fence_t*)task;
+    case IREE_TASK_TYPE_FENCE:
       iree_task_scope_end(task->scope);
       break;
-    }
     case IREE_TASK_TYPE_WAIT:
     case IREE_TASK_TYPE_DISPATCH:
-    case IREE_TASK_TYPE_DISPATCH_SLICE:
       break;
   }
 
-  iree_task_cleanup(task, iree_status_from_code(IREE_STATUS_ABORTED));
+  iree_task_cleanup(task, IREE_STATUS_ABORTED);
   // NOTE: task is invalidated here and cannot be used!
+  task = NULL;
 
   IREE_TRACE_ZONE_END(z0);
 }
 
 static void iree_task_retire(iree_task_t* task,
-                             iree_task_submission_t* pending_submission) {
+                             iree_task_submission_t* pending_submission,
+                             iree_status_t status) {
   IREE_ASSERT_EQ(0, iree_atomic_load_int32(&task->pending_dependency_count,
                                            iree_memory_order_acquire));
 
   // Decrement the pending count on the completion task, if any.
   iree_task_t* completion_task = task->completion_task;
   task->completion_task = NULL;
-  if (completion_task &&
+  bool completion_task_ready =
+      completion_task &&
       iree_atomic_fetch_sub_int32(&completion_task->pending_dependency_count, 1,
-                                  iree_memory_order_acq_rel) == 1) {
-    // The completion task has retired and can now be made ready.
-    iree_task_submission_enqueue(pending_submission, completion_task);
+                                  iree_memory_order_acq_rel) == 1;
+
+  if (iree_status_is_ok(status)) {
+    // Task completed successfully.
+    iree_task_cleanup(task, IREE_STATUS_OK);
+    if (completion_task_ready) {
+      // This was the last pending dependency and the completion task is ready
+      // to run.
+      iree_task_submission_enqueue(pending_submission, completion_task);
+    }
+  } else {
+    // Task failed.
+    iree_task_scope_fail(task->scope, task, status);
+    status = iree_ok_status();  // consumed by the fail
+    iree_task_cleanup(task, IREE_STATUS_ABORTED);
+    if (completion_task_ready) {
+      // This was the last pending dependency and we know that we can safely
+      // abort the completion task by discarding.
+      iree_task_list_t discard_worklist;
+      iree_task_list_initialize(&discard_worklist);
+      iree_task_discard(completion_task, &discard_worklist);
+      iree_task_list_discard(&discard_worklist);
+    } else if (completion_task) {
+      // One or more pending dependencies are not yet satisfied and the
+      // completion task must stay alive. We can mark it as aborted, though,
+      // so that it knows not to execute when it is ready to run.
+      // TODO(benvanik): make this atomic? we only ever add bits and it's safe
+      // for it to run if we got this far.
+      completion_task->flags |= IREE_TASK_FLAG_ABORTED;
+    }
   }
 
-  iree_task_cleanup(task, iree_ok_status());
   // NOTE: task is invalidated here and cannot be used!
+  task = NULL;
 }
 
 //==============================================================================
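
Both the discard path and the retire path above hand off dependents with the same counter protocol: whichever caller decrements a dependent's pending_dependency_count to zero owns that transition and must enqueue (on success) or discard (on failure) the dependent exactly once. A minimal sketch of that pattern using plain C11 atomics; the example_* names are illustrative and not IREE APIs:

    #include <stdatomic.h>
    #include <stdbool.h>

    typedef struct example_task_t {
      atomic_int pending_dependency_count;
    } example_task_t;

    // Returns true if the caller resolved the final dependency and is now
    // responsible for readying (or discarding) |dependent| exactly once.
    static bool example_resolve_dependency(example_task_t* dependent) {
      return atomic_fetch_sub_explicit(&dependent->pending_dependency_count, 1,
                                       memory_order_acq_rel) == 1;
    }
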
@@ -143,36 +199,69 @@
 
 void iree_task_nop_retire(iree_task_nop_t* task,
                           iree_task_submission_t* pending_submission) {
-  iree_task_retire(&task->header, pending_submission);
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
 }
 
 //==============================================================================
 // IREE_TASK_TYPE_CALL
 //==============================================================================
 
+// Returns an XXBBGGRR color (red in the lowest bits).
+// Must not be 0 (tracy will ignore).
+static uint32_t iree_math_ptr_to_xrgb(const void* ptr) {
+  // This is just a simple hack to give us a unique(ish) per-pointer color.
+  // It's only to make it easier to distinguish which tiles are from the same
+  // dispatch.
+  uint64_t ptr64 = (uintptr_t)ptr;
+  return (uint32_t)ptr64 ^ (uint32_t)(ptr64 >> 32);
+}
+
 void iree_task_call_initialize(iree_task_scope_t* scope,
                                iree_task_call_closure_t closure,
                                iree_task_call_t* out_task) {
   iree_task_initialize(IREE_TASK_TYPE_CALL, scope, &out_task->header);
   out_task->closure = closure;
+  iree_atomic_store_intptr(&out_task->status, 0, iree_memory_order_release);
 }
 
-iree_status_t iree_task_call_execute(
-    iree_task_call_t* task, iree_task_submission_t* pending_submission) {
+void iree_task_call_execute(iree_task_call_t* task,
+                            iree_task_submission_t* pending_submission) {
   IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_SET_COLOR(z0,
+                            iree_math_ptr_to_xrgb(task->closure.user_context));
 
-  // Execute the user callback.
-  // Note that this may enqueue more nested tasks, including tasks that prevent
-  // this task from retiring.
-  iree_status_t status = task->closure.fn(task->closure.user_context,
-                                          &task->header, pending_submission);
+  if (IREE_LIKELY(
+          !iree_any_bit_set(task->header.flags, IREE_TASK_FLAG_ABORTED))) {
+    // Execute the user callback.
+    // Note that this may enqueue more nested tasks, including tasks that
+    // prevent this task from retiring.
+    iree_status_t status = task->closure.fn(task->closure.user_context,
+                                            &task->header, pending_submission);
+    if (!iree_status_is_ok(status)) {
+      // Stash the failure status on the task.
+      // If there are still pending dependencies we won't be able to discard
+      // immediately and need to keep the status around until they all complete.
+      iree_task_try_set_status(&task->status, status);
+      status = iree_ok_status();  // consumed by try_set_status
+
+      // TODO(benvanik): discard pending_submission? As we may have pending work
+      // from multiple scopes it's dangerous to discard all. We could filter
+      // based on scope, though, and if we did that we (probably) wouldn't need
+      // to handle the permanent status on the task and could discard
+      // immediately.
+    }
+  }
+
+  // Check to see if there are no pending dependencies before retiring; the
+  // dependency count can go up if new nested tasks were enqueued.
   if (iree_atomic_load_int32(&task->header.pending_dependency_count,
                              iree_memory_order_acquire) == 0) {
-    iree_task_retire(&task->header, pending_submission);
+    iree_status_t status = (iree_status_t)iree_atomic_exchange_intptr(
+        &task->status, 0, iree_memory_order_seq_cst);
+    iree_task_retire(&task->header, pending_submission, status);
   }
 
   IREE_TRACE_ZONE_END(z0);
-  return status;
 }
 
 //==============================================================================
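
The call path above stashes only the first failure on the task via iree_task_try_set_status and then retires with that status once all nested work has drained. A rough sketch of the try-set-once idea with C11 atomics (the example_* names are illustrative; the real helper also takes care of releasing the losing iree_status_t):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    // Records |new_status| in |slot| only if no failure has been recorded yet.
    // Returns true if the caller's status was stored; the loser retains
    // ownership of its status and should dispose of it.
    static bool example_try_set_status(atomic_intptr_t* slot,
                                       intptr_t new_status) {
      intptr_t expected = 0;  // 0 == OK / no failure recorded yet
      return atomic_compare_exchange_strong_explicit(
          slot, &expected, new_status, memory_order_acq_rel,
          memory_order_acquire);
    }
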
@@ -212,6 +301,27 @@
   }
 }
 
+static void iree_task_barrier_discard(iree_task_barrier_t* task,
+                                      iree_task_list_t* discard_worklist) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Discard all of the tasks after the barrier.
+  // Note that we need to ensure we only enqueue them for discard after all of
+  // their dependencies have been met - otherwise we'll double-discard.
+  for (iree_host_size_t i = 0; i < task->dependent_task_count; ++i) {
+    iree_task_t* dependent_task = task->dependent_tasks[i];
+    const bool dependent_task_ready =
+        iree_atomic_fetch_sub_int32(&dependent_task->pending_dependency_count,
+                                    1, iree_memory_order_acq_rel) == 1;
+    if (dependent_task_ready) {
+      // The dependent task has retired and can now be discarded.
+      iree_task_list_push_back(discard_worklist, dependent_task);
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
 void iree_task_barrier_retire(iree_task_barrier_t* task,
                               iree_task_submission_t* pending_submission) {
   IREE_TRACE_ZONE_BEGIN(z0);
@@ -227,7 +337,8 @@
     }
   }
 
-  iree_task_retire(&task->header, pending_submission);
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
+
   IREE_TRACE_ZONE_END(z0);
 }
 
@@ -247,7 +358,8 @@
 
   iree_task_scope_end(task->header.scope);
 
-  iree_task_retire(&task->header, pending_submission);
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
+
   IREE_TRACE_ZONE_END(z0);
 }
 
@@ -272,7 +384,7 @@
                            iree_task_submission_t* pending_submission) {
   IREE_TRACE_ZONE_BEGIN(z0);
   // TODO(benvanik): allow deinit'ing the wait handle (if transient).
-  iree_task_retire(&task->header, pending_submission);
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
   IREE_TRACE_ZONE_END(z0);
 }
 
@@ -282,16 +394,6 @@
 
 // Returns an XXBBGGRR color (red in the lowest bits).
 // Must not be 0 (tracy will ignore).
-static uint32_t iree_math_ptr_to_xrgb(const uintptr_t ptr) {
-  // This is just a simple hack to give us a unique(ish) per-pointer color.
-  // It's only to make it easier to distinguish which tiles are from the same
-  // dispatch.
-  uint64_t ptr64 = ptr;
-  return (uint32_t)ptr64 ^ (uint32_t)(ptr64 >> 32);
-}
-
-// Returns an XXBBGGRR color (red in the lowest bits).
-// Must not be 0 (tracy will ignore).
 static uint32_t iree_task_tile_to_color(
     const iree_task_tile_context_t* tile_context);
 
@@ -376,7 +478,14 @@
   memcpy(out_task->workgroup_size, workgroup_size,
          sizeof(out_task->workgroup_size));
   out_task->local_memory_size = 0;
+  iree_atomic_store_intptr(&out_task->status, 0, iree_memory_order_release);
   memset(&out_task->statistics, 0, sizeof(out_task->statistics));
+
+  IREE_TRACE({
+    static iree_atomic_int64_t next_dispatch_id = IREE_ATOMIC_VAR_INIT(0);
+    out_task->dispatch_id = iree_atomic_fetch_add_int64(
+        &next_dispatch_id, 1ll, iree_memory_order_acq_rel);
+  });
 }
 
 void iree_task_dispatch_initialize(iree_task_scope_t* scope,
@@ -398,131 +507,18 @@
   out_task->workgroup_count.ptr = workgroup_count_ptr;
 }
 
-void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
-                                     iree_task_pool_t* slice_task_pool,
-                                     iree_task_submission_t* pending_submission,
-                                     iree_task_post_batch_t* post_batch) {
+void iree_task_dispatch_issue(iree_task_dispatch_t* dispatch_task,
+                              iree_task_pool_t* shard_task_pool,
+                              iree_task_submission_t* pending_submission,
+                              iree_task_post_batch_t* post_batch) {
   IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
 
   // Mark the dispatch as having been issued; the next time it retires it'll be
   // because all work has completed.
   dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
 
   // Fetch the workgroup count (directly or indirectly).
-  // By the task being ready to execute we know any dependencies on the
-  // indirection buffer have been satisfied and its safe to read.
-  uint32_t workgroup_count[3];
-  if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
-    memcpy(workgroup_count, dispatch_task->workgroup_count.ptr,
-           sizeof(workgroup_count));
-  } else {
-    memcpy(workgroup_count, dispatch_task->workgroup_count.value,
-           sizeof(workgroup_count));
-  }
-  uint32_t total_workgroup_count =
-      workgroup_count[0] * workgroup_count[1] * workgroup_count[2];
-  if (total_workgroup_count == 0) {
-    // No workgroups to execute - bail early.
-    iree_task_dispatch_retire(dispatch_task, pending_submission);
-    IREE_TRACE_ZONE_END(z0);
-    return;
-  }
-
-#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
-  char xyz_string[32];
-  int xyz_string_length =
-      snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
-               workgroup_count[0], workgroup_count[1], workgroup_count[2]);
-  IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
-#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
-
-  // Divide up all tiles into slices, our finest-granularity scheduling task.
-  const uint32_t tiles_per_slice_x = IREE_TASK_DISPATCH_TILES_PER_SLICE_X;
-  const uint32_t tiles_per_slice_y = IREE_TASK_DISPATCH_TILES_PER_SLICE_Y;
-  const uint32_t tiles_per_slice_z = IREE_TASK_DISPATCH_TILES_PER_SLICE_Z;
-  uint32_t slice_count_x = iree_max(1, workgroup_count[0] / tiles_per_slice_x);
-  uint32_t slice_count_y = iree_max(1, workgroup_count[1] / tiles_per_slice_y);
-  uint32_t slice_count_z = iree_max(1, workgroup_count[2] / tiles_per_slice_z);
-
-  // Compute how many slices each worker will process.
-  uint32_t slice_count = slice_count_x * slice_count_y * slice_count_z;
-  iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
-  uint32_t slices_per_worker = iree_max(1, slice_count / worker_count);
-
-  // Randomize starting worker.
-  iree_host_size_t worker_offset = iree_task_post_batch_select_worker(
-      post_batch, dispatch_task->header.affinity_set);
-  iree_host_size_t worker_index = worker_offset;
-
-  // TODO(benvanik): rework this with some science. For now we just iteratively
-  // divide up the space from outer->inner scheduling dimension, but ideally
-  // we'd use some fun cray-style torus scheduling or hilbert curve magic to
-  // try to ensure better locality using worker constructive sharing masks.
-  // TODO(benvanik): observe affinity_set here when dividing ranges.
-  iree_host_size_t worker_slice_count = 0;
-  for (uint32_t slice_z = 0; slice_z < slice_count_z; ++slice_z) {
-    for (uint32_t slice_y = 0; slice_y < slice_count_y; ++slice_y) {
-      for (uint32_t slice_x = 0; slice_x < slice_count_x; ++slice_x) {
-        uint32_t workgroup_base[3];
-        workgroup_base[0] = slice_x * tiles_per_slice_x;
-        workgroup_base[1] = slice_y * tiles_per_slice_y;
-        workgroup_base[2] = slice_z * tiles_per_slice_z;
-        uint32_t workgroup_range[3];
-        workgroup_range[0] = iree_min(workgroup_count[0],
-                                      workgroup_base[0] + tiles_per_slice_x) -
-                             1;
-        workgroup_range[1] = iree_min(workgroup_count[1],
-                                      workgroup_base[1] + tiles_per_slice_y) -
-                             1;
-        workgroup_range[2] = iree_min(workgroup_count[2],
-                                      workgroup_base[2] + tiles_per_slice_z) -
-                             1;
-
-        // Allocate and initialize the slice.
-        iree_task_dispatch_slice_t* slice_task =
-            iree_task_dispatch_slice_allocate(dispatch_task, workgroup_base,
-                                              workgroup_range, workgroup_count,
-                                              slice_task_pool);
-
-        // Enqueue on the worker selected for the task.
-        iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
-                                     &slice_task->header);
-        if (++worker_slice_count >= slices_per_worker) {
-          ++worker_index;
-          worker_slice_count = 0;
-        }
-      }
-    }
-  }
-
-  // NOTE: the dispatch is not retired until all slices complete. Upon the last
-  // slice completing the lucky worker will retire the task inline and
-  // potentially queue up more ready tasks that follow.
-  //
-  // The gotcha here is that it's possible for there to be zero slices within
-  // a dispatch (if, for example, and indirect dispatch had its workgroup counts
-  // set to zero to prevent it from running). We check for that here.
-  if (slice_count == 0) {
-    iree_task_dispatch_retire(dispatch_task, pending_submission);
-  }
-
-  IREE_TRACE_ZONE_END(z0);
-}
-
-void iree_task_dispatch_issue_sharded(
-    iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
-    iree_task_submission_t* pending_submission,
-    iree_task_post_batch_t* post_batch) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-
-  // Mark the dispatch as having been issued; the next time it retires it'll be
-  // because all work has completed.
-  dispatch_task->header.flags |= IREE_TASK_FLAG_DISPATCH_RETIRE;
-
-  iree_task_dispatch_shard_state_t* shared_state =
-      &dispatch_task->shared.shard_state;
-
-  // Fetch the workgroup count (directly or indirectly).
   if (dispatch_task->header.flags & IREE_TASK_FLAG_DISPATCH_INDIRECT) {
     // By the task being ready to execute we know any dependencies on the
     // indirection buffer have been satisfied and its safe to read. We perform
@@ -537,35 +533,35 @@
   }
   const uint32_t* workgroup_count = dispatch_task->workgroup_count.value;
 
-#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
-  char xyz_string[32];
-  int xyz_string_length =
-      snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
-               workgroup_count[0], workgroup_count[1], workgroup_count[2]);
-  IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
-#endif  // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION
+  IREE_TRACE({
+    char xyz_string[32];
+    int xyz_string_length =
+        snprintf(xyz_string, IREE_ARRAYSIZE(xyz_string), "%ux%ux%u",
+                 workgroup_count[0], workgroup_count[1], workgroup_count[2]);
+    IREE_TRACE_ZONE_APPEND_TEXT_STRING_VIEW(z0, xyz_string, xyz_string_length);
+  });
 
   // Setup the iteration space for shards to pull work from the complete grid.
-  iree_atomic_store_int32(&shared_state->tile_index, 0,
+  iree_atomic_store_int32(&dispatch_task->tile_index, 0,
                           iree_memory_order_relaxed);
-  shared_state->tile_count =
+  dispatch_task->tile_count =
       workgroup_count[0] * workgroup_count[1] * workgroup_count[2];
 
   // Compute shard count - almost always worker_count unless we are a very small
   // dispatch (1x1x1, etc).
   iree_host_size_t worker_count = iree_task_post_batch_worker_count(post_batch);
   iree_host_size_t shard_count =
-      iree_min(shared_state->tile_count, worker_count);
+      iree_min(dispatch_task->tile_count, worker_count);
 
   // Compute how many tiles we want each shard to reserve at a time from the
   // larger grid. A higher number reduces overhead and improves locality while
   // a lower number reduces maximum worst-case latency (coarser work stealing).
-  if (shared_state->tile_count <
+  if (dispatch_task->tile_count <
       worker_count * IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION) {
     // Grid is small - allow it to be eagerly sliced up.
-    shared_state->tiles_per_reservation = 1;
+    dispatch_task->tiles_per_reservation = 1;
   } else {
-    shared_state->tiles_per_reservation =
+    dispatch_task->tiles_per_reservation =
         IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION;
   }
 
@@ -576,8 +572,8 @@
 
   for (iree_host_size_t i = 0; i < shard_count; ++i) {
     // Allocate and initialize the shard.
-    iree_task_dispatch_shard_t* shard_task = iree_task_dispatch_shard_allocate(
-        dispatch_task, shared_state, shard_task_pool);
+    iree_task_dispatch_shard_t* shard_task =
+        iree_task_dispatch_shard_allocate(dispatch_task, shard_task_pool);
 
     // Enqueue on the worker selected for the task.
     iree_task_post_batch_enqueue(post_batch, worker_index % worker_count,
@@ -602,6 +598,7 @@
 void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
                                iree_task_submission_t* pending_submission) {
   IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
 
   // TODO(benvanik): attach statistics to the tracy zone.
 
@@ -611,160 +608,40 @@
       &dispatch_task->statistics,
       &dispatch_task->header.scope->dispatch_statistics);
 
-  iree_task_retire(&dispatch_task->header, pending_submission);
+  // Consume the status of the dispatch that may have been set from a workgroup
+  // and notify the scope. We need to do this here so that each shard retires
+  // before we discard any subsequent tasks: otherwise a failure of one shard
+  // would discard the shared dispatch task (and potentially everything) while
+  // other shards were still running. We also want to avoid fine-grained
+  // synchronization across shards that would occur by each checking to see if
+  // any other has hit an error; failure in a dispatch should be so exceedingly
+  // rare that allowing some shards to complete after one encounters an error is
+  // not a problem.
+  iree_status_t status = (iree_status_t)iree_atomic_exchange_intptr(
+      &dispatch_task->status, 0, iree_memory_order_seq_cst);
+
+  iree_task_retire(&dispatch_task->header, pending_submission, status);
   IREE_TRACE_ZONE_END(z0);
 }
 
 //==============================================================================
-// IREE_TASK_TYPE_DISPATCH_SLICE
-//==============================================================================
-
-void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
-                                         const uint32_t workgroup_base[3],
-                                         const uint32_t workgroup_range[3],
-                                         const uint32_t workgroup_count[3],
-                                         iree_task_dispatch_slice_t* out_task) {
-  iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SLICE,
-                       dispatch_task->header.scope, &out_task->header);
-  iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
-  out_task->closure = dispatch_task->closure;
-
-  memcpy(out_task->workgroup_base, workgroup_base,
-         sizeof(out_task->workgroup_base));
-  memcpy(out_task->workgroup_range, workgroup_range,
-         sizeof(out_task->workgroup_range));
-  memcpy(out_task->workgroup_size, dispatch_task->workgroup_size,
-         sizeof(out_task->workgroup_size));
-  memcpy(out_task->workgroup_count, workgroup_count,
-         sizeof(out_task->workgroup_count));
-
-  // Each slice requires at most this amount of memory from the worker-local
-  // pool.
-  out_task->local_memory_size = dispatch_task->local_memory_size;
-
-  // Wire up dispatch statistics; we'll track on the slice while we run and
-  // then the per-slice statistics will roll up into the dispatch statistics.
-  out_task->dispatch_statistics = &dispatch_task->statistics;
-  memset(&out_task->slice_statistics, 0, sizeof(out_task->slice_statistics));
-}
-
-iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
-    iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
-    const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
-    iree_task_pool_t* slice_task_pool) {
-  iree_task_dispatch_slice_t* slice_task = NULL;
-  iree_status_t status =
-      iree_task_pool_acquire(slice_task_pool, (iree_task_t**)&slice_task);
-  if (!iree_status_is_ok(status)) {
-    iree_status_ignore(status);
-    return NULL;
-  }
-  iree_task_dispatch_slice_initialize(dispatch_task, workgroup_base,
-                                      workgroup_range, workgroup_count,
-                                      slice_task);
-  slice_task->header.pool = slice_task_pool;
-  return slice_task;
-}
-
-iree_status_t iree_task_dispatch_slice_execute(
-    iree_task_dispatch_slice_t* task, iree_byte_span_t local_memory,
-    iree_task_submission_t* pending_submission) {
-  IREE_TRACE_ZONE_BEGIN(z0);
-  IREE_TRACE_ZONE_SET_COLOR(z0,
-                            iree_math_ptr_to_xrgb(task->closure.user_context));
-
-  // TODO(benvanik): coroutine support. Ideally this function can be called
-  // multiple times for the same slice, and we'll have a way to ready up the
-  // slices on the same workers (some per-worker suspended list?).
-
-  // Prepare context shared for all tiles in the slice.
-  iree_task_tile_context_t tile_context;
-  memcpy(&tile_context.workgroup_size, task->workgroup_size,
-         sizeof(tile_context.workgroup_size));
-  memcpy(&tile_context.workgroup_count, task->workgroup_count,
-         sizeof(tile_context.workgroup_count));
-  tile_context.statistics = &task->slice_statistics;
-
-  // Map only the requested amount of worker local memory into the tile context.
-  // This ensures that how much memory is used by some executions does not
-  // inadvertently leak over into other executions.
-  if (IREE_UNLIKELY(task->local_memory_size > local_memory.data_length)) {
-    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
-                            "dispatch requires %ub of local memory but only "
-                            "%zub is available per-worker",
-                            task->local_memory_size, local_memory.data_length);
-  }
-  tile_context.local_memory =
-      iree_make_byte_span(local_memory.data, task->local_memory_size);
-
-  const uint32_t base_x = task->workgroup_base[0];
-  const uint32_t base_y = task->workgroup_base[1];
-  const uint32_t base_z = task->workgroup_base[2];
-  const uint32_t range_x = task->workgroup_range[0];
-  const uint32_t range_y = task->workgroup_range[1];
-  const uint32_t range_z = task->workgroup_range[2];
-  for (uint32_t z = base_z; z <= range_z; ++z) {
-    tile_context.workgroup_xyz[2] = z;
-    for (uint32_t y = base_y; y <= range_y; ++y) {
-      tile_context.workgroup_xyz[1] = y;
-      for (uint32_t x = base_x; x <= range_x; ++x) {
-        tile_context.workgroup_xyz[0] = x;
-        IREE_TRACE_ZONE_BEGIN_NAMED(z_tile,
-                                    "iree_task_dispatch_slice_execute_tile");
-        IREE_TRACE_ZONE_SET_COLOR(z_tile,
-                                  iree_task_tile_to_color(&tile_context));
-
-        // NOTE: these are useful for debugging but dramatically increase our
-        // cost here; only enable if needed for tracking work distribution:
-        IREE_TRACE_ZONE_APPEND_VALUE(z_tile, x);
-        IREE_TRACE_ZONE_APPEND_VALUE(z_tile, y);
-        IREE_TRACE_ZONE_APPEND_VALUE(z_tile, z);
-        // IREE_TRACE_ZONE_APPEND_VALUE(z_tile, (uint64_t)task->closure.fn);
-
-        iree_status_t status = task->closure.fn(
-            task->closure.user_context, &tile_context, pending_submission);
-
-        IREE_TRACE_ZONE_END(z_tile);
-        if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
-          // NOTE: we don't bother to update statistics here on failure as the
-          // partial results won't really help much.
-          IREE_TRACE_ZONE_END(z0);
-          return status;
-        }
-      }
-    }
-  }
-
-  // Push aggregate statistics up to the dispatch.
-  if (task->dispatch_statistics) {
-    iree_task_dispatch_statistics_merge(&task->slice_statistics,
-                                        task->dispatch_statistics);
-  }
-
-  iree_task_retire(&task->header, pending_submission);
-  IREE_TRACE_ZONE_END(z0);
-  return iree_ok_status();
-}
-
-//==============================================================================
 // IREE_TASK_TYPE_DISPATCH_SHARD
 //==============================================================================
 
-void iree_task_dispatch_shard_initialize(
-    iree_task_dispatch_t* dispatch_task,
-    iree_task_dispatch_shard_state_t* shared_state,
-    iree_task_dispatch_shard_t* out_task) {
+static inline iree_task_dispatch_t* iree_task_dispatch_shard_parent(
+    iree_task_dispatch_shard_t* task) {
+  return (iree_task_dispatch_t*)task->header.completion_task;
+}
+
+void iree_task_dispatch_shard_initialize(iree_task_dispatch_t* dispatch_task,
+                                         iree_task_dispatch_shard_t* out_task) {
   iree_task_initialize(IREE_TASK_TYPE_DISPATCH_SHARD,
                        dispatch_task->header.scope, &out_task->header);
   iree_task_set_completion_task(&out_task->header, &dispatch_task->header);
-  out_task->dispatch_task = dispatch_task;
-  out_task->shared_state = shared_state;
 }
 
 iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
-    iree_task_dispatch_t* dispatch_task,
-    iree_task_dispatch_shard_state_t* shared_state,
-    iree_task_pool_t* shard_task_pool) {
+    iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool) {
   iree_task_dispatch_shard_t* shard_task = NULL;
   iree_status_t status =
       iree_task_pool_acquire(shard_task_pool, (iree_task_t**)&shard_task);
@@ -772,22 +649,41 @@
     iree_status_ignore(status);
     return NULL;
   }
-  iree_task_dispatch_shard_initialize(dispatch_task, shared_state, shard_task);
+  iree_task_dispatch_shard_initialize(dispatch_task, shard_task);
   shard_task->header.pool = shard_task_pool;
   return shard_task;
 }
 
-iree_status_t iree_task_dispatch_shard_execute(
-    iree_task_dispatch_shard_t* task, iree_byte_span_t local_memory,
+void iree_task_dispatch_shard_execute(
+    iree_task_dispatch_shard_t* task, iree_byte_span_t worker_local_memory,
     iree_task_submission_t* pending_submission) {
   IREE_TRACE_ZONE_BEGIN(z0);
 
-  iree_task_dispatch_t* dispatch_task = task->dispatch_task;
+  iree_task_dispatch_t* dispatch_task = iree_task_dispatch_shard_parent(task);
+  IREE_TRACE_ZONE_APPEND_VALUE(z0, dispatch_task->dispatch_id);
   IREE_TRACE_ZONE_SET_COLOR(
       z0, iree_math_ptr_to_xrgb(dispatch_task->closure.user_context));
 
+  // Map only the requested amount of worker local memory into the tile context.
+  // This ensures that how much memory is used by some executions does not
+  // inadvertently leak over into other executions.
+  if (IREE_UNLIKELY(dispatch_task->local_memory_size >
+                    worker_local_memory.data_length)) {
+    iree_task_try_set_status(
+        &dispatch_task->status,
+        iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
+                         "dispatch requires %ub of local memory but only "
+                         "%zub is available per-worker",
+                         dispatch_task->local_memory_size,
+                         worker_local_memory.data_length));
+    iree_task_retire(&task->header, pending_submission, iree_ok_status());
+    IREE_TRACE_ZONE_END(z0);
+    return;
+  }
+  iree_byte_span_t local_memory = iree_make_byte_span(
+      worker_local_memory.data, dispatch_task->local_memory_size);
+
   // Prepare context shared for all tiles in the shard.
-  iree_task_dispatch_shard_state_t* shared_state = task->shared_state;
   iree_task_tile_context_t tile_context;
   memcpy(&tile_context.workgroup_size, dispatch_task->workgroup_size,
          sizeof(tile_context.workgroup_size));
@@ -795,20 +691,7 @@
          sizeof(tile_context.workgroup_count));
   uint32_t workgroup_count_x = tile_context.workgroup_count[0];
   uint32_t workgroup_count_y = tile_context.workgroup_count[1];
-
-  // Map only the requested amount of worker local memory into the tile context.
-  // This ensures that how much memory is used by some executions does not
-  // inadvertently leak over into other executions.
-  if (IREE_UNLIKELY(dispatch_task->local_memory_size >
-                    local_memory.data_length)) {
-    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
-                            "dispatch requires %ub of local memory but only "
-                            "%zub is available per-worker",
-                            dispatch_task->local_memory_size,
-                            local_memory.data_length);
-  }
-  tile_context.local_memory =
-      iree_make_byte_span(local_memory.data, dispatch_task->local_memory_size);
+  tile_context.local_memory = local_memory;
 
   // We perform all our shard statistics work locally here and only push back to
   // the dispatch at the end; this avoids contention from each shard trying to
@@ -818,9 +701,9 @@
   tile_context.statistics = &shard_statistics;
 
   // Loop over all tiles until they are all processed.
-  const uint32_t tile_count = shared_state->tile_count;
-  const uint32_t tiles_per_reservation = shared_state->tiles_per_reservation;
-  uint32_t tile_base = iree_atomic_fetch_add_int32(&shared_state->tile_index,
+  const uint32_t tile_count = dispatch_task->tile_count;
+  const uint32_t tiles_per_reservation = dispatch_task->tiles_per_reservation;
+  uint32_t tile_base = iree_atomic_fetch_add_int32(&dispatch_task->tile_index,
                                                    tiles_per_reservation,
                                                    iree_memory_order_relaxed);
   while (tile_base < tile_count) {
@@ -853,24 +736,35 @@
                                     &tile_context, pending_submission);
 
       IREE_TRACE_ZONE_END(z_tile);
-      if (IREE_UNLIKELY(!iree_status_is_ok(status))) {
-        // NOTE: we don't bother to update statistics here on failure as the
-        // partial results won't really help much.
-        IREE_TRACE_ZONE_END(z0);
-        return status;
+
+      // If any tile fails we bail early from the loop. This doesn't match
+      // what an accelerator would do but saves some unneeded work.
+      // Note that other shards may have completed execution, be executing
+      // concurrently with this one, or still be pending - this does not
+      // have any influence on them and they may continue to execute even
+      // after we bail from here.
+      if (!iree_status_is_ok(status)) {
+        // Propagate failures to the dispatch task.
+        iree_task_try_set_status(&dispatch_task->status, status);
+        goto abort_shard;  // out of the while-for nest
       }
     }
 
-    tile_base = iree_atomic_fetch_add_int32(&shared_state->tile_index,
+    // Try to grab the next slice of tiles.
+    tile_base = iree_atomic_fetch_add_int32(&dispatch_task->tile_index,
                                             tiles_per_reservation,
                                             iree_memory_order_relaxed);
   }
+abort_shard:
 
   // Push aggregate statistics up to the dispatch.
+  // Note that we may have partial information here if we errored out of the
+  // loop but that's still useful to know.
   iree_task_dispatch_statistics_merge(&shard_statistics,
                                       &dispatch_task->statistics);
 
-  iree_task_retire(&task->header, pending_submission);
+  // NOTE: even if an error was hit we retire OK - the error has already been
+  // propagated to the dispatch and it'll clean up after all shards are joined.
+  iree_task_retire(&task->header, pending_submission, iree_ok_status());
   IREE_TRACE_ZONE_END(z0);
-  return iree_ok_status();
 }
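
As the shard loop above shows, shards now reserve work by bumping dispatch_task->tile_index with a relaxed fetch_add and processing the [tile_base, tile_base + tiles_per_reservation) window clamped to tile_count. A condensed sketch of that reservation loop, with illustrative example_* names (the real code also builds the per-tile context and propagates closure errors):

    #include <stdatomic.h>
    #include <stdint.h>

    typedef struct example_grid_t {
      atomic_uint tile_index;          // next tile index to hand out
      uint32_t tile_count;             // total tiles in the dispatch grid
      uint32_t tiles_per_reservation;  // tiles grabbed per fetch_add
    } example_grid_t;

    static void example_shard_loop(example_grid_t* grid,
                                   void (*process_tile)(uint32_t tile_index)) {
      uint32_t base = atomic_fetch_add_explicit(&grid->tile_index,
                                                grid->tiles_per_reservation,
                                                memory_order_relaxed);
      while (base < grid->tile_count) {
        uint32_t end = base + grid->tiles_per_reservation;
        if (end > grid->tile_count) end = grid->tile_count;
        for (uint32_t i = base; i < end; ++i) process_tile(i);
        base = atomic_fetch_add_explicit(&grid->tile_index,
                                         grid->tiles_per_reservation,
                                         memory_order_relaxed);
      }
    }
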
diff --git a/iree/task/task.h b/iree/task/task.h
index b90ab67..235a8d9 100644
--- a/iree/task/task.h
+++ b/iree/task/task.h
@@ -54,9 +54,8 @@
   IREE_TASK_TYPE_WAIT = 4u,
 
   // Task is a 3D grid dispatch of zero or more tiles.
-  // Dispatches are issued when ready by either being split into zero or more
-  // slices with one or more tiles each based on the workgroup count or one
-  // shard per worker that should process the dispatch.
+  // Dispatches are issued when ready by being split into one shard per worker
+  // that should process the dispatch.
   //
   // If IREE_TASK_FLAG_DISPATCH_INDIRECT is set then the dispatch reads the
   // workgroup count from a buffer immediately prior to fan-out instead of using
@@ -64,26 +63,21 @@
   //
   // After a dispatch has been issued the IREE_TASK_FLAG_DISPATCH_RETIRE flag is
   // set to indicate that when the dispatch becomes ready again it will be after
-  // all slices have completed.
+  // all shards have completed.
   IREE_TASK_TYPE_DISPATCH = 5u,
 
-  // Task is a slice of a larger contiguous dispatch tile range. The full
-  // dispatch will be sliced into zero or more slices and each slice will be
-  // posted to a particular worker for executiion. If work progresses unevenly
-  // then entire slices will be stolen across workers to balance out the timing.
-  // Slices retire once they have completed the tiles assigned to them.
-  IREE_TASK_TYPE_DISPATCH_SLICE = 6u,
-
   // Task is one of potentially many shards processing a larger dispatch grid.
   // Each shard may have a preference as to which parts of grid it will focus
   // on but is able to otherwise steal any available region directly from the
   // shared dispatch coordination state. Shards retire once there are no more
   // tiles remaining in the dispatch grid.
-  IREE_TASK_TYPE_DISPATCH_SHARD = 7u,
+  IREE_TASK_TYPE_DISPATCH_SHARD = 6u,
 };
 typedef uint8_t iree_task_type_t;
 
 enum iree_task_flag_bits_t {
+  IREE_TASK_FLAG_NONE = 0u,
+
   // The wait handle the task is specified to wait on has resolved and the task
   // can now be considered complete.
   IREE_TASK_FLAG_WAIT_COMPLETED = 1u << 0,
@@ -95,40 +89,46 @@
   // issued.
   IREE_TASK_FLAG_DISPATCH_INDIRECT = 1u << 1,
 
-  // The dispatch should be sliced across workers via the low-contention
-  // IREE_TASK_TYPE_DISPATCH_SLICE mechanism. This moves the dispatch overhead
-  // to the time when the grid is sliced for a savings during when the grid is
-  // executed.
-  IREE_TASK_FLAG_DISPATCH_SLICED = 1u << 2,
-
   // The dispatch has been issued and the task is waiting for one or more
-  // slices to complete. After they complete the dispatch will be readied and
+  // shards to complete. After they complete the dispatch will be readied and
   // can be retired.
   //
   // Though added by the executor after issuing a dispatch users can also set
-  // this to indicate that all dispatch slices for a particular dispatch have
+  // this to indicate that all dispatch shards for a particular dispatch have
   // been statically scheduled. Executors will then skip issuing the dispatch
-  // and instead wait until all slices complete, enabling IREE_TASK_TYPE_BARRIER
+  // and instead wait until all shards complete, enabling IREE_TASK_TYPE_BARRIER
   // behavior but without an additional task as dispatches are still required
-  // to store information for slices.
-  IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 3,
+  // to store information for shards.
+  IREE_TASK_FLAG_DISPATCH_RETIRE = 1u << 2,
+
+  // An error occurred at or before the task and it has been aborted.
+  // Aborted tasks may continue to execute if they're already in-flight but must
+  // not begin execution after the flag has been set.
+  //
+  // The actual error that occurred is routed to the parent task scope as it
+  // happens and may be available for querying before all tasks have been
+  // cleaned up.
+  IREE_TASK_FLAG_ABORTED = 1u << 3,
 };
 typedef uint16_t iree_task_flags_t;
 
 typedef struct iree_task_t iree_task_t;
 
 // A function called to cleanup tasks.
-// The provided |status| is unowned and must be cloned if used beyond the scope
-// of the cleanup function (such as when stored for later usage).
-typedef void(IREE_API_PTR* iree_task_cleanup_fn_t)(iree_task_t* task,
-                                                   iree_status_t status);
+// Each task has its associated cleanup function called exactly once.
+// The provided |status_code| indicates the execution status of the task prior
+// to cleanup and will usually be IREE_STATUS_OK indicating the task was
+// successfully issued or IREE_STATUS_ABORTED if the task was discarded prior
+// issuing.
+typedef void(IREE_API_PTR* iree_task_cleanup_fn_t)(
+    iree_task_t* task, iree_status_code_t status_code);
 
 // A task within the task system that runs on an executor.
 // Tasks have an iree_task_type_t that defines which parameters are valid and
 // how the executor is to treat the task. Dependency edges can be defined that
 // determine the execution order of tasks within the executors.
 struct iree_alignas(iree_max_align_t) iree_task_t {
-  // Instrusive pointer used to store tasks within iree_task_list_t and
+  // Intrusive pointer used to store tasks within iree_task_list_t and
   // iree_atomic_task_list_t singly-linked lists. This must come first in the
   // structure so that it is at the appropriate alignment.
   iree_task_t* next_task;
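
With the typedef change above, a cleanup hook receives only a status code rather than an owned iree_status_t. A hypothetical hook matching the new signature might look like the following (my_task_cleanup and its behavior are illustrative, not part of IREE):

    // Assumes #include "iree/task/task.h".
    // Called exactly once per task: IREE_STATUS_OK on normal retirement or
    // IREE_STATUS_ABORTED if the task was discarded before it could issue.
    static void my_task_cleanup(iree_task_t* task,
                                iree_status_code_t status_code) {
      if (status_code != IREE_STATUS_OK) {
        // The task never executed; release any resources it still owns here.
      }
      // Free or recycle the task storage here if it was heap allocated.
    }

    // Registered on a task prior to submission:
    //   iree_task_set_cleanup_fn(&my_task->header, my_task_cleanup);
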
@@ -175,6 +175,9 @@
 };
 static_assert(offsetof(iree_task_t, next_task) == 0,
               "next_task intrusive pointer must be at offset 0");
+static_assert(sizeof(iree_task_t) <= 64,
+              "the task header greatly influences pool sizes due to alignment "
+              "requirements and should be kept tiny");
 
 // Initializes a task header with the given type.
 // Must be called on all tasks to ensure proper dependency tracking and list
@@ -185,7 +188,9 @@
                           iree_task_t* out_task);
 
 // Sets the optional function called when the task completes (whether successful
-// or not).
+// or not). The cleanup function will receive a status indicating whether the
+// cleanup is from expected execution as the task retires (IREE_STATUS_OK)
+// or because it was aborted (IREE_STATUS_ABORTED).
 void iree_task_set_cleanup_fn(iree_task_t* task,
                               iree_task_cleanup_fn_t cleanup_fn);
 
@@ -231,7 +236,7 @@
 //==============================================================================
 
 typedef iree_status_t(IREE_API_PTR* iree_task_call_closure_fn_t)(
-    uintptr_t user_context, iree_task_t* task,
+    void* user_context, iree_task_t* task,
     iree_task_submission_t* pending_submission);
 
 // A function closure representing the function to call and its arguments.
@@ -239,12 +244,11 @@
   // Function called per tile invocation.
   iree_task_call_closure_fn_t fn;
 
-  // User-defined argument passed to task functions during invocation.
-  // Opaque pointer-sized values that could point to user data structures or
-  // contain embedded values. No lifetime management is performed by the task
-  // system and it is required that users ensure that the memory referenced is
-  // live until after the task has completed.
-  uintptr_t user_context;
+  // Opaque pointer to a user-provided data structure.
+  // No lifetime management is performed by the task system and it is required
+  // that users ensure that the memory referenced is live until after the task
+  // has completed.
+  void* user_context;
 
   // TODO(benvanik): cleanup function? right now assume arg is never freed.
 } iree_task_call_closure_t;
@@ -253,7 +257,7 @@
 // If the arguments represent pointers they must remain live until the task
 // has completed execution.
 static inline iree_task_call_closure_t iree_task_make_call_closure(
-    iree_task_call_closure_fn_t fn, uintptr_t user_context) {
+    iree_task_call_closure_fn_t fn, void* user_context) {
   iree_task_call_closure_t closure = {fn, user_context};
   return closure;
 }
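
Following the uintptr_t -> void* change, callers now pass their context pointer directly to the closure. A small usage sketch under that assumption (my_context_t and my_call_fn are illustrative; scope setup and task submission are elided):

    // Assumes #include "iree/task/task.h".
    typedef struct my_context_t {
      int counter;
    } my_context_t;

    static iree_status_t my_call_fn(void* user_context, iree_task_t* task,
                                    iree_task_submission_t* pending_submission) {
      my_context_t* ctx = (my_context_t*)user_context;
      ++ctx->counter;
      return iree_ok_status();
    }

    // ... given an iree_task_scope_t* scope and storage for the call task:
    //   my_context_t ctx = {0};
    //   iree_task_call_t call_task;
    //   iree_task_call_initialize(
    //       scope, iree_task_make_call_closure(my_call_fn, &ctx), &call_task);
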
@@ -269,6 +273,16 @@
 
   // Function closure to call when the task is executed.
   iree_task_call_closure_t closure;
+
+  // Resulting status from the call available once all nested tasks have
+  // completed (or would have completed). It's possible for a call to nest
+  // additional work under it and then return a failure; to ensure we don't
+  // discard the root call while the nested tasks are still executing we set the
+  // status here and wait for the nested tasks to complete. We'll try not to
+  // issue work that was enqueued while the call was executing but it's possible
+  // for work to come from other angles and we need to err on the side of
+  // safety.
+  iree_atomic_intptr_t status;
 } iree_task_call_t;
 
 void iree_task_call_initialize(iree_task_scope_t* scope,
@@ -358,11 +372,14 @@
   iree_task_t header;
 
   // The external wait handle that the task is waiting on.
-  // TODO(benvanik): multiple wait handles.
+  // TODO(benvanik): null handle for sleep.
+  // TODO(benvanik): use a wait source with cached wait handle.
+  // TODO(benvanik): multiple wait handles (ptr owned by outer wrapper task).
   iree_wait_handle_t wait_handle;
 
   // TODO(benvanik): deadline_ns.
   // TODO(benvanik): condition (possibly a closure to evaluate) ala condvar.
+  // TODO(benvanik): whether a sleep.
 } iree_task_wait_t;
 
 void iree_task_wait_initialize(iree_task_scope_t* scope,
@@ -432,7 +449,7 @@
   // Contents are (today) undefined upon entry.
   iree_byte_span_t local_memory;
 
-  // Shared statistics counters for the dispatch slice.
+  // Shared statistics counters for the dispatch shard.
   iree_task_dispatch_statistics_t* statistics;
 
   // TODO(benvanik): cpuid uarch.
@@ -441,26 +458,12 @@
 
 typedef struct iree_task_dispatch_t iree_task_dispatch_t;
 
-// Shared state for all shards processing a dispatch.
-typedef iree_alignas(iree_max_align_t) struct {
-  // The tail tile index; the next reservation will start from here.
-  iree_atomic_int32_t tile_index;
-
-  // The total number of tiles in the dispatch bounding tile_index.
-  uint32_t tile_count;
-
-  // Maximum number of tiles to fetch per tile reservation from the grid.
-  // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
-  // reasonable number chosen based on the tile and shard counts.
-  uint32_t tiles_per_reservation;
-} iree_task_dispatch_shard_state_t;
-
 //==============================================================================
 // Dispatch function closures
 //==============================================================================
 
 typedef iree_status_t(IREE_API_PTR* iree_task_dispatch_closure_fn_t)(
-    uintptr_t user_context, const iree_task_tile_context_t* tile_context,
+    void* user_context, const iree_task_tile_context_t* tile_context,
     iree_task_submission_t* pending_submission);
 
 // A function closure representing the function to call and its arguments.
@@ -473,14 +476,14 @@
   // contain embedded values. No lifetime management is performed by the task
   // system and it is required that users ensure that the memory referenced is
   // live until after the task has completed.
-  uintptr_t user_context;
+  void* user_context;
 } iree_task_dispatch_closure_t;
 
 // Binds a function pointer and the arguments it should be called with.
 // If the arguments represent pointers they must remain live until the task
 // has completed execution.
 static inline iree_task_dispatch_closure_t iree_task_make_dispatch_closure(
-    iree_task_dispatch_closure_fn_t fn, uintptr_t user_context) {
+    iree_task_dispatch_closure_fn_t fn, void* user_context) {
   iree_task_dispatch_closure_t closure = {fn, user_context};
   return closure;
 }
@@ -490,10 +493,10 @@
 //==============================================================================
 
 // An execution request across a tiled grid.
-// Dispatches are fork points where zero or more dispatch slice tasks are
+// Dispatches are fork points where zero or more dispatch shard tasks are
 // spawned and processed prior to joining again on the dispatch completion task.
 //
-// The total workgroup count indicates the [x,y,z] extents of the dispatch grid.
+// The total workgroup count defines the [x,y,z] extents of the dispatch grid.
 // The count may either be embedded directly into the dispatch or provided as a
 // pointer to the workgroup_count[3] that will be read immediately prior to
 // forking. If any dimension of the workgroup count is zero then the dispatch is
@@ -501,11 +504,11 @@
 //
 // Example:
 //   dispatch([5, 1, 1])
-//     forked into slices based on affinity/scheduling parameters:
-//     -> dispatch_slice([0-1, 1, 1])
-//     -> dispatch_slice([2-3, 1, 1])
-//     -> dispatch_slice([4-5, 1, 1])
-//   completion_task run after all slices complete
+//     forked into shards based on affinity/scheduling parameters:
+//     -> dispatch_shard for core 0, processes [0-1, 1, 1]
+//     -> dispatch_shard for core 1, processes [2-3, 1, 1]
+//     -> dispatch_shard for core 2, processes [4, 1, 1]
+//   completion_task run after all shards complete
 typedef iree_alignas(iree_max_align_t) struct iree_task_dispatch_t {
   // Task header: implementation detail, do not use.
   iree_task_t header;
@@ -533,14 +536,33 @@
   // dispatch closure.
   uint32_t local_memory_size;
 
-  // Statistics storage used for aggregating counters across all slices.
+  // Resulting status from the dispatch available once all workgroups have
+  // completed (or would have completed). If multiple shards processing the
+  // workgroups hit an error the first will be taken and the result ignored. A
+  // dispatch with a non-ok status will mark the parent task scope as failing
+  // when it retires.
+  iree_atomic_intptr_t status;
+
+  // Statistics storage used for aggregating counters across all shards.
   iree_task_dispatch_statistics_t statistics;
 
-  // Shared state across all slices/shards/etc.
-  // Stored once per dispatch and then referenced by all subtasks.
-  union {
-    iree_task_dispatch_shard_state_t shard_state;
-  } shared;
+  // The total number of tiles in the dispatch bounding tile_index.
+  uint32_t tile_count;
+
+  // Maximum number of tiles to fetch per tile reservation from the grid.
+  // Bounded by IREE_TASK_DISPATCH_MAX_TILES_PER_SHARD_RESERVATION and a
+  // reasonable number chosen based on the tile and shard counts.
+  uint32_t tiles_per_reservation;
+
+  // The tail tile index; the next reservation will start from here.
+  // This is used by shards to slice off the work to perform in their inner
+  // loop. Ideally we'd have no destructive interference with other shared data
+  // in this structure but the shared parts (status/statistics) are updated once
+  // per shard instead of once per slice and are less of a concern.
+  iree_atomic_int32_t tile_index;
+
+  // Incrementing process-lifetime dispatch identifier.
+  IREE_TRACE(int64_t dispatch_id;)
 } iree_task_dispatch_t;
 
 void iree_task_dispatch_initialize(iree_task_scope_t* scope,
@@ -555,83 +577,6 @@
     iree_task_dispatch_t* out_task);
 
 //==============================================================================
-// IREE_TASK_TYPE_DISPATCH_SLICE
-//==============================================================================
-
-// TODO(benvanik): per-region dependencies (allow slices to execute directly
-// across dispatches).
-
-// A slice of tiles within a larger dispatch grid.
-// These tasks are designed to be synthesized by the task system when processing
-// a dispatch task. Based on the workgroup count, affinity settings, and
-// available executor threads zero or more slices are enqueued, executed, and
-// retired as part of the complete dispatch task. The dispatch is only
-// considered completed and subsquent tasks readied once all slices are
-// complete.
-//
-// Slices aggregate statistics from all tiles within them and then upon
-// completion merge those into the shared dispatch statistics. As slices may
-// suspend and resume several times the dispatch-level statistics should only be
-// read once all slices have completed fully.
-//
-// In general slices represent a contiguous range of tiles along the most
-// rapidly changing dimension (x, then y, then z). This ensures that we at least
-// give the opportunity for cache locality to the tiles as they are processed.
-// If work stealing is enabled then slices may shed their trailing tiles to
-// other threads that have completed all of their work (at a cost of power vs.
-// potential latency savings).
-typedef iree_alignas(iree_max_align_t) struct {
-  // Task header: implementation detail, do not use.
-  iree_task_t header;
-
-  // NOTE: the following fields are mostly replicated from iree_task_dispatch_t.
-  // This removes the need for touching the dispatch struct when beginning a
-  // tile which would likely be a cache miss as we fan out to other cores.
-
-  // Function closure to call per tile (same as the closure in the dispatch).
-  iree_task_dispatch_closure_t closure;
-
-  // Base workgroup ID for the slice range.
-  uint32_t workgroup_base[3];
-  // Total count of tiles within the slice range.
-  uint32_t workgroup_range[3];
-
-  // Workgroup size for each invocation.
-  uint32_t workgroup_size[3];
-  // Total workgroup count for the task. Can be used in conjunction with the
-  // per-invocation workgroup_xyz and workgroup_size to compute offsets/indices.
-  uint32_t workgroup_count[3];
-
-  // Optional transient shared memory size in bytes to allocate and pass into
-  // the iree_task_tile_context_t::local_memory of each invocation of the
-  // dispatch closure.
-  uint32_t local_memory_size;
-
-  // Shared statistics counters for the entire dispatch. References the storage
-  // held in the parent iree_task_dispatch_t.
-  iree_task_dispatch_statistics_t* dispatch_statistics;
-  // Statistics just for this single slice. The statistics will be added to the
-  // dispatch_statistics after the slice completes to prevent excessive
-  // contention on the shared dispatch statistics across multiple threads.
-  iree_task_dispatch_statistics_t slice_statistics;
-
-  // Per-tile initialized coroutine storage for all tiles in the range
-  // initialized as each tile begins execution.
-  // TODO(benvanik): coroutine storage as iree_task_tile_storage_t.
-} iree_task_dispatch_slice_t;
-
-// TODO(benvanik): document initialize() for slice pre-planning/embeddeding.
-// This would be useful to reduce latency when the number of slices is small
-// (~<5) as the dispatch wouldn't need to be issued. This can also be used to
-// implement per-region dependencies as direct slice->slice deps vs. fork/join
-// dispatch->dispatch deps. Show how IREE_TASK_FLAG_DISPATCH_RETIRE is set.
-void iree_task_dispatch_slice_initialize(iree_task_dispatch_t* dispatch_task,
-                                         const uint32_t workgroup_base[3],
-                                         const uint32_t workgroup_range[3],
-                                         const uint32_t workgroup_count[3],
-                                         iree_task_dispatch_slice_t* out_task);
-
-//==============================================================================
 // IREE_TASK_TYPE_DISPATCH_SHARD
 //==============================================================================
 
@@ -639,19 +584,12 @@
   // Task header: implementation detail, do not use.
   iree_task_t header;
 
-  // The root dispatch task that this shard is a part of.
-  iree_task_dispatch_t* dispatch_task;
-
-  // Active dispatch progress shared across all shards.
-  // Each shard will be read/modify/writing this and there's likely to be
-  // contention.
-  iree_task_dispatch_shard_state_t* shared_state;
+  // NOTE: the parent dispatch task this shard is applied to is in the
+  // header.completion_task field.
 } iree_task_dispatch_shard_t;
 
-void iree_task_dispatch_shard_initialize(
-    iree_task_dispatch_t* dispatch_task,
-    iree_task_dispatch_shard_state_t* shared_state,
-    iree_task_dispatch_shard_t* out_task);
+void iree_task_dispatch_shard_initialize(iree_task_dispatch_t* dispatch_task,
+                                         iree_task_dispatch_shard_t* out_task);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/iree/task/task_impl.h b/iree/task/task_impl.h
index f938d87..11639d9 100644
--- a/iree/task/task_impl.h
+++ b/iree/task/task_impl.h
@@ -34,9 +34,10 @@
 // Executes and retires a user call.
 // May block the caller for an indeterminate amount of time and should only be
 // called from threads owned by or donated to the executor.
-// Returns the status of the user call.
-iree_status_t iree_task_call_execute(
-    iree_task_call_t* task, iree_task_submission_t* pending_submission);
+//
+// Errors are propagated to the parent scope.
+void iree_task_call_execute(iree_task_call_t* task,
+                            iree_task_submission_t* pending_submission);
 
 //==============================================================================
 // IREE_TASK_TYPE_BARRIER
@@ -78,83 +79,44 @@
 // IREE_TASK_TYPE_DISPATCH
 //==============================================================================
 
-// Schedules a dispatch by forking out to zero or more slices that will be
-// executed on workers. The slices are allocated from an executor-owned pool
-// and are generally not user-visible - they'll just see their dispatch begin
-// execution prior to the slices and end execution after the last slice
-// finishes.
-//
-// Only called during coordination and expects the coordinator lock to be held.
-void iree_task_dispatch_issue_sliced(iree_task_dispatch_t* dispatch_task,
-                                     iree_task_pool_t* slice_task_pool,
-                                     iree_task_submission_t* pending_submission,
-                                     iree_task_post_batch_t* post_batch);
-
 // Schedules a dispatch by forking out to zero or more shards that will be
 // executed on workers. The shards are allocated from an executor-owned pool
 // and are generally not user-visible - they'll just see their dispatch begin
-// execution prior to the slices and end execution after the last shard
+// execution prior to the shards and end execution after the last shard
 // finishes.
 //
 // Only called during coordination and expects the coordinator lock to be held.
-void iree_task_dispatch_issue_sharded(
-    iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool,
-    iree_task_submission_t* pending_submission,
-    iree_task_post_batch_t* post_batch);
+void iree_task_dispatch_issue(iree_task_dispatch_t* dispatch_task,
+                              iree_task_pool_t* shard_task_pool,
+                              iree_task_submission_t* pending_submission,
+                              iree_task_post_batch_t* post_batch);
 
-// Retires a dispatch when all issued slices have completed executing.
+// Retires a dispatch when all issued shards have completed executing.
 //
 // Only called during coordination and expects the coordinator lock to be held.
 void iree_task_dispatch_retire(iree_task_dispatch_t* dispatch_task,
                                iree_task_submission_t* pending_submission);
 
 //==============================================================================
-// IREE_TASK_TYPE_DISPATCH_SLICE
-//==============================================================================
-
-// Allocates a dispatch slice task from the shared executor task pool.
-// The slice will be released back to the pool when it has completed execution.
-iree_task_dispatch_slice_t* iree_task_dispatch_slice_allocate(
-    iree_task_dispatch_t* dispatch_task, const uint32_t workgroup_base[3],
-    const uint32_t workgroup_range[3], const uint32_t workgroup_count[3],
-    iree_task_pool_t* slice_task_pool);
-
-// Executes and retires a dispatch slice task.
-// May block the caller for an indeterminate amount of time and should only be
-// called from threads owned by or donated to the executor.
-//
-// |local_memory| is a block of memory exclusively available to the slice
-// during execution. Contents are undefined both before and after execution.
-//
-// Returns ok if all tiles were successfully executed and otherwise returns
-// an unspecified status (probably the first non-ok status hit).
-iree_status_t iree_task_dispatch_slice_execute(
-    iree_task_dispatch_slice_t* task, iree_byte_span_t local_memory,
-    iree_task_submission_t* pending_submission);
-
-//==============================================================================
 // IREE_TASK_TYPE_DISPATCH_SHARD
 //==============================================================================
 
 // Allocates a dispatch shard task from the shared executor task pool.
 // The shard will be released back to the pool when it has completed execution.
 iree_task_dispatch_shard_t* iree_task_dispatch_shard_allocate(
-    iree_task_dispatch_t* dispatch_task,
-    iree_task_dispatch_shard_state_t* shared_state,
-    iree_task_pool_t* shard_task_pool);
+    iree_task_dispatch_t* dispatch_task, iree_task_pool_t* shard_task_pool);
 
 // Executes and retires a dispatch shard task.
 // May block the caller for an indeterminate amount of time and should only be
 // called from threads owned by or donated to the executor.
 //
-// |local_memory| is a block of memory exclusively available to the shard
+// |worker_local_memory| is a block of memory exclusively available to the shard
 // during execution. Contents are undefined both before and after execution.
 //
-// Returns ok if all tiles processed in the shard successfully executed and
-// otherwise returns an unspecified status (probably the first non-ok status
-// hit).
-iree_status_t iree_task_dispatch_shard_execute(
-    iree_task_dispatch_shard_t* task, iree_byte_span_t local_memory,
+// Errors are propagated to the parent scope and the dispatch will fail once
+// all shards have completed.
+void iree_task_dispatch_shard_execute(
+    iree_task_dispatch_shard_t* task, iree_byte_span_t worker_local_memory,
     iree_task_submission_t* pending_submission);
 
 #ifdef __cplusplus
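
For user code the contract documented in this header boils down to: a call closure reports failure by returning a non-OK status, the executor records that status on the task's parent scope instead of returning it from the execute functions, and dependent tasks are discarded. A minimal sketch, assuming an already-initialized `iree_task_scope_t scope` and eliding submission to an executor (the tests later in this diff show the full flow):

```c++
// Sketch only: `scope` is assumed to be an initialized iree_task_scope_t;
// submission/wait are elided (see task_test_call.cc below for the full flow).
iree_task_call_t task;
iree_task_call_initialize(
    &scope,
    iree_task_make_call_closure(
        [](void* user_context, iree_task_t* task,
           iree_task_submission_t* pending_submission) {
          // Returning a non-OK status fails the parent scope; tasks that
          // depend on this one are then discarded rather than executed.
          return iree_make_status(IREE_STATUS_DATA_LOSS, "bad input");
        },
        /*user_context=*/NULL),
    &task);
```

The same convention applies to dispatch tiles: per the `iree_task_dispatch_shard_execute` comment above, a tile that returns a non-OK status fails the dispatch once the shards already executing have completed.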
diff --git a/iree/task/task_test_barrier.cc b/iree/task/task_test_barrier.cc
index 40a1c04..dcb8937 100644
--- a/iree/task/task_test_barrier.cc
+++ b/iree/task/task_test_barrier.cc
@@ -16,6 +16,10 @@
 
 namespace {
 
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
 class TaskBarrierTest : public TaskTest {};
 
 enum {
@@ -30,16 +34,16 @@
   std::atomic<uint32_t> tasks_called = {0};
 };
 
-#define MAKE_CALL_TASK_CLOSURE(task_ctx, task_id)      \
-  iree_task_make_call_closure(                         \
-      [](uintptr_t user_context, iree_task_t* task,    \
-         iree_task_submission_t* pending_submission) { \
-        auto* ctx = (TaskCtx*)user_context;            \
-        EXPECT_EQ(0, (ctx->tasks_called & (task_id))); \
-        ctx->tasks_called |= (task_id);                \
-        return iree_ok_status();                       \
-      },                                               \
-      (uintptr_t)task_ctx)
+#define MAKE_CALL_TASK_CLOSURE(task_ctx, task_id, status_code) \
+  iree_task_make_call_closure(                                 \
+      [](void* user_context, iree_task_t* task,                \
+         iree_task_submission_t* pending_submission) {         \
+        auto* ctx = (TaskCtx*)user_context;                    \
+        EXPECT_EQ(0, (ctx->tasks_called & (task_id)));         \
+        ctx->tasks_called |= (task_id);                        \
+        return iree_status_from_code(status_code);             \
+      },                                                       \
+      (void*)task_ctx)
 
 // Issues a standalone empty barrier:
 //  { barrier }
@@ -52,15 +56,17 @@
 
 // Issues a serialized sequence:
 //  { a | barrier | b }
-TEST_F(TaskBarrierTest, IssueSerializedSequence) {
+TEST_F(TaskBarrierTest, IssueSequence) {
   TaskCtx task_ctx;
 
   iree_task_call_t task_a;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A),
-                            &task_a);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
   iree_task_call_t task_b;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B),
-                            &task_b);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
 
   iree_task_t* dependent_tasks[1] = {&task_b.header};
   iree_task_barrier_t barrier_task;
@@ -72,23 +78,91 @@
   EXPECT_EQ(TASK_A | TASK_B, task_ctx.tasks_called);
 }
 
+// Issues a serialized sequence where task A fails:
+//  { a | barrier | b }
+// B should not be run.
+TEST_F(TaskBarrierTest, IssueSequenceFailure) {
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_DATA_LOSS),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+
+  iree_task_t* dependent_tasks[1] = {&task_b.header};
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_b.header));
+  EXPECT_EQ(TASK_A, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+// Issues a deeply serialized sequence where task A fails:
+//  { a | barrier | b | barrier | c }
+// B and C should not be run.
+TEST_F(TaskBarrierTest, IssueDeepSequenceFailure) {
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_DATA_LOSS),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+
+  iree_task_t* dependent_tasks_0[1] = {&task_b.header};
+  iree_task_barrier_t barrier_task_0;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks_0),
+                               dependent_tasks_0, &barrier_task_0);
+  iree_task_set_completion_task(&task_a.header, &barrier_task_0.header);
+
+  iree_task_t* dependent_tasks_1[1] = {&task_c.header};
+  iree_task_barrier_t barrier_task_1;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks_1),
+                               dependent_tasks_1, &barrier_task_1);
+  iree_task_set_completion_task(&task_b.header, &barrier_task_1.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_c.header));
+  EXPECT_EQ(TASK_A, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
 // Issues a join:
 //  { a, b, c | barrier | d }
 TEST_F(TaskBarrierTest, IssueJoin) {
   TaskCtx task_ctx;
 
   iree_task_call_t task_a;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A),
-                            &task_a);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
   iree_task_call_t task_b;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B),
-                            &task_b);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
   iree_task_call_t task_c;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C),
-                            &task_c);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
   iree_task_call_t task_d;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D),
-                            &task_d);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
 
   iree_task_t* dependent_tasks[1] = {&task_d.header};
   iree_task_barrier_t barrier_task;
@@ -107,23 +181,69 @@
   EXPECT_EQ(TASK_A | TASK_B | TASK_C | TASK_D, task_ctx.tasks_called);
 }
 
+// Issues a join where a dependent task B fails:
+//  { a, b, c | barrier | d }
+// A, B, and C should all run but the barrier should fail and D should not.
+TEST_F(TaskBarrierTest, IssueJoinFailure) {
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_DATA_LOSS),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+  iree_task_call_t task_d;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
+
+  iree_task_t* dependent_tasks[1] = {&task_d.header};
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+  iree_task_set_completion_task(&task_b.header, &barrier_task.header);
+  iree_task_set_completion_task(&task_c.header, &barrier_task.header);
+
+  iree_task_submission_t submission;
+  iree_task_submission_initialize(&submission);
+  iree_task_submission_enqueue(&submission, &task_a.header);
+  iree_task_submission_enqueue(&submission, &task_b.header);
+  iree_task_submission_enqueue(&submission, &task_c.header);
+  IREE_ASSERT_OK(SubmitAndWaitIdle(&submission, &task_d.header));
+  EXPECT_EQ(TASK_A | TASK_B | TASK_C, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
 // Issues a fork:
 //  { a | barrier | b, c, d | nop }
 TEST_F(TaskBarrierTest, IssueFork) {
   TaskCtx task_ctx;
 
   iree_task_call_t task_a;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A),
-                            &task_a);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_OK),
+      &task_a);
   iree_task_call_t task_b;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B),
-                            &task_b);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
   iree_task_call_t task_c;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C),
-                            &task_c);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
   iree_task_call_t task_d;
-  iree_task_call_initialize(&scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D),
-                            &task_d);
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
 
   iree_task_t* dependent_tasks[3] = {
       &task_b.header,
@@ -146,4 +266,50 @@
   EXPECT_EQ(TASK_A | TASK_B | TASK_C | TASK_D, task_ctx.tasks_called);
 }
 
+// Issues a fork where task A fails:
+//  { a (fails) | barrier | b, c, d | nop }
+// The barrier should fail and none of the subsequent tasks B, C, D should run.
+TEST_F(TaskBarrierTest, IssueForkFailure) {
+  TaskCtx task_ctx;
+
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_A, IREE_STATUS_DATA_LOSS),
+      &task_a);
+  iree_task_call_t task_b;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_B, IREE_STATUS_OK),
+      &task_b);
+  iree_task_call_t task_c;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_C, IREE_STATUS_OK),
+      &task_c);
+  iree_task_call_t task_d;
+  iree_task_call_initialize(
+      &scope_, MAKE_CALL_TASK_CLOSURE(&task_ctx, TASK_D, IREE_STATUS_OK),
+      &task_d);
+
+  iree_task_t* dependent_tasks[3] = {
+      &task_b.header,
+      &task_c.header,
+      &task_d.header,
+  };
+  iree_task_barrier_t barrier_task;
+  iree_task_barrier_initialize(&scope_, IREE_ARRAYSIZE(dependent_tasks),
+                               dependent_tasks, &barrier_task);
+  iree_task_set_completion_task(&task_a.header, &barrier_task.header);
+
+  // Just to give us a tail task to wait on.
+  iree_task_nop_t nop_task;
+  iree_task_nop_initialize(&scope_, &nop_task);
+  iree_task_set_completion_task(&task_b.header, &nop_task.header);
+  iree_task_set_completion_task(&task_c.header, &nop_task.header);
+  iree_task_set_completion_task(&task_d.header, &nop_task.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &nop_task.header));
+  EXPECT_EQ(TASK_A, task_ctx.tasks_called);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
 }  // namespace
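
Every failure test above ends with the same retrieval step: once the submission has gone idle, the error a task produced is read back from the scope. A condensed sketch of that consumer side, assuming `scope` is the same `iree_task_scope_t` the tasks were initialized against and taking "consume" to mean ownership of the status transfers to the caller:

```c++
// After the scope has gone idle, check whether any task in it failed.
iree_status_t status = iree_task_scope_consume_status(&scope);
if (!iree_status_is_ok(status)) {
  // A failure was recorded on the scope and dependent work was discarded;
  // log it here, or hand it off to the caller instead of ignoring it.
  iree_status_fprint(stderr, status);
  iree_status_ignore(status);
}
```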
diff --git a/iree/task/task_test_call.cc b/iree/task/task_test_call.cc
index 05b18fb..014c3ca 100644
--- a/iree/task/task_test_call.cc
+++ b/iree/task/task_test_call.cc
@@ -17,8 +17,13 @@
 
 namespace {
 
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
 class TaskCallTest : public TaskTest {};
 
+// Tests issuing a single call and waiting for it to complete.
 TEST_F(TaskCallTest, Issue) {
   struct TestCtx {
     int did_call = 0;
@@ -28,7 +33,7 @@
   iree_task_call_t task;
   iree_task_call_initialize(&scope_,
                             iree_task_make_call_closure(
-                                [](uintptr_t user_context, iree_task_t* task,
+                                [](void* user_context, iree_task_t* task,
                                    iree_task_submission_t* pending_submission) {
                                   auto* ctx = (TestCtx*)user_context;
                                   EXPECT_TRUE(NULL != ctx);
@@ -36,10 +41,127 @@
                                   ++ctx->did_call;
                                   return iree_ok_status();
                                 },
-                                (uintptr_t)&ctx),
+                                (void*)&ctx),
                             &task);
   IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
   EXPECT_EQ(1, ctx.did_call);
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+}
+
+// Tests issuing a single call that returns a failure.
+// The failure should be propagated to the task scope.
+TEST_F(TaskCallTest, IssueFailure) {
+  struct TestCtx {
+    int did_call = 0;
+  };
+  TestCtx ctx;
+
+  // Call successfully issues but fails with some user error.
+  iree_task_call_t task;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  auto* ctx = (TestCtx*)user_context;
+                                  EXPECT_TRUE(NULL != ctx);
+                                  EXPECT_EQ(0, ctx->did_call);
+                                  ++ctx->did_call;
+                                  return iree_make_status(
+                                      IREE_STATUS_UNAUTHENTICATED, "whoops!");
+                                },
+                                (void*)&ctx),
+                            &task);
+
+  // The task should still be cleaned up, even if it fails.
+  static int did_cleanup = 0;
+  did_cleanup = 0;
+  iree_task_set_cleanup_fn(
+      &task.header, +[](iree_task_t* task, iree_status_code_t status_code) {
+        EXPECT_EQ(status_code, IREE_STATUS_ABORTED);
+        ++did_cleanup;
+      });
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+
+  // Expect both the call to have been made and the task cleaned up.
+  // The scope has the failure status.
+  EXPECT_EQ(1, ctx.did_call);
+  EXPECT_EQ(1, did_cleanup);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kUnauthenticated));
+}
+
+// Tests issuing chained calls where the first fails.
+// The failure should be propagated to the task scope and the chained call
+// should be aborted.
+TEST_F(TaskCallTest, IssueFailureChained) {
+  struct TestCtx {
+    int did_call_a = 0;
+    int did_call_b = 0;
+  };
+  TestCtx ctx;
+
+  // First call that will fail.
+  iree_task_call_t task_a;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  auto* ctx = (TestCtx*)user_context;
+                                  EXPECT_TRUE(NULL != ctx);
+                                  EXPECT_EQ(0, ctx->did_call_a);
+                                  ++ctx->did_call_a;
+                                  // Force a failure.
+                                  return iree_make_status(
+                                      IREE_STATUS_UNAUTHENTICATED, "whoops!");
+                                },
+                                (void*)&ctx),
+                            &task_a);
+  static int did_cleanup_a = 0;
+  did_cleanup_a = 0;
+  iree_task_set_cleanup_fn(
+      &task_a.header, +[](iree_task_t* task, iree_status_code_t status_code) {
+        // Expect that the cleanup gets a signal indicating the task failed.
+        EXPECT_EQ(status_code, IREE_STATUS_ABORTED);
+        ++did_cleanup_a;
+      });
+
+  // Second call that will be aborted after the first fails.
+  iree_task_call_t task_b;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  // This should never get called!
+                                  auto* ctx = (TestCtx*)user_context;
+                                  EXPECT_TRUE(NULL != ctx);
+                                  EXPECT_EQ(0, ctx->did_call_b);
+                                  ++ctx->did_call_b;
+                                  return iree_ok_status();
+                                },
+                                (void*)&ctx),
+                            &task_b);
+  static int did_cleanup_b = 0;
+  did_cleanup_b = 0;
+  iree_task_set_cleanup_fn(
+      &task_b.header, +[](iree_task_t* task, iree_status_code_t status_code) {
+        // Expect that the cleanup gets a signal indicating the task failed.
+        EXPECT_EQ(status_code, IREE_STATUS_ABORTED);
+        ++did_cleanup_b;
+      });
+
+  // A -> B
+  iree_task_set_completion_task(&task_a.header, &task_b.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_b.header));
+
+  // Expect that A was called but B was not, and both were cleaned up.
+  EXPECT_EQ(1, ctx.did_call_a);
+  EXPECT_EQ(1, did_cleanup_a);
+  EXPECT_EQ(0, ctx.did_call_b);
+  EXPECT_EQ(1, did_cleanup_b);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kUnauthenticated));
 }
 
 // Issues task_a which then issues a nested task_b and waits for it to complete
@@ -63,7 +185,7 @@
   iree_task_call_initialize(
       &scope_,
       iree_task_make_call_closure(
-          [](uintptr_t user_context, iree_task_t* task,
+          [](void* user_context, iree_task_t* task,
              iree_task_submission_t* pending_submission) {
             auto* ctx = (TestCtx*)user_context;
             EXPECT_TRUE(NULL != ctx);
@@ -75,7 +197,7 @@
               iree_task_call_initialize(
                   task->scope,
                   iree_task_make_call_closure(
-                      [](uintptr_t user_context, iree_task_t* task,
+                      [](void* user_context, iree_task_t* task,
                          iree_task_submission_t* pending_submission) {
                         auto* ctx = (TestCtx*)user_context;
                         EXPECT_TRUE(NULL != ctx);
@@ -95,11 +217,96 @@
 
             return iree_ok_status();
           },
-          (uintptr_t)&ctx),
+          (void*)&ctx),
       &task_a);
   IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_a.header));
   EXPECT_EQ(2, ctx.did_call_a);
   EXPECT_EQ(1, ctx.did_call_b);
+  IREE_EXPECT_OK(iree_task_scope_consume_status(&scope_));
+}
+
+// Issues task_a which then issues nested task_b and task_c; task_b fails but
+// task_c is still expected to complete before task_a is failed.
+// Sibling tasks don't abort each other, so C is guaranteed to run:
+//   A -> [B fail, C ok] -> A fail
+TEST_F(TaskCallTest, IssueNestedFailure) {
+  struct TestCtx {
+    std::atomic<int> did_call_a = {0};
+    std::atomic<int> did_call_b = {0};
+    std::atomic<int> did_call_c = {0};
+    std::atomic<bool> has_issued = {false};
+    iree_task_call_t task_b;
+    iree_task_call_t task_c;
+  };
+  TestCtx ctx;
+
+  // task_a will only be called once due to the error: the initial call
+  // schedules task_b/task_c but the follow-up call that would normally run
+  // after they complete is never made because task_b fails.
+  iree_task_call_t task_a;
+  iree_task_call_initialize(
+      &scope_,
+      iree_task_make_call_closure(
+          [](void* user_context, iree_task_t* task,
+             iree_task_submission_t* pending_submission) {
+            auto* ctx = (TestCtx*)user_context;
+            EXPECT_TRUE(NULL != ctx);
+
+            if (!ctx->has_issued) {
+              ctx->has_issued = true;
+              EXPECT_EQ(0, ctx->did_call_a);
+              ++ctx->did_call_a;
+
+              // task_b: (fails)
+              iree_task_call_initialize(
+                  task->scope,
+                  iree_task_make_call_closure(
+                      [](void* user_context, iree_task_t* task,
+                         iree_task_submission_t* pending_submission) {
+                        auto* ctx = (TestCtx*)user_context;
+                        EXPECT_TRUE(NULL != ctx);
+                        EXPECT_EQ(0, ctx->did_call_b);
+                        ++ctx->did_call_b;
+                        return iree_make_status(IREE_STATUS_DATA_LOSS, "uh oh");
+                      },
+                      user_context),
+                  &ctx->task_b);
+              iree_task_set_completion_task(&ctx->task_b.header, task);
+              iree_task_submission_enqueue(pending_submission,
+                                           &ctx->task_b.header);
+
+              // task_c: (ok)
+              iree_task_call_initialize(
+                  task->scope,
+                  iree_task_make_call_closure(
+                      [](void* user_context, iree_task_t* task,
+                         iree_task_submission_t* pending_submission) {
+                        auto* ctx = (TestCtx*)user_context;
+                        EXPECT_TRUE(NULL != ctx);
+                        EXPECT_EQ(0, ctx->did_call_c);
+                        ++ctx->did_call_c;
+                        return iree_ok_status();
+                      },
+                      user_context),
+                  &ctx->task_c);
+              iree_task_set_completion_task(&ctx->task_c.header, task);
+              iree_task_submission_enqueue(pending_submission,
+                                           &ctx->task_c.header);
+            } else {
+              EXPECT_EQ(1, ctx->did_call_a);
+              ++ctx->did_call_a;
+            }
+
+            return iree_ok_status();
+          },
+          (void*)&ctx),
+      &task_a);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_a.header));
+  EXPECT_EQ(1, ctx.did_call_a);
+  EXPECT_EQ(1, ctx.did_call_b);
+  EXPECT_EQ(1, ctx.did_call_c);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
 }
 
 }  // namespace
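
The cleanup hook exercised above is the other half of the failure story: `iree_task_set_cleanup_fn` attaches a callback that runs whether or not the closure ever did, and on the failure path it receives `IREE_STATUS_ABORTED` both for the task that failed and for tasks discarded because an earlier task in the scope failed. A small sketch, assuming `task` is an `iree_task_call_t` set up as in the tests above (the status code passed on the success path is not exercised by these tests, so it is not relied upon here):

```c++
// Runs even when the closure never does; IREE_STATUS_ABORTED indicates the
// task did not complete normally (it failed or was discarded).
iree_task_set_cleanup_fn(
    &task.header, +[](iree_task_t* task, iree_status_code_t status_code) {
      if (status_code == IREE_STATUS_ABORTED) {
        // Release any resources the closure would otherwise have handed off.
      }
    });
```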
diff --git a/iree/task/task_test_dispatch.cc b/iree/task/task_test_dispatch.cc
index e8e7dee..19ff45c 100644
--- a/iree/task/task_test_dispatch.cc
+++ b/iree/task/task_test_dispatch.cc
@@ -17,6 +17,10 @@
 
 namespace {
 
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
 class GridCoverage {
  public:
   explicit GridCoverage(const uint32_t workgroup_count[3])
@@ -39,7 +43,7 @@
     return true;
   }
 
-  static iree_status_t Tile(uintptr_t user_context,
+  static iree_status_t Tile(void* user_context,
                             const iree_task_tile_context_t* tile_context,
                             iree_task_submission_t* pending_submission) {
     GridCoverage* coverage = reinterpret_cast<GridCoverage*>(user_context);
@@ -70,66 +74,38 @@
                              uint32_t dispatch_flags) {
     GridCoverage coverage(workgroup_count);
     iree_task_dispatch_t task;
-    iree_task_dispatch_initialize(&scope_,
-                                  iree_task_make_dispatch_closure(
-                                      GridCoverage::Tile, (uintptr_t)&coverage),
-                                  workgroup_size, workgroup_count, &task);
+    iree_task_dispatch_initialize(
+        &scope_,
+        iree_task_make_dispatch_closure(GridCoverage::Tile, (void*)&coverage),
+        workgroup_size, workgroup_count, &task);
     task.header.flags |= dispatch_flags;
     IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
     EXPECT_TRUE(coverage.Verify());
   }
 };
 
-TEST_F(TaskDispatchTest, Issue000Sharded) {
+TEST_F(TaskDispatchTest, Issue000) {
   const uint32_t kWorkgroupSize[3] = {1, 1, 1};
   const uint32_t kWorkgroupCount[3] = {0, 0, 0};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
 }
 
-TEST_F(TaskDispatchTest, Issue000Sliced) {
-  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
-  const uint32_t kWorkgroupCount[3] = {0, 0, 0};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
-                        IREE_TASK_FLAG_DISPATCH_SLICED);
-}
-
-TEST_F(TaskDispatchTest, Issue120Sharded) {
+TEST_F(TaskDispatchTest, Issue120) {
   const uint32_t kWorkgroupSize[3] = {1, 1, 1};
   const uint32_t kWorkgroupCount[3] = {1, 2, 0};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
 }
 
-TEST_F(TaskDispatchTest, Issue120Sliced) {
-  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
-  const uint32_t kWorkgroupCount[3] = {1, 2, 0};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
-                        IREE_TASK_FLAG_DISPATCH_SLICED);
-}
-
-TEST_F(TaskDispatchTest, Issue111Sharded) {
+TEST_F(TaskDispatchTest, Issue111) {
   const uint32_t kWorkgroupSize[3] = {1, 1, 1};
   const uint32_t kWorkgroupCount[3] = {1, 1, 1};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
 }
 
-TEST_F(TaskDispatchTest, Issue111Sliced) {
-  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
-  const uint32_t kWorkgroupCount[3] = {1, 1, 1};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
-                        IREE_TASK_FLAG_DISPATCH_SLICED);
-}
-
-TEST_F(TaskDispatchTest, Issue345Sharded) {
+TEST_F(TaskDispatchTest, Issue345) {
   const uint32_t kWorkgroupSize[3] = {1, 1, 1};
   const uint32_t kWorkgroupCount[3] = {3, 4, 5};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, 0);
-}
-
-TEST_F(TaskDispatchTest, Issue345Sliced) {
-  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
-  const uint32_t kWorkgroupCount[3] = {3, 4, 5};
-  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount,
-                        IREE_TASK_FLAG_DISPATCH_SLICED);
+  DispatchAndVerifyGrid(kWorkgroupSize, kWorkgroupCount, IREE_TASK_FLAG_NONE);
 }
 
 TEST_F(TaskDispatchTest, IssueIndirect) {
@@ -142,7 +118,7 @@
   iree_task_call_initialize(
       &scope_,
       iree_task_make_call_closure(
-          [](uintptr_t user_context, iree_task_t* task,
+          [](void* user_context, iree_task_t* task,
              iree_task_submission_t* pending_submission) {
             uint32_t* indirect_workgroup_count_ptr = (uint32_t*)user_context;
             for (size_t i = 0; i < IREE_ARRAYSIZE(kWorkgroupCount); ++i) {
@@ -150,13 +126,13 @@
             }
             return iree_ok_status();
           },
-          (uintptr_t)indirect_workgroup_count),
+          (void*)indirect_workgroup_count),
       &calculate_task);
 
   iree_task_dispatch_t dispatch_task;
   iree_task_dispatch_initialize_indirect(
       &scope_,
-      iree_task_make_dispatch_closure(GridCoverage::Tile, (uintptr_t)&coverage),
+      iree_task_make_dispatch_closure(GridCoverage::Tile, (void*)&coverage),
       kWorkgroupSize, indirect_workgroup_count, &dispatch_task);
   iree_task_set_completion_task(&calculate_task.header, &dispatch_task.header);
 
@@ -165,4 +141,63 @@
   EXPECT_TRUE(coverage.Verify());
 }
 
+TEST_F(TaskDispatchTest, IssueFailure) {
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {64, 1, 1};
+
+  auto tile = [](void* user_context,
+                 const iree_task_tile_context_t* tile_context,
+                 iree_task_submission_t* pending_submission) -> iree_status_t {
+    return tile_context->workgroup_xyz[0] == 32
+               ? iree_make_status(IREE_STATUS_DATA_LOSS, "whoops!")
+               : iree_ok_status();
+  };
+
+  iree_task_dispatch_t task;
+  iree_task_dispatch_initialize(&scope_,
+                                iree_task_make_dispatch_closure(tile, NULL),
+                                kWorkgroupSize, kWorkgroupCount, &task);
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task.header, &task.header));
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
+TEST_F(TaskDispatchTest, IssueFailureChained) {
+  const uint32_t kWorkgroupSize[3] = {1, 1, 1};
+  const uint32_t kWorkgroupCount[3] = {64, 1, 1};
+
+  auto tile = [](void* user_context,
+                 const iree_task_tile_context_t* tile_context,
+                 iree_task_submission_t* pending_submission) -> iree_status_t {
+    return tile_context->workgroup_xyz[0] == 32
+               ? iree_make_status(IREE_STATUS_DATA_LOSS, "whoops!")
+               : iree_ok_status();
+  };
+
+  iree_task_dispatch_t dispatch_task;
+  iree_task_dispatch_initialize(
+      &scope_, iree_task_make_dispatch_closure(tile, NULL), kWorkgroupSize,
+      kWorkgroupCount, &dispatch_task);
+
+  int did_call = 0;
+  iree_task_call_t call_task;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  int* did_call_ptr = (int*)user_context;
+                                  ++(*did_call_ptr);
+                                  return iree_ok_status();
+                                },
+                                &did_call),
+                            &call_task);
+  iree_task_set_completion_task(&dispatch_task.header, &call_task.header);
+
+  IREE_ASSERT_OK(
+      SubmitTasksAndWaitIdle(&dispatch_task.header, &call_task.header));
+  EXPECT_EQ(0, did_call);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
 }  // namespace
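
Dispatch failures originate in the per-tile closure, which now takes a `void*` user context like the call closure does. A hedged sketch of a standalone tile function and its hookup; the `rows` buffer and grid constants are illustrative, and only `workgroup_xyz` on the tile context is relied upon, as in the tests above:

```c++
// A standalone tile closure: writes one value per workgroup along X and
// demonstrates per-tile failure on a NULL buffer. A failing tile fails the
// dispatch (and its scope) once in-flight shards drain, so chained tasks are
// never run.
static iree_status_t MyTile(void* user_context,
                            const iree_task_tile_context_t* tile_context,
                            iree_task_submission_t* pending_submission) {
  float* rows = (float*)user_context;
  if (!rows) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "missing buffer");
  }
  rows[tile_context->workgroup_xyz[0]] = 1.0f;
  return iree_ok_status();
}

// Hookup, inside the issuing function (scope/submission as in the tests):
//   const uint32_t kWorkgroupSize[3] = {1, 1, 1};
//   const uint32_t kWorkgroupCount[3] = {64, 1, 1};
//   static float rows[64];
//   iree_task_dispatch_t dispatch_task;
//   iree_task_dispatch_initialize(
//       &scope_, iree_task_make_dispatch_closure(MyTile, (void*)rows),
//       kWorkgroupSize, kWorkgroupCount, &dispatch_task);
```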
diff --git a/iree/task/task_test_fence.cc b/iree/task/task_test_fence.cc
index 95ddea4..6626378 100644
--- a/iree/task/task_test_fence.cc
+++ b/iree/task/task_test_fence.cc
@@ -11,8 +11,13 @@
 
 namespace {
 
+using iree::Status;
+using iree::StatusCode;
+using iree::testing::status::StatusIs;
+
 class TaskFenceTest : public TaskTest {};
 
+// Tests a chain of fences A -> B -> C.
 TEST_F(TaskFenceTest, IssueChained) {
   iree_task_fence_t task_a;
   iree_task_fence_initialize(&scope_, &task_a);
@@ -28,4 +33,46 @@
   IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_c.header));
 }
 
+// Tests that failures propagate through fences; task B should not be called.
+// A fails -> fence -> B
+TEST_F(TaskFenceTest, IssueChainedFailure) {
+  int did_call_a = 0;
+  iree_task_call_t task_a;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  int* did_call_ptr = (int*)user_context;
+                                  ++(*did_call_ptr);
+                                  return iree_make_status(IREE_STATUS_DATA_LOSS,
+                                                          "whoops!");
+                                },
+                                &did_call_a),
+                            &task_a);
+
+  iree_task_fence_t fence_task;
+  iree_task_fence_initialize(&scope_, &fence_task);
+  iree_task_set_completion_task(&task_a.header, &fence_task.header);
+
+  int did_call_b = 0;
+  iree_task_call_t task_b;
+  iree_task_call_initialize(&scope_,
+                            iree_task_make_call_closure(
+                                [](void* user_context, iree_task_t* task,
+                                   iree_task_submission_t* pending_submission) {
+                                  int* did_call_ptr = (int*)user_context;
+                                  ++(*did_call_ptr);
+                                  return iree_ok_status();
+                                },
+                                &did_call_b),
+                            &task_b);
+  iree_task_set_completion_task(&fence_task.header, &task_b.header);
+
+  IREE_ASSERT_OK(SubmitTasksAndWaitIdle(&task_a.header, &task_b.header));
+  EXPECT_EQ(1, did_call_a);
+  EXPECT_EQ(0, did_call_b);
+  EXPECT_THAT(Status(iree_task_scope_consume_status(&scope_)),
+              StatusIs(StatusCode::kDataLoss));
+}
+
 }  // namespace
diff --git a/iree/task/tuning.h b/iree/task/tuning.h
index b1c1c21..015e567 100644
--- a/iree/task/tuning.h
+++ b/iree/task/tuning.h
@@ -17,21 +17,15 @@
 // only <64 will ever be used (such as for devices with 2 cores).
 #define IREE_TASK_EXECUTOR_MAX_WORKER_COUNT (64)
 
-// Initial number of slice tasks that are allocated in the executor pool.
-// Increasing this number will decrease initial allocation storms in cases of
-// extremely wide fan-out (many dispatches with many thousands of slices) at the
-// cost of a higher minimum memory consumption.
-//
-// Set to zero if sliced dispatches will not be used or will be allocated by the
-// caller to avoid fixed overhead associated with the internal executor pool.
-#define IREE_TASK_EXECUTOR_INITIAL_SLICE_RESERVATION_PER_WORKER (0)
-
 // Initial number of shard tasks that are allocated in the executor pool.
 // Increasing this number will decrease initial allocation storms in cases of
 // extremely wide concurrency regions (many dispatches running at the same time)
 // at the cost of a higher minimum memory consumption.
 #define IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER (4)
 
+// Maximum number of events retained by the executor event pool.
+#define IREE_TASK_EXECUTOR_EVENT_POOL_CAPACITY 64
+
 // Maximum number of simultaneous waits an executor may perform as part of a
 // wait-any operation. A larger value may enable better wake coalescing by the
 // kernel. This is only a count limiting wait tasks that have been scheduled and
@@ -78,24 +72,6 @@
 #define IREE_TASK_EXECUTOR_MAX_THEFT_TASK_COUNT \
   IREE_TASK_EXECUTOR_MAX_WORKER_COUNT
 
-// Number of tiles that will be batched into a single slice along each XYZ dim.
-//
-// Larger numbers reduce overhead and ensure that more tiles are executed
-// locally on the same worker (== shared caches) while also increasing potential
-// latency as work-stealing (always per-slice) is less effective.
-//
-// Numbers >1 on Y and Z can be used to cluster tiles together within the same
-// slice when spatial coherence outside of the X dimension is useful.
-//
-// The current usage of this is provisional; we may do all of this from the
-// compiler and want this behavior to be relatively fixed so that we can predict
-// it better. The only thing we want to be introducing here is flexibility for
-// when worker topology differs at runtime from what is knowable during offline
-// compilation.
-#define IREE_TASK_DISPATCH_TILES_PER_SLICE_X (8)
-#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Y (1)
-#define IREE_TASK_DISPATCH_TILES_PER_SLICE_Z (1)
-
 // Number of tiles that will be batched into a single reservation from the grid.
 // This is a maximum; if there are fewer tiles that would otherwise allow for
 // maximum parallelism then this may be ignored.
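
With the slice constants removed, the only up-front dispatch reservation left is the per-worker shard pool. A rough illustration of what the defaults imply, assuming the reservation scales linearly with worker count as the comment above describes and that the pool grows on demand beyond the initial reservation (this is arithmetic, not an IREE API):

```c++
// With the defaults above, an executor with 8 workers starts with roughly
// 8 * 4 = 32 pre-reserved shard tasks and retains at most
// IREE_TASK_EXECUTOR_EVENT_POOL_CAPACITY (64) pooled events.
enum {
  kExampleWorkerCount = 8,
  kExampleInitialShardTasks =
      kExampleWorkerCount *
      IREE_TASK_EXECUTOR_INITIAL_SHARD_RESERVATION_PER_WORKER,  // 32
};
```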
diff --git a/iree/task/worker.c b/iree/task/worker.c
index 3f22467..ee64bbe 100644
--- a/iree/task/worker.c
+++ b/iree/task/worker.c
@@ -167,7 +167,7 @@
 // Executes a task on a worker.
 // Only task types that are scheduled to workers are handled; all others must be
 // handled by the coordinator during scheduling.
-static iree_status_t iree_task_worker_execute(
+static void iree_task_worker_execute(
     iree_task_worker_t* worker, iree_task_t* task,
     iree_task_submission_t* pending_submission) {
   // Execute the task and resolve the task and gather any tasks that are now
@@ -180,31 +180,22 @@
   // TODO(benvanik): handle partial tasks and re-queuing.
   switch (task->type) {
     case IREE_TASK_TYPE_CALL: {
-      IREE_RETURN_IF_ERROR(
-          iree_task_call_execute((iree_task_call_t*)task, pending_submission));
-      break;
-    }
-    case IREE_TASK_TYPE_DISPATCH_SLICE: {
-      IREE_RETURN_IF_ERROR(iree_task_dispatch_slice_execute(
-          (iree_task_dispatch_slice_t*)task, worker->local_memory,
-          pending_submission));
+      iree_task_call_execute((iree_task_call_t*)task, pending_submission);
       break;
     }
     case IREE_TASK_TYPE_DISPATCH_SHARD: {
-      IREE_RETURN_IF_ERROR(iree_task_dispatch_shard_execute(
-          (iree_task_dispatch_shard_t*)task, worker->local_memory,
-          pending_submission));
+      iree_task_dispatch_shard_execute((iree_task_dispatch_shard_t*)task,
+                                       worker->local_memory,
+                                       pending_submission);
       break;
     }
     default:
-      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                              "incorrect task type for worker execution");
+      IREE_ASSERT_UNREACHABLE("incorrect task type for worker execution");
+      break;
   }
 
-  // NOTE: task is invalidated here!
+  // NOTE: task is invalidated above and must not be used!
   task = NULL;
-
-  return iree_ok_status();
 }
 
 // Pumps the worker thread once, processing a single task.
@@ -252,22 +243,7 @@
 
   // Execute the task (may call out to arbitrary user code and may submit more
   // tasks for execution).
-  iree_status_t status =
-      iree_task_worker_execute(worker, task, pending_submission);
-
-  // TODO(#4026): propagate failure to task scope.
-  // We currently drop the error on the floor here; that's because the error
-  // should have already been propagated to the scope and everyone should be
-  // checking that before running things anyway.
-  //
-  // Since we can host work from multiple scopes and want to ensure an error
-  // in one doesn't bring down the whole system we pretend we executed
-  // something here by falling through.
-  if (!iree_status_is_ok(status)) {
-    iree_status_fprint(stderr, status);
-  }
-  IREE_ASSERT_TRUE(iree_status_is_ok(status));
-  iree_status_ignore(status);
+  iree_task_worker_execute(worker, task, pending_submission);
 
   IREE_TRACE_ZONE_END(z0);
   return true;  // try again
diff --git a/iree/task/worker.h b/iree/task/worker.h
index 9342727..1cc2d0f 100644
--- a/iree/task/worker.h
+++ b/iree/task/worker.h
@@ -58,8 +58,8 @@
 // not yet correctly) selected; see the 'LAYOUT' comments below.
 typedef struct iree_task_worker_t {
   // A LIFO mailbox used by coordinators to post tasks to this worker.
-  // As workers self-nominate to be coordinators and fan out dispatch slices
-  // they can directly emplace those slices into the workers that should execute
+  // As workers self-nominate to be coordinators and fan out dispatch shards
+  // they can directly emplace those shards into the workers that should execute
   // them based on the work distribution policy. When workers go to look for
   // more work after their local queue empties they will flush this list and
   // move all of the tasks into their local queue and restart processing.
@@ -127,7 +127,7 @@
   // workers.
   iree_byte_span_t local_memory;
 
-  // Worker-local FIFO queue containing the slices that will be processed by the
+  // Worker-local FIFO queue containing the tasks that will be processed by the
   // worker. This queue supports work-stealing by other workers if they run out
   // of work of their own.
   // LAYOUT: must be 64b away from mailbox_slist.
diff --git a/iree/vm/bytecode_dispatch_test.cc b/iree/vm/bytecode_dispatch_test.cc
index 77ab994..82b10a0 100644
--- a/iree/vm/bytecode_dispatch_test.cc
+++ b/iree/vm/bytecode_dispatch_test.cc
@@ -6,7 +6,7 @@
 
 // Tests covering the dispatch logic for individual ops.
 //
-// bytecode_dispatch_test.mlir contains the functions used here for testing. We
+// iree/vm/test/*.mlir contains the functions used here for testing. We
 // avoid defining the IR inline here so that we can run this test on platforms
 // that we can't run the full MLIR compiler stack on.
 
diff --git a/iree/vm/stack.h b/iree/vm/stack.h
index f28799d..277b3c1 100644
--- a/iree/vm/stack.h
+++ b/iree/vm/stack.h
@@ -143,7 +143,7 @@
 // The contents of the |storage| can be anything upon initialization and the
 // stack must be deinitialized with iree_vm_stack_deinitialize before the
 // storage is freed. The provided |allocator| is only used for stack growth
-// beyond the intial storage capacity and may be iree_allocator_null() to
+// beyond the initial storage capacity and may be iree_allocator_null() to
 // prevent growth. Use IREE_VM_STACK_DEFAULT_SIZE for a reasonable default or
 // use iree_vm_stack_allocate if the input programs may exceed reason.
 //