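Finally removing hal.ex.submit_and_wait 🎉.

The op, its HAL-to-VM import conversion, and its round-trip test are deleted
from the compiler. The benchmark dumping pass (DumpExecutableBenchmarks) now
queues the finalized command buffer for execution on the device and blocks on
`hal.fence.await` instead. On the runtime side, the `ex.submit_and_wait` export
and the per-module-state submit semaphore that backed it are removed from the
HAL module.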
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertExperimentalOps.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertExperimentalOps.cpp
index 08609db..93b3f14 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertExperimentalOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/HALToVM/ConvertExperimentalOps.cpp
@@ -17,8 +17,6 @@
                                          RewritePatternSet &patterns) {
   patterns.insert<VMImportOpConversion<IREE::HAL::ExSharedDeviceOp>>(
       context, importSymbols, typeConverter, "hal.ex.shared_device");
-  patterns.insert<VMImportOpConversion<IREE::HAL::ExSubmitAndWaitOp>>(
-      context, importSymbols, typeConverter, "hal.ex.submit_and_wait");
 }
 
 }  // namespace iree_compiler
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
index c2a0701..67fd07a 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
@@ -45,15 +45,6 @@
   ];
 }
 
-def HAL_ExSubmitAndWaitOp : HAL_Op<"ex.submit_and_wait", [Util_YieldPoint]> {
-  let arguments = (ins
-    HAL_Device:$device,
-    HAL_CommandBuffer:$command_buffer
-  );
-
-  let assemblyFormat = "$device `,` $command_buffer attr-dict";
-}
-
 //===----------------------------------------------------------------------===//
 // Pseudo ops for conversion support
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir
index 8136e4a..1c8cabb 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/IR/test/experimental_ops.mlir
@@ -6,14 +6,3 @@
   %device = hal.ex.shared_device : !hal.device
   return %device : !hal.device
 }
-
-// -----
-
-// CHECK-LABEL: @submit_and_wait
-func.func @submit_and_wait() {
-  %0 = "test_hal.device"() : () -> !hal.device
-  %1 = "test_hal.command_buffer"() : () -> !hal.command_buffer
-  // CHECK: hal.ex.submit_and_wait %0, %1
-  hal.ex.submit_and_wait %0, %1
-  return
-}
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp
index 9c7fa98..65c72e3 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/DumpExecutableBenchmarks.cpp
@@ -312,9 +312,27 @@
         forBuilder.create<scf::YieldOp>(loc);
       });
 
-  // Submit command buffer.
   funcBuilder.create<IREE::HAL::CommandBufferFinalizeOp>(loc, commandBuffer);
-  funcBuilder.create<IREE::HAL::ExSubmitAndWaitOp>(loc, device, commandBuffer);
+
+  // We begin executing immediately and then wait on a fence.
+  // TODO(benvanik): add fences to ABI so the benchmark tool can pipeline.
+  Value waitFence = funcBuilder.create<IREE::Util::NullOp>(
+      loc, funcBuilder.getType<IREE::HAL::FenceType>());
+  Value signalFence = funcBuilder.create<IREE::HAL::TimelineAdvanceOp>(
+      loc, funcBuilder.getType<IREE::HAL::FenceType>());
+
+  // Queue execution.
+  auto queueAffinity = funcBuilder.create<arith::ConstantIntOp>(loc, -1, 64);
+  funcBuilder.create<IREE::HAL::DeviceQueueExecuteOp>(
+      loc, device, queueAffinity, waitFence, signalFence,
+      ValueRange{commandBuffer});
+
+  // Block until it completes.
+  Value timeoutMillis = funcBuilder.create<arith::ConstantIntOp>(loc, -1, 32);
+  auto fenceOp = funcBuilder.create<IREE::HAL::FenceAwaitOp>(
+      loc, funcBuilder.getI32Type(), timeoutMillis, signalFence);
+  funcBuilder.create<IREE::Util::StatusCheckOkOp>(
+      loc, fenceOp.getStatus(), "failed to wait on timepoint");
 
   funcBuilder.create<mlir::func::ReturnOp>(loc);
 }
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir
index a862620..d80cdb3 100644
--- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir
+++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/test/dump_executable_benchmarks.mlir
@@ -90,7 +90,7 @@
 
   // Submit and wait for dispatches to complete:
   // CHECK: hal.command_buffer.finalize<%[[CMD]] : !hal.command_buffer>
-  // CHECK: hal.ex.submit_and_wait %{{.+}}, %[[CMD]]
+  // CHECK: hal.fence.await
 
   // ===========================================================================
   // @dispatch1 benchmark logic (note two deduplicated dispatches):
diff --git a/runtime/src/iree/modules/hal/exports.inl b/runtime/src/iree/modules/hal/exports.inl
index ebc7c43..988841f 100644
--- a/runtime/src/iree/modules/hal/exports.inl
+++ b/runtime/src/iree/modules/hal/exports.inl
@@ -68,7 +68,6 @@
 EXPORT_FN("device.queue.flush", iree_hal_module_device_queue_flush, rI, v)
 
 EXPORT_FN("ex.shared_device", iree_hal_module_ex_shared_device, v, r)
-EXPORT_FN("ex.submit_and_wait", iree_hal_module_ex_submit_and_wait, rr, v)
 
 EXPORT_FN("executable.create", iree_hal_module_executable_create, rrrrCrD, r)
 
diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c
index bcf6dfc..bc0fe89 100644
--- a/runtime/src/iree/modules/hal/module.c
+++ b/runtime/src/iree/modules/hal/module.c
@@ -58,10 +58,6 @@
   // executables like ones for training vs inference in the same model, or just
   // always use this.
   iree_hal_executable_cache_t* executable_cache;
-
-  // TODO(benvanik): remove with submit_and_wait.
-  iree_hal_semaphore_t* submit_semaphore;
-  uint64_t submit_value;
 } iree_hal_module_state_t;
 
 static void IREE_API_PTR iree_hal_module_destroy(void* base_module) {
@@ -91,11 +87,6 @@
               state->shared_device, iree_string_view_empty(),
               iree_loop_inline(&state->loop_status), &state->executable_cache));
 
-  state->submit_value = 0ull;
-  IREE_RETURN_AND_END_ZONE_IF_ERROR(
-      z0, iree_hal_semaphore_create(state->shared_device, state->submit_value,
-                                    &state->submit_semaphore));
-
   *out_module_state = (iree_vm_module_state_t*)state;
   IREE_TRACE_ZONE_END(z0);
   return iree_ok_status();
@@ -106,7 +97,6 @@
   IREE_TRACE_ZONE_BEGIN(z0);
 
   iree_hal_module_state_t* state = (iree_hal_module_state_t*)module_state;
-  iree_hal_semaphore_release(state->submit_semaphore);
   iree_hal_executable_cache_release(state->executable_cache);
   iree_status_ignore(state->loop_status);
   iree_hal_device_release(state->shared_device);
@@ -140,40 +130,6 @@
   return iree_ok_status();
 }
 
-IREE_VM_ABI_EXPORT(iree_hal_module_ex_submit_and_wait,  //
-                   iree_hal_module_state_t,             //
-                   rr, v) {
-  iree_hal_device_t* device = NULL;
-  IREE_RETURN_IF_ERROR(iree_hal_device_check_deref(args->r0, &device));
-  iree_hal_command_buffer_t* command_buffer = NULL;
-  IREE_RETURN_IF_ERROR(
-      iree_hal_command_buffer_check_deref(args->r1, &command_buffer));
-
-  // Batch with our single command buffer.
-  iree_hal_submission_batch_t batch;
-  memset(&batch, 0, sizeof(batch));
-
-  iree_hal_command_buffer_t* command_buffer_ptrs[] = {command_buffer};
-  batch.command_buffer_count = IREE_ARRAYSIZE(command_buffer_ptrs);
-  batch.command_buffers = command_buffer_ptrs;
-
-  uint64_t next_semaphore_value = ++state->submit_value;
-  iree_hal_semaphore_t* signal_semaphore_ptrs[] = {state->submit_semaphore};
-  uint64_t signal_semaphore_values[] = {next_semaphore_value};
-  batch.signal_semaphores.count = IREE_ARRAYSIZE(signal_semaphore_ptrs);
-  batch.signal_semaphores.semaphores = signal_semaphore_ptrs;
-  batch.signal_semaphores.payload_values = signal_semaphore_values;
-
-  iree_status_t status = iree_hal_device_queue_submit(
-      device, IREE_HAL_COMMAND_CATEGORY_ANY, 0, 1, &batch);
-  if (iree_status_is_ok(status)) {
-    status = iree_hal_semaphore_wait(
-        state->submit_semaphore, next_semaphore_value, iree_infinite_timeout());
-  }
-
-  return status;
-}
-
 //===----------------------------------------------------------------------===//
 // Utilities
 //===----------------------------------------------------------------------===//