blob: 38578d07ecb91e6d59d4ed9bb9cc07ed65a4a848 [file]
// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/drivers/webgpu/webgpu_queue.h"
#include "iree/async/frontier.h"
#include "iree/async/frontier_tracker.h"
#include "iree/async/operation.h"
#include "iree/async/operations/semaphore.h"
#include "iree/async/platform/js/proactor.h"
#include "iree/hal/drivers/webgpu/webgpu.h"
#include "iree/hal/drivers/webgpu/webgpu_buffer.h"
#include "iree/hal/drivers/webgpu/webgpu_command_buffer.h"
#include "iree/hal/drivers/webgpu/webgpu_executable.h"
#include "iree/hal/drivers/webgpu/webgpu_fd_file.h"
#include "iree/hal/drivers/webgpu/webgpu_imports.h"
#include "iree/hal/drivers/webgpu/webgpu_semaphore.h"
//===----------------------------------------------------------------------===//
// iree_hal_webgpu_queue_t
//===----------------------------------------------------------------------===//
// Initializes |out_queue| in place from the provided handles and collaborators.
//
// Stores the device/queue handles, builtins, proactor, frontier tracker, axis,
// and host allocator on the queue as-is, resets the epoch counter to 0, and
// sets up the block pool plus the scratch builder that draws from it.
//
// Returns the status of scratch builder initialization. On failure the block
// pool initialized here is deinitialized again before returning.
iree_status_t iree_hal_webgpu_queue_initialize(
    iree_hal_webgpu_handle_t device_handle,
    iree_hal_webgpu_handle_t queue_handle,
    const iree_hal_webgpu_builtins_t* builtins, iree_async_proactor_t* proactor,
    iree_async_frontier_tracker_t* frontier_tracker, iree_async_axis_t axis,
    iree_allocator_t host_allocator, iree_hal_webgpu_queue_t* out_queue) {
  IREE_ASSERT_ARGUMENT(out_queue);

  // Plain field stores; nothing is retained by this function.
  out_queue->host_allocator = host_allocator;
  out_queue->device_handle = device_handle;
  out_queue->queue_handle = queue_handle;
  out_queue->builtins = builtins;
  out_queue->proactor = proactor;
  out_queue->frontier_tracker = frontier_tracker;
  out_queue->axis = axis;

  // Epochs are handed out starting at 1 (the counter is incremented before
  // use), so the initial value is 0.
  iree_atomic_store(&out_queue->epoch, 0, iree_memory_order_relaxed);

  // Shared block pool backing instruction stream builders.
  iree_arena_block_pool_initialize(/*total_block_size=*/65536, host_allocator,
                                   &out_queue->block_pool);

  // Scratch builder for queue operations: dynamic_count = 0 means every slot
  // is static.
  iree_status_t status = iree_hal_webgpu_builder_initialize(
      &out_queue->block_pool, /*dynamic_count=*/0, host_allocator,
      &out_queue->scratch_builder);
  if (iree_status_is_ok(status)) {
    return status;
  }

  // Builder setup failed: unwind the block pool so the caller is left with
  // nothing to deinitialize.
  iree_arena_block_pool_deinitialize(&out_queue->block_pool);
  return status;
}
// Releases the resources owned by |queue|: the scratch builder first, then the
// block pool.
void iree_hal_webgpu_queue_deinitialize(iree_hal_webgpu_queue_t* queue) {
  iree_hal_webgpu_builder_t* scratch = &queue->scratch_builder;
  iree_hal_webgpu_builder_deinitialize(scratch);
  iree_arena_block_pool_deinitialize(&queue->block_pool);
}
// Builds a semaphore list view over storage embedded in a larger allocation.
//
// |base| is the start of the allocation; |semaphores_offset| and
// |payload_values_offset| are byte offsets from |base| to the semaphore
// pointer array and the payload value array, each holding |count| elements.
// No memory is read or written here; only the list descriptor is produced.
static iree_hal_semaphore_list_t iree_hal_webgpu_semaphore_list_at_offsets(
    void* base, iree_host_size_t count, iree_host_size_t semaphores_offset,
    iree_host_size_t payload_values_offset) {
  uint8_t* const slab = (uint8_t*)base;
  iree_hal_semaphore_list_t list = {0};
  list.count = count;
  list.semaphores = (iree_hal_semaphore_t**)(slab + semaphores_offset);
  list.payload_values = (uint64_t*)(slab + payload_values_offset);
  return list;
}
// Copies every (semaphore, payload value) pair from |source_list| into the
// pre-sized storage of |target_list|, retaining each semaphore once.
// Both lists must have the same count (asserted).
static void iree_hal_webgpu_semaphore_list_clone_into(
    iree_hal_semaphore_list_t source_list,
    iree_hal_semaphore_list_t target_list) {
  IREE_ASSERT(source_list.count == target_list.count);
  const iree_host_size_t count = source_list.count;
  for (iree_host_size_t index = 0; index < count; ++index) {
    iree_hal_semaphore_t* semaphore = source_list.semaphores[index];
    target_list.semaphores[index] = semaphore;
    iree_hal_semaphore_retain(semaphore);
    target_list.payload_values[index] = source_list.payload_values[index];
  }
}
//===----------------------------------------------------------------------===//
// Epoch tracking
//===----------------------------------------------------------------------===//
// Reserves the next epoch on |queue| by atomically bumping the epoch counter
// and returns the post-increment value. Called at submit time to establish
// causal ordering. The frontier tracker is NOT touched here; it is advanced at
// completion time (or right after the reservation for fast-path ops where
// submit IS completion).
static uint64_t iree_hal_webgpu_queue_reserve_epoch(
    iree_hal_webgpu_queue_t* queue) {
  // fetch_add yields the value prior to the increment.
  const uint64_t previous_epoch = (uint64_t)iree_atomic_fetch_add(
      &queue->epoch, 1, iree_memory_order_acq_rel);
  return previous_epoch + 1;
}
// Moves the queue's axis in the frontier tracker forward to |epoch|. Called at
// completion time (after GPU work finishes via onSubmittedWorkDone) or
// immediately after reserve_epoch for fast-path ops where submit IS
// completion. A queue without a frontier tracker makes this a no-op.
static void iree_hal_webgpu_queue_advance_tracker(
    iree_hal_webgpu_queue_t* queue, uint64_t epoch) {
  if (queue->frontier_tracker) {
    iree_async_frontier_tracker_advance(queue->frontier_tracker, queue->axis,
                                        epoch);
  }
}
//===----------------------------------------------------------------------===//
// Frontier construction
//===----------------------------------------------------------------------===//
// Fills |out_frontier| (caller-provided storage, typically on the stack) with
// a single-entry frontier for the queue's axis at |epoch| and returns it as a
// const frontier pointer. Returns NULL without touching |out_frontier| when
// the queue has no frontier tracker (frontiers disabled). The returned pointer
// is only valid while |out_frontier| is.
static const iree_async_frontier_t* iree_hal_webgpu_queue_build_frontier(
    iree_hal_webgpu_queue_t* queue, uint64_t epoch,
    iree_async_single_frontier_t* out_frontier) {
  if (queue->frontier_tracker) {
    iree_async_single_frontier_initialize(out_frontier, queue->axis, epoch);
    return iree_async_single_frontier_as_const_frontier(out_frontier);
  }
  return NULL;
}
//===----------------------------------------------------------------------===//
// Scratch builder execution
//===----------------------------------------------------------------------===//
// Runs the scratch builder's instruction stream through the one-shot JS path
// (execute_instructions).
//
// A flat binding table in wire format is assembled from the builder's static
// slot entries and handed to the JS processor together with the builtins
// descriptor. Tables of up to 8 slots live on the stack; larger ones are
// allocated from the queue's host allocator for the duration of the call.
//
// Returns the allocation failure if the table could not be allocated,
// IREE_STATUS_INTERNAL if the JS import reports a non-zero code, and OK
// otherwise.
static iree_status_t iree_hal_webgpu_queue_execute_scratch_builder(
    iree_hal_webgpu_queue_t* queue) {
  iree_hal_webgpu_builder_t* builder = &queue->scratch_builder;
  const uint32_t slot_count = iree_hal_webgpu_builder_total_slot_count(builder);
  const uint32_t static_slot_count =
      iree_hal_webgpu_builder_static_slot_count(builder);

  // Stack storage covers the common case of queue ops touching 1-3 buffers.
  iree_hal_webgpu_isa_binding_table_entry_t stack_table[8];
  iree_hal_webgpu_isa_binding_table_entry_t* table = stack_table;
  const bool table_on_heap = slot_count > IREE_ARRAYSIZE(stack_table);
  if (table_on_heap) {
    IREE_RETURN_IF_ERROR(iree_allocator_malloc_array(
        queue->host_allocator, slot_count,
        sizeof(iree_hal_webgpu_isa_binding_table_entry_t), (void**)&table));
  }

  // Scatter each static slot entry into its slot index in the table.
  const iree_hal_webgpu_builder_slot_entry_t* static_slots =
      iree_hal_webgpu_builder_static_slot_entries(builder);
  for (uint32_t i = 0; i < static_slot_count; ++i) {
    const iree_hal_webgpu_builder_slot_entry_t* source = &static_slots[i];
    iree_hal_webgpu_isa_binding_table_entry_t* target = &table[source->slot];
    target->gpu_buffer_handle = source->gpu_buffer_handle;
    target->base_offset = 0;
  }

  iree_hal_webgpu_isa_builtins_descriptor_t builtins_descriptor;
  iree_hal_webgpu_builtins_get_descriptor(queue->builtins,
                                          &builtins_descriptor);

  // Pointers cross the import boundary as 32-bit integers.
  const uint32_t error_code = iree_hal_webgpu_import_execute_instructions(
      queue->device_handle, queue->queue_handle,
      (uint32_t)(uintptr_t)iree_hal_webgpu_builder_block_table(builder),
      iree_hal_webgpu_builder_block_count(builder),
      iree_hal_webgpu_builder_block_word_capacity(builder),
      iree_hal_webgpu_builder_last_block_word_count(builder),
      (uint32_t)(uintptr_t)table, slot_count,
      (uint32_t)(uintptr_t)&builtins_descriptor);

  // The table is only needed for the duration of the import call.
  if (table_on_heap) {
    iree_allocator_free(queue->host_allocator, table);
  }

  if (error_code == 0) {
    return iree_ok_status();
  }
  return iree_make_status(IREE_STATUS_INTERNAL,
                          "JS execute_instructions failed with code %u",
                          error_code);
}
//===----------------------------------------------------------------------===//
// Signal completion (proactor-driven onSubmittedWorkDone)
//===----------------------------------------------------------------------===//
// Holds an async operation and a snapshot of the signal semaphore list for
// deferred signaling after onSubmittedWorkDone completes. Allocated as a single
// block: [struct | semaphore_ptrs[] | payload_values[]].
//
// The completion callback casts the base operation pointer straight back to
// this struct, so |operation| must remain the first member.
typedef struct iree_hal_webgpu_signal_completion_t {
// Base operation handed to the proactor; must be first (see above).
iree_async_operation_t operation;
// Retained copy of the caller's signal list; its arrays point into the
// trailing storage of this same allocation.
iree_hal_semaphore_list_t signal_semaphore_list;
iree_hal_webgpu_queue_t* queue; // Borrowed, outlives completion.
// Epoch reserved for the submission; used by the completion callback to build
// the frontier and to advance the frontier tracker.
uint64_t epoch;
// Allocator the block was allocated from; used to free it in the callback.
iree_allocator_t allocator;
} iree_hal_webgpu_signal_completion_t;
// Proactor completion callback for a standalone signal completion; runs when
// onSubmittedWorkDone fires for the associated submission.
//
// On success the semaphores are signaled with a frontier built from the
// queue's axis and the stored epoch; on failure they are failed with |status|.
// In both cases the frontier tracker is then advanced to the stored epoch, the
// semaphore references are released, and the completion block is freed.
// |user_data| and |flags| are unused.
static void iree_hal_webgpu_signal_completion_fn(
    void* user_data, iree_async_operation_t* base_operation,
    iree_status_t status, iree_async_completion_flags_t flags) {
  // The operation is the first member of the completion block.
  iree_hal_webgpu_signal_completion_t* completion =
      (iree_hal_webgpu_signal_completion_t*)base_operation;
  iree_hal_webgpu_queue_t* queue = completion->queue;
  const uint64_t epoch = completion->epoch;
  const iree_hal_semaphore_list_t signal_list =
      completion->signal_semaphore_list;

  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(signal_list, status);
  } else {
    iree_async_single_frontier_t frontier_storage;
    const iree_async_frontier_t* frontier =
        iree_hal_webgpu_queue_build_frontier(queue, epoch, &frontier_storage);
    iree_status_ignore(iree_hal_semaphore_list_signal(signal_list, frontier));
  }

  iree_hal_webgpu_queue_advance_tracker(queue, epoch);
  iree_hal_semaphore_list_release(signal_list);
  iree_allocator_free(completion->allocator, completion);
}
// Arranges for every semaphore in |signal_semaphore_list| to be signaled once
// the GPU work submitted so far completes (via onSubmittedWorkDone).
//
// |epoch| is the epoch previously obtained from reserve_epoch; the completion
// callback builds a frontier from it and advances the tracker. With an empty
// semaphore list there is nothing to signal, so the tracker is advanced
// immediately and OK is returned.
//
// Otherwise a single block holding the completion struct plus retained copies
// of the semaphore pointers and payload values is allocated, submitted to the
// JS proactor as an external operation, and its token is registered with the
// JS bridge. If layout, allocation, or proactor submission fails, the error is
// returned and nothing remains allocated or retained.
static iree_status_t iree_hal_webgpu_queue_register_signal_completion(
    iree_hal_webgpu_queue_t* queue, uint64_t epoch,
    const iree_hal_semaphore_list_t signal_semaphore_list) {
  const iree_host_size_t signal_count = signal_semaphore_list.count;
  if (signal_count == 0) {
    iree_hal_webgpu_queue_advance_tracker(queue, epoch);
    return iree_ok_status();
  }

  // Lay out [struct | semaphore_ptrs[] | payload_values[]].
  iree_host_size_t slab_size = 0;
  iree_host_size_t semaphores_offset = 0;
  iree_host_size_t payload_values_offset = 0;
  IREE_RETURN_IF_ERROR(IREE_STRUCT_LAYOUT(
      sizeof(iree_hal_webgpu_signal_completion_t), &slab_size,
      IREE_STRUCT_FIELD(signal_count, iree_hal_semaphore_t*,
                        &semaphores_offset),
      IREE_STRUCT_FIELD(signal_count, uint64_t, &payload_values_offset)));

  iree_hal_webgpu_signal_completion_t* completion = NULL;
  IREE_RETURN_IF_ERROR(iree_allocator_malloc(queue->host_allocator, slab_size,
                                             (void**)&completion));

  completion->queue = queue;
  completion->epoch = epoch;
  completion->allocator = queue->host_allocator;

  // Snapshot the caller's list into the trailing storage, retaining each
  // semaphore for the lifetime of the completion.
  completion->signal_semaphore_list = iree_hal_webgpu_semaphore_list_at_offsets(
      completion, signal_count, semaphores_offset, payload_values_offset);
  iree_hal_webgpu_semaphore_list_clone_into(signal_semaphore_list,
                                            completion->signal_semaphore_list);

  iree_async_operation_initialize(
      &completion->operation, IREE_ASYNC_OPERATION_TYPE_NOP,
      IREE_ASYNC_OPERATION_FLAG_NONE, iree_hal_webgpu_signal_completion_fn,
      /*user_data=*/NULL);

  // Submit to the JS proactor's token table to get a completion token.
  uint32_t token = UINT32_MAX;
  iree_status_t status = iree_async_proactor_js_submit_external(
      queue->proactor, &completion->operation, &token);
  if (iree_status_is_ok(status)) {
    // Register onSubmittedWorkDone with the JS bridge. When the GPU finishes
    // the submitted work, JS writes {token, status_code} to the completion
    // ring and the proactor's drain path dispatches the callback.
    iree_hal_webgpu_import_queue_on_submitted_work_done(queue->queue_handle,
                                                        token);
    return iree_ok_status();
  }

  // The proactor never took the operation: drop the retained semaphores and
  // free the block here.
  iree_hal_semaphore_list_release(completion->signal_semaphore_list);
  iree_allocator_free(queue->host_allocator, completion);
  return status;
}
// Forward declaration — defined in the FIFO wait elision section.
// Declared early because iree_hal_webgpu_queue_submit_scratch_and_signal calls
// it after a successful submit, before the definition appears in the file.
static void iree_hal_webgpu_queue_mark_signals_submitted(
iree_hal_webgpu_queue_t* queue,
const iree_hal_semaphore_list_t signal_semaphore_list);
// Finalizes and executes the scratch builder, then registers signal
// completion. Used by fast-path queue operations (fill, update, copy,
// dispatch) that build a single instruction into the scratch builder and
// submit it synchronously.
//
// An epoch is always reserved first, even when |status| already carries a
// failure. On the success path an onSubmittedWorkDone completion carrying that
// epoch is registered and the signal semaphores are marked with submitted
// provenance for FIFO wait elision. On any failure the signal semaphores are
// failed with a clone of the error and the tracker is advanced to the reserved
// epoch right away.
//
// |status| is the accumulated status from the caller's builder commands; if it
// is already a failure, finalize/execute/register are all skipped. The
// returned status is the first failure encountered, or OK.
static iree_status_t iree_hal_webgpu_queue_submit_scratch_and_signal(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_status_t status) {
  const uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);

  // Each stage runs only if everything before it succeeded.
  if (iree_status_is_ok(status)) {
    status = iree_hal_webgpu_builder_finalize(&queue->scratch_builder);
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_webgpu_queue_execute_scratch_builder(queue);
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_webgpu_queue_register_signal_completion(
        queue, epoch, signal_semaphore_list);
  }

  if (!iree_status_is_ok(status)) {
    // The caller still receives |status|, so the semaphores get a clone.
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    iree_hal_webgpu_queue_advance_tracker(queue, epoch);
    return status;
  }

  iree_hal_webgpu_queue_mark_signals_submitted(queue, signal_semaphore_list);
  return status;
}
// Forward declarations for queue_execute helpers used in the wait callback.
// The EXECUTE case of the wait callback calls execute_recording when the
// command buffer has a non-zero recording handle and execute_one_shot
// otherwise.
static iree_status_t iree_hal_webgpu_queue_execute_recording(
iree_hal_webgpu_queue_t* queue, iree_hal_webgpu_handle_t recording_handle,
iree_hal_buffer_binding_table_t binding_table);
static iree_status_t iree_hal_webgpu_queue_execute_one_shot(
iree_hal_webgpu_queue_t* queue, iree_hal_command_buffer_t* command_buffer);
//===----------------------------------------------------------------------===//
// Unified async queue operation state
//===----------------------------------------------------------------------===//
//
// All async queue operations (except host_call, which has unique DEFERRED
// semantics) share a common state structure. The wait completion callback
// dispatches on op_type to perform the operation's work, then either signals
// inline (CPU-only ops) or transfers slab ownership to an embedded signal
// completion (GPU-submit ops) for GPU completion tracking.
//
// CPU-only (barrier, alloca, dealloca):
// wait → CPU work → signal inline → advance epoch → free slab.
//
// GPU-submit (fill, update, copy, read, dispatch, execute):
// wait → GPU work → release per-op resources → init embedded signal
// → register onSubmittedWorkDone → [slab lives on]
// → onSubmittedWorkDone fires → signal → advance epoch → free slab.
// Discriminator for the unified async queue operation state. The wait
// completion callback switches on this value to decide what work to perform,
// and the resource-release helper switches on it to decide which union member
// of the state holds retained resources.
typedef enum iree_hal_webgpu_queue_op_type_e {
// CPU-only: wait → CPU work → signal inline.
IREE_HAL_WEBGPU_QUEUE_OP_BARRIER,
IREE_HAL_WEBGPU_QUEUE_OP_ALLOCA,
IREE_HAL_WEBGPU_QUEUE_OP_DEALLOCA,
// GPU-submit: wait → GPU work → signal on GPU completion.
IREE_HAL_WEBGPU_QUEUE_OP_FILL,
IREE_HAL_WEBGPU_QUEUE_OP_UPDATE,
IREE_HAL_WEBGPU_QUEUE_OP_COPY,
IREE_HAL_WEBGPU_QUEUE_OP_READ,
IREE_HAL_WEBGPU_QUEUE_OP_DISPATCH,
IREE_HAL_WEBGPU_QUEUE_OP_EXECUTE,
} iree_hal_webgpu_queue_op_type_t;
// Per-operation state slab for the unified async queue operations. One slab is
// allocated per queue operation from |allocator| and freed either by the wait
// completion callback (CPU-only ops and error paths) or by the embedded signal
// completion callback (GPU-submit ops that were submitted successfully).
typedef struct iree_hal_webgpu_queue_state_t {
// Must be first — the proactor casts between base operation and this.
iree_async_semaphore_wait_operation_t wait_operation;
// Embedded signal operation for GPU-submit ops. After the wait callback
// submits GPU work, this is initialized as a NOP external operation with a
// proactor token. The slab stays alive until onSubmittedWorkDone fires and
// the signal callback runs.
iree_async_operation_t signal_operation;
// Queue the operation was submitted on; used for the proactor, block pool,
// scratch builder, and frontier tracker.
iree_hal_webgpu_queue_t* queue;
// Pre-incremented epoch. The atomic counter is incremented at submit time
// for causal ordering; the frontier tracker is advanced at completion time.
uint64_t epoch;
// Semaphores to wait on before the work runs; released by the wait callback.
iree_hal_semaphore_list_t wait_semaphore_list;
// Semaphores signaled (or failed) when the operation completes.
iree_hal_semaphore_list_t signal_semaphore_list;
// Selects which member of the union below is active.
iree_hal_webgpu_queue_op_type_t op_type;
union {
struct {
iree_hal_buffer_t* buffer; // Retained stub.
} alloca_op;
struct {
iree_hal_buffer_t* buffer; // Retained.
} dealloca;
struct {
iree_hal_buffer_t* target_buffer; // Retained.
iree_device_size_t target_offset;
iree_device_size_t length;
uint32_t pattern;
iree_host_size_t pattern_length;
} fill;
struct {
iree_hal_buffer_t* target_buffer; // Retained.
iree_device_size_t target_offset;
iree_device_size_t length;
void* captured_data; // Points into acquired block.
iree_arena_block_t* captured_block; // For releasing back to pool.
} update;
struct {
iree_hal_buffer_t* source_buffer; // Retained.
iree_device_size_t source_offset;
iree_hal_buffer_t* target_buffer; // Retained.
iree_device_size_t target_offset;
iree_device_size_t length;
} copy;
struct {
iree_hal_buffer_t* storage; // Retained if non-NULL (HOST_LOCAL).
iree_hal_file_t* source_file; // Retained if storage is NULL (FD).
uint64_t source_offset;
iree_hal_buffer_t* target_buffer; // Retained.
iree_device_size_t target_offset;
iree_device_size_t length;
} read;
struct {
iree_hal_webgpu_handle_t pipeline_handle;
iree_hal_webgpu_handle_t bind_group_layout_handle;
uint32_t workgroup_count[3];
iree_hal_executable_t* executable; // Retained.
iree_hal_buffer_ref_t* bindings; // Points into trailing slab.
uint32_t binding_count;
} dispatch;
struct {
iree_hal_command_buffer_t* command_buffer; // Retained.
iree_hal_buffer_binding_t* binding_table; // Points into trailing slab.
iree_host_size_t binding_count;
} execute;
};
// Allocator the slab was allocated from; used to free it.
iree_allocator_t allocator;
// Trailing arrays (via IREE_STRUCT_LAYOUT):
// iree_async_semaphore_t* wait_semaphores[wait_count]
// uint64_t wait_values[wait_count]
// iree_hal_semaphore_t* signal_semaphores[signal_count]
// uint64_t signal_values[signal_count]
// Per-op trailing data:
// DISPATCH: iree_hal_buffer_ref_t bindings[binding_count]
// EXECUTE: iree_hal_buffer_binding_t binding_table[binding_count]
} iree_hal_webgpu_queue_state_t;
// Drops the references held by the active union member of |state|, selected
// by |state->op_type|. Used on both the success path (once the work has been
// performed or handed to the GPU) and the failure paths (wait error, submit
// error). The semaphore lists and the slab itself are NOT touched here.
static void iree_hal_webgpu_queue_op_release_resources(
    iree_hal_webgpu_queue_state_t* state) {
  switch (state->op_type) {
    case IREE_HAL_WEBGPU_QUEUE_OP_ALLOCA: {
      iree_hal_buffer_release(state->alloca_op.buffer);
      break;
    }
    case IREE_HAL_WEBGPU_QUEUE_OP_DEALLOCA: {
      iree_hal_buffer_release(state->dealloca.buffer);
      break;
    }
    case IREE_HAL_WEBGPU_QUEUE_OP_FILL: {
      iree_hal_buffer_release(state->fill.target_buffer);
      break;
    }
    case IREE_HAL_WEBGPU_QUEUE_OP_UPDATE: {
      iree_hal_buffer_release(state->update.target_buffer);
      // The captured source data lives in a single block, so the same block
      // is passed as both ends of the range returned to the queue's pool.
      iree_arena_block_t* block = state->update.captured_block;
      iree_arena_block_pool_release(&state->queue->block_pool, block, block);
      break;
    }
    case IREE_HAL_WEBGPU_QUEUE_OP_COPY: {
      iree_hal_buffer_release(state->copy.source_buffer);
      iree_hal_buffer_release(state->copy.target_buffer);
      break;
    }
    case IREE_HAL_WEBGPU_QUEUE_OP_READ: {
      // Exactly one source is retained: the file when there is no storage
      // buffer, the storage buffer otherwise.
      if (!state->read.storage) {
        iree_hal_file_release(state->read.source_file);
      } else {
        iree_hal_buffer_release(state->read.storage);
      }
      iree_hal_buffer_release(state->read.target_buffer);
      break;
    }
    case IREE_HAL_WEBGPU_QUEUE_OP_DISPATCH: {
      iree_hal_executable_release(state->dispatch.executable);
      const uint32_t binding_count = state->dispatch.binding_count;
      for (uint32_t binding = 0; binding < binding_count; ++binding) {
        iree_hal_buffer_release(state->dispatch.bindings[binding].buffer);
      }
      break;
    }
    case IREE_HAL_WEBGPU_QUEUE_OP_EXECUTE: {
      iree_hal_command_buffer_release(state->execute.command_buffer);
      const iree_host_size_t binding_count = state->execute.binding_count;
      for (iree_host_size_t binding = 0; binding < binding_count; ++binding) {
        iree_hal_buffer_release(state->execute.binding_table[binding].buffer);
      }
      break;
    }
    // Barriers hold no per-op resources; unknown types are ignored.
    case IREE_HAL_WEBGPU_QUEUE_OP_BARRIER:
    default:
      break;
  }
}
// Disposes of a queue state slab whose proactor submission failed.
//
// The signal semaphores are failed with a clone of |submit_status| (so
// downstream waiters observe the error instead of hanging); |submit_status|
// itself is left with the caller. Per-op resources and both semaphore lists
// are then released and the slab is freed.
//
// NOTE(review): this path does not advance the frontier tracker for
// state->epoch; presumably the caller handles that — confirm.
static void iree_hal_webgpu_queue_state_submit_failed(
    iree_hal_webgpu_queue_state_t* state, iree_status_t submit_status) {
  const iree_hal_semaphore_list_t signal_list = state->signal_semaphore_list;
  iree_hal_semaphore_list_fail(signal_list, iree_status_clone(submit_status));

  iree_hal_webgpu_queue_op_release_resources(state);

  iree_hal_semaphore_list_release(state->wait_semaphore_list);
  iree_hal_semaphore_list_release(signal_list);
  iree_allocator_free(state->allocator, state);
}
// Proactor completion callback for the signal operation embedded in a queue
// state slab; runs when onSubmittedWorkDone is delivered for a GPU-submit op.
//
// |base_operation| points at the slab's signal_operation member, so the slab
// is recovered by subtracting that member's offset. The signal semaphores are
// signaled with the op's frontier on success or failed with |status|
// otherwise; the tracker is then advanced to the op's epoch, the semaphore
// references are dropped, and the slab is freed. |user_data| and |flags| are
// unused.
static void iree_hal_webgpu_queue_op_signal_completion(
    void* user_data, iree_async_operation_t* base_operation,
    iree_status_t status, iree_async_completion_flags_t flags) {
  uint8_t* operation_bytes = (uint8_t*)base_operation;
  iree_hal_webgpu_queue_state_t* state =
      (iree_hal_webgpu_queue_state_t*)(operation_bytes -
                                       offsetof(iree_hal_webgpu_queue_state_t,
                                                signal_operation));
  iree_hal_webgpu_queue_t* queue = state->queue;
  const uint64_t epoch = state->epoch;

  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(state->signal_semaphore_list, status);
  } else {
    iree_async_single_frontier_t frontier_storage;
    const iree_async_frontier_t* frontier =
        iree_hal_webgpu_queue_build_frontier(queue, epoch, &frontier_storage);
    iree_status_ignore(
        iree_hal_semaphore_list_signal(state->signal_semaphore_list, frontier));
  }

  iree_hal_webgpu_queue_advance_tracker(queue, epoch);
  iree_hal_semaphore_list_release(state->signal_semaphore_list);
  iree_allocator_free(state->allocator, state);
}
// Arms the slab's embedded signal_operation so that onSubmittedWorkDone drives
// completion of a GPU-submit op. Called from the wait callback after the GPU
// work has been submitted.
//
// Returns true on success: the operation has been submitted to the proactor,
// its token registered with the JS bridge, and the per-op resources released.
// The caller MUST return immediately, because ownership of the slab has moved
// to the signal callback.
//
// Returns false if proactor submission fails: the signal semaphores have been
// failed with the error and the caller still owns the slab (it should break to
// the shared cleanup).
static bool iree_hal_webgpu_queue_op_register_embedded_signal(
    iree_hal_webgpu_queue_state_t* state) {
  iree_hal_webgpu_queue_t* queue = state->queue;

  iree_async_operation_initialize(
      &state->signal_operation, IREE_ASYNC_OPERATION_TYPE_NOP,
      IREE_ASYNC_OPERATION_FLAG_NONE,
      iree_hal_webgpu_queue_op_signal_completion, /*user_data=*/NULL);

  uint32_t token = UINT32_MAX;
  iree_status_t status = iree_async_proactor_js_submit_external(
      queue->proactor, &state->signal_operation, &token);
  if (iree_status_is_ok(status)) {
    iree_hal_webgpu_import_queue_on_submitted_work_done(queue->queue_handle,
                                                        token);
    // Per-op resources released now — GPU has its own references to buffers.
    iree_hal_webgpu_queue_op_release_resources(state);
    return true;
  }

  iree_hal_semaphore_list_fail(state->signal_semaphore_list, status);
  return false;
}
// Completes a GPU-submit op from the wait callback: finalizes the queue's
// scratch builder, executes it, and arms the embedded signal for GPU
// completion tracking. Called after the per-op builder commands have been
// recorded.
//
// |work_status| is the accumulated status of those builder calls; if it is
// already a failure, finalize and execute are skipped.
//
// Returns true on success (the caller must return immediately — the signal
// callback now owns the slab). Returns false on any failure, in which case the
// signal semaphores have been failed and the caller should break to the shared
// cleanup.
static bool iree_hal_webgpu_queue_op_finalize_and_submit(
    iree_hal_webgpu_queue_state_t* state, iree_status_t work_status) {
  iree_hal_webgpu_queue_t* queue = state->queue;
  iree_status_t status = work_status;

  if (iree_status_is_ok(status)) {
    status = iree_hal_webgpu_builder_finalize(&queue->scratch_builder);
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_webgpu_queue_execute_scratch_builder(queue);
  }

  if (iree_status_is_ok(status)) {
    return iree_hal_webgpu_queue_op_register_embedded_signal(state);
  }

  iree_hal_semaphore_list_fail(state->signal_semaphore_list, status);
  return false;
}
// Wait completion callback for all unified async queue operations. Dispatches
// on op_type to perform the operation's work after input semaphores are
// satisfied. CPU-only ops signal inline and fall through to shared cleanup.
// GPU-submit ops submit GPU work and return, transferring slab ownership to
// the signal completion callback.
//
// |base_operation| is the slab's wait_operation (first member of the state),
// |wait_status| is the outcome of the semaphore wait, and |user_data| / |flags|
// are unused.
//
// Every path on which this callback keeps ownership of the slab (wait failure,
// CPU-only completion, GPU-submit error) fails or signals the signal
// semaphores, advances the frontier tracker to the op's reserved epoch,
// releases per-op resources and the signal semaphore references, and frees the
// slab.
static void iree_hal_webgpu_queue_op_wait_completion(
    void* user_data, iree_async_operation_t* base_operation,
    iree_status_t wait_status, iree_async_completion_flags_t flags) {
  iree_hal_webgpu_queue_state_t* state =
      (iree_hal_webgpu_queue_state_t*)base_operation;

  // Wait semaphores consumed regardless of outcome.
  iree_hal_semaphore_list_release(state->wait_semaphore_list);

  if (!iree_status_is_ok(wait_status)) {
    // Propagate wait failure to signal semaphores and clean up everything.
    iree_hal_semaphore_list_fail(state->signal_semaphore_list, wait_status);
    // The epoch was reserved at submit time; advance the tracker for it just
    // like every other completion path does so the reserved epoch is never
    // left unaccounted for.
    iree_hal_webgpu_queue_advance_tracker(state->queue, state->epoch);
    iree_hal_webgpu_queue_op_release_resources(state);
    iree_hal_semaphore_list_release(state->signal_semaphore_list);
    iree_allocator_free(state->allocator, state);
    return;
  }

  // Build frontier once for all cases that signal inline. GPU-submit success
  // cases don't use it here (they build their own in the signal callback).
  iree_async_single_frontier_t frontier_storage;
  const iree_async_frontier_t* frontier = iree_hal_webgpu_queue_build_frontier(
      state->queue, state->epoch, &frontier_storage);

  switch (state->op_type) {
    case IREE_HAL_WEBGPU_QUEUE_OP_BARRIER:
      // CPU-only: no work — signal passes straight through.
      iree_status_ignore(iree_hal_semaphore_list_signal(
          state->signal_semaphore_list, frontier));
      break;

    case IREE_HAL_WEBGPU_QUEUE_OP_ALLOCA: {
      // CPU-only: create the GPU buffer on the stub, then signal.
      iree_status_t bind_status = iree_hal_webgpu_buffer_bind(
          state->alloca_op.buffer, state->queue->device_handle);
      if (iree_status_is_ok(bind_status)) {
        iree_status_ignore(iree_hal_semaphore_list_signal(
            state->signal_semaphore_list, frontier));
      } else {
        iree_hal_semaphore_list_fail(state->signal_semaphore_list, bind_status);
      }
      break;
    }

    case IREE_HAL_WEBGPU_QUEUE_OP_DEALLOCA:
      // CPU-only: detach GPU buffer from the wrapper, then signal.
      iree_hal_webgpu_buffer_unbind(state->dealloca.buffer);
      iree_status_ignore(iree_hal_semaphore_list_signal(
          state->signal_semaphore_list, frontier));
      break;

    case IREE_HAL_WEBGPU_QUEUE_OP_FILL: {
      // GPU-submit: scratch build fill → GPU execute → signal on completion.
      iree_hal_webgpu_queue_t* queue = state->queue;
      iree_status_t work_status =
          iree_hal_webgpu_builder_reset(&queue->scratch_builder);
      if (iree_status_is_ok(work_status)) {
        iree_hal_buffer_ref_t target_ref = iree_hal_make_buffer_ref(
            state->fill.target_buffer, state->fill.target_offset,
            state->fill.length);
        work_status = iree_hal_webgpu_builder_fill_buffer(
            &queue->scratch_builder, target_ref, &state->fill.pattern,
            state->fill.pattern_length);
      }
      if (iree_hal_webgpu_queue_op_finalize_and_submit(state, work_status)) {
        return;
      }
      break;
    }

    case IREE_HAL_WEBGPU_QUEUE_OP_UPDATE: {
      // GPU-submit: scratch build update → GPU execute → signal on completion.
      // Source data was captured into a block pool block at submit time.
      iree_hal_webgpu_queue_t* queue = state->queue;
      iree_status_t work_status =
          iree_hal_webgpu_builder_reset(&queue->scratch_builder);
      if (iree_status_is_ok(work_status)) {
        iree_hal_buffer_ref_t target_ref = iree_hal_make_buffer_ref(
            state->update.target_buffer, state->update.target_offset,
            state->update.length);
        work_status = iree_hal_webgpu_builder_update_buffer(
            &queue->scratch_builder, state->update.captured_data,
            /*source_offset=*/0, target_ref);
      }
      if (iree_hal_webgpu_queue_op_finalize_and_submit(state, work_status)) {
        return;
      }
      break;
    }

    case IREE_HAL_WEBGPU_QUEUE_OP_COPY: {
      // GPU-submit: scratch build copy → GPU execute → signal on completion.
      iree_hal_webgpu_queue_t* queue = state->queue;
      iree_status_t work_status =
          iree_hal_webgpu_builder_reset(&queue->scratch_builder);
      if (iree_status_is_ok(work_status)) {
        iree_hal_buffer_ref_t source_ref = iree_hal_make_buffer_ref(
            state->copy.source_buffer, state->copy.source_offset,
            state->copy.length);
        iree_hal_buffer_ref_t target_ref = iree_hal_make_buffer_ref(
            state->copy.target_buffer, state->copy.target_offset,
            state->copy.length);
        work_status = iree_hal_webgpu_builder_copy_buffer(
            &queue->scratch_builder, source_ref, target_ref);
      }
      if (iree_hal_webgpu_queue_op_finalize_and_submit(state, work_status)) {
        return;
      }
      break;
    }

    case IREE_HAL_WEBGPU_QUEUE_OP_READ: {
      // GPU-submit: file → GPU via bridge import → signal on completion.
      // No scratch builder needed — calls queue.writeBuffer() directly.
      iree_hal_webgpu_queue_t* queue = state->queue;
      iree_hal_webgpu_handle_t gpu_handle = iree_hal_webgpu_buffer_handle(
          iree_hal_buffer_allocated_buffer(state->read.target_buffer));
      uint64_t gpu_offset =
          iree_hal_buffer_byte_offset(state->read.target_buffer) +
          state->read.target_offset;
      iree_status_t work_status = iree_ok_status();
      if (state->read.storage) {
        // HOST_LOCAL: map the storage buffer and upload from host pointer.
        iree_hal_buffer_mapping_t mapping = {{0}};
        work_status = iree_hal_buffer_map_range(
            state->read.storage, IREE_HAL_MAPPING_MODE_SCOPED,
            IREE_HAL_MEMORY_ACCESS_READ, state->read.source_offset,
            state->read.length, &mapping);
        if (iree_status_is_ok(work_status)) {
          iree_hal_webgpu_import_queue_write_buffer(
              queue->queue_handle, gpu_handle, gpu_offset,
              (uint32_t)(uintptr_t)mapping.contents.data, state->read.length);
          iree_hal_buffer_unmap_range(&mapping);
        }
      } else {
        // FD: use zero-copy bridge import.
        int fd = iree_hal_webgpu_fd_file_fd(state->read.source_file);
        iree_hal_webgpu_import_queue_write_buffer_from_file(
            queue->queue_handle, gpu_handle, gpu_offset, (uint32_t)fd,
            state->read.source_offset, state->read.length);
      }
      if (!iree_status_is_ok(work_status)) {
        iree_hal_semaphore_list_fail(state->signal_semaphore_list, work_status);
        break;
      }
      if (!iree_hal_webgpu_queue_op_register_embedded_signal(state)) {
        break;
      }
      return;
    }

    case IREE_HAL_WEBGPU_QUEUE_OP_DISPATCH: {
      // GPU-submit: scratch build dispatch → GPU execute → signal on
      // completion.
      iree_hal_webgpu_queue_t* queue = state->queue;
      iree_hal_buffer_ref_list_t binding_list = {
          .values = state->dispatch.bindings,
          .count = state->dispatch.binding_count,
      };
      iree_status_t work_status =
          iree_hal_webgpu_builder_reset(&queue->scratch_builder);
      if (iree_status_is_ok(work_status)) {
        work_status = iree_hal_webgpu_builder_dispatch(
            &queue->scratch_builder, state->dispatch.pipeline_handle,
            state->dispatch.bind_group_layout_handle,
            state->dispatch.workgroup_count, binding_list);
      }
      if (iree_hal_webgpu_queue_op_finalize_and_submit(state, work_status)) {
        return;
      }
      break;
    }

    case IREE_HAL_WEBGPU_QUEUE_OP_EXECUTE: {
      // GPU-submit: command buffer submit → signal on GPU completion.
      iree_hal_webgpu_queue_t* queue = state->queue;
      iree_hal_webgpu_handle_t recording_handle =
          iree_hal_webgpu_command_buffer_recording_handle(
              state->execute.command_buffer);
      iree_status_t work_status;
      if (recording_handle) {
        iree_hal_buffer_binding_table_t binding_table = {
            .count = state->execute.binding_count,
            .bindings = state->execute.binding_table,
        };
        work_status = iree_hal_webgpu_queue_execute_recording(
            queue, recording_handle, binding_table);
      } else {
        work_status = iree_hal_webgpu_queue_execute_one_shot(
            queue, state->execute.command_buffer);
      }
      if (!iree_status_is_ok(work_status)) {
        iree_hal_semaphore_list_fail(state->signal_semaphore_list, work_status);
        break;
      }
      if (!iree_hal_webgpu_queue_op_register_embedded_signal(state)) {
        break;
      }
      return;
    }

    // GPU-submit cases that succeed return from their case above,
    // transferring slab ownership to the signal completion callback. Error
    // cases break to the shared cleanup below.
    default:
      // Ops that haven't been converted to the unified async pattern never
      // create queue_state_t slabs. Reaching here is a programming error.
      iree_hal_semaphore_list_fail(
          state->signal_semaphore_list,
          iree_make_status(IREE_STATUS_INTERNAL,
                           "unexpected op type %d in queue wait callback",
                           (int)state->op_type));
      break;
  }

  // Shared cleanup for CPU-only completion and GPU-submit error recovery.
  // GPU-submit success cases return from their switch case before reaching
  // here.
  iree_hal_webgpu_queue_advance_tracker(state->queue, state->epoch);
  iree_hal_webgpu_queue_op_release_resources(state);
  iree_hal_semaphore_list_release(state->signal_semaphore_list);
  iree_allocator_free(state->allocator, state);
}
// Populates the common fields of an already-allocated queue state slab.
//
// The four offset parameters are byte offsets from the slab base to the
// trailing arrays: wait semaphores, wait payload values, signal semaphores,
// and signal payload values. Both semaphore lists are cloned into that
// trailing storage. The wait operation's semaphore array aliases the cloned
// wait list through a pointer cast, which is valid because
// iree_async_semaphore_t is at offset 0 in iree_hal_webgpu_semaphore_t. The
// wait operation base is initialized with the unified wait completion callback
// and no user data.
static void iree_hal_webgpu_queue_state_initialize(
    iree_hal_webgpu_queue_state_t* state,
    iree_hal_webgpu_queue_op_type_t op_type, iree_hal_webgpu_queue_t* queue,
    uint64_t epoch, const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_host_size_t wait_semaphores_offset,
    iree_host_size_t wait_values_offset,
    iree_host_size_t signal_semaphores_offset,
    iree_host_size_t signal_values_offset, iree_allocator_t allocator) {
  // Bookkeeping shared by every op type.
  state->allocator = allocator;
  state->queue = queue;
  state->epoch = epoch;
  state->op_type = op_type;

  // Carve the wait list out of the trailing storage and clone into it.
  const iree_host_size_t wait_count = wait_semaphore_list.count;
  iree_hal_semaphore_list_t cloned_waits =
      iree_hal_webgpu_semaphore_list_at_offsets(
          state, wait_count, wait_semaphores_offset, wait_values_offset);
  iree_hal_webgpu_semaphore_list_clone_into(wait_semaphore_list, cloned_waits);
  state->wait_semaphore_list = cloned_waits;

  // The wait operation reads the very same arrays as the cloned wait list.
  state->wait_operation.semaphores =
      (iree_async_semaphore_t**)cloned_waits.semaphores;
  state->wait_operation.values = cloned_waits.payload_values;
  state->wait_operation.count = wait_count;
  state->wait_operation.mode = IREE_ASYNC_WAIT_MODE_ALL;
  state->wait_operation.satisfied_index = 0;

  // Carve the signal list out of the trailing storage and clone into it.
  iree_hal_semaphore_list_t cloned_signals =
      iree_hal_webgpu_semaphore_list_at_offsets(
          state, signal_semaphore_list.count, signal_semaphores_offset,
          signal_values_offset);
  iree_hal_webgpu_semaphore_list_clone_into(signal_semaphore_list,
                                            cloned_signals);
  state->signal_semaphore_list = cloned_signals;

  // Route wait completion through the unified callback.
  iree_async_operation_initialize(
      &state->wait_operation.base, IREE_ASYNC_OPERATION_TYPE_SEMAPHORE_WAIT,
      IREE_ASYNC_OPERATION_FLAG_NONE, iree_hal_webgpu_queue_op_wait_completion,
      /*user_data=*/NULL);
}
// Allocates a queue state slab for an async queue operation and fills in its
// common fields.
//
// Steps, in order:
//   1. Reserve the next queue epoch.
//   2. Compute the slab layout: the base struct followed by four trailing
//      arrays (wait semaphores, wait values, signal semaphores, signal
//      values).
//   3. Optionally append |trailing_count| elements of |trailing_element_size|
//      bytes for per-op data (dispatch bindings, execute binding table). When
//      |trailing_count| is non-zero and |out_trailing_offset| is non-NULL it
//      receives the byte offset of that data from the slab base. Pass 0 for
//      both counts when no extra trailing data is needed.
//   4. Allocate the slab from the queue's host allocator and initialize it.
//
// On any failure the signal semaphores are failed with a clone of the error,
// |*out_state| stays NULL, and the error is returned.
static iree_status_t iree_hal_webgpu_queue_state_allocate(
    iree_hal_webgpu_queue_t* queue, iree_hal_webgpu_queue_op_type_t op_type,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_host_size_t trailing_count, iree_host_size_t trailing_element_size,
    iree_host_size_t* out_trailing_offset,
    iree_hal_webgpu_queue_state_t** out_state) {
  *out_state = NULL;

  const uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);

  // Layout of the base struct plus the four semaphore arrays.
  iree_host_size_t slab_size = 0;
  iree_host_size_t wait_semaphores_offset = 0;
  iree_host_size_t wait_values_offset = 0;
  iree_host_size_t signal_semaphores_offset = 0;
  iree_host_size_t signal_values_offset = 0;
  iree_status_t status = IREE_STRUCT_LAYOUT(
      sizeof(iree_hal_webgpu_queue_state_t), &slab_size,
      IREE_STRUCT_FIELD(wait_semaphore_list.count, iree_async_semaphore_t*,
                        &wait_semaphores_offset),
      IREE_STRUCT_FIELD(wait_semaphore_list.count, uint64_t,
                        &wait_values_offset),
      IREE_STRUCT_FIELD(signal_semaphore_list.count, iree_hal_semaphore_t*,
                        &signal_semaphores_offset),
      IREE_STRUCT_FIELD(signal_semaphore_list.count, uint64_t,
                        &signal_values_offset));

  // Per-op trailing data goes after the semaphore arrays. STRUCT_LAYOUT
  // produces a max_align_t-aligned size, which satisfies alignment for all
  // per-op element types (they contain pointers and device_size_t fields,
  // both <= max_align_t).
  if (iree_status_is_ok(status) && trailing_count > 0) {
    if (out_trailing_offset) *out_trailing_offset = slab_size;
    iree_host_size_t trailing_bytes = 0;
    const bool size_fits =
        iree_host_size_checked_mul(trailing_count, trailing_element_size,
                                   &trailing_bytes) &&
        iree_host_size_checked_add(slab_size, trailing_bytes, &slab_size);
    if (!size_fits) {
      status = iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                                "trailing allocation size overflow");
    }
  }

  // Only attempt the allocation once the layout is known to be valid.
  iree_hal_webgpu_queue_state_t* state = NULL;
  if (iree_status_is_ok(status)) {
    status = iree_allocator_malloc(queue->host_allocator, slab_size,
                                   (void**)&state);
  }

  // Layout and allocation failures share one exit: fail signals and bail.
  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  iree_hal_webgpu_queue_state_initialize(
      state, op_type, queue, epoch, wait_semaphore_list, signal_semaphore_list,
      wait_semaphores_offset, wait_values_offset, signal_semaphores_offset,
      signal_values_offset, queue->host_allocator);
  *out_state = state;
  return iree_ok_status();
}
// Hands the state's wait operation to the queue's proactor and returns the
// submission status.
//
// If submission fails, |state| is passed to
// iree_hal_webgpu_queue_state_submit_failed for cleanup (fails signals,
// releases resources and semaphore lists, frees the slab), so callers must not
// touch |state| after a non-OK return.
// NOTE(review): the same status value is returned after being handed to the
// failure helper; this presumes the helper does not consume it — confirm
// against its definition.
static iree_status_t iree_hal_webgpu_queue_state_submit(
    iree_hal_webgpu_queue_state_t* state) {
  iree_async_proactor_t* proactor = state->queue->proactor;
  iree_status_t status =
      iree_async_proactor_submit_one(proactor, &state->wait_operation.base);
  if (iree_status_is_ok(status)) return status;
  iree_hal_webgpu_queue_state_submit_failed(state, status);
  return status;
}
//===----------------------------------------------------------------------===//
// FIFO wait elision
//===----------------------------------------------------------------------===//
// Reports whether every wait in |wait_semaphore_list| can be skipped because
// this queue has already submitted (though possibly not yet completed) a
// signal covering it. Each semaphore is asked whether it has a submitted
// signal from this queue's axis for the awaited payload value; GPU FIFO
// ordering then guarantees the wait will be satisfied. Scanning stops at the
// first wait that is not covered. An empty list yields true.
static bool iree_hal_webgpu_queue_can_elide_waits(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list) {
  const iree_host_size_t wait_count = wait_semaphore_list.count;
  iree_host_size_t covered = 0;
  while (covered < wait_count &&
         iree_hal_webgpu_semaphore_has_submitted_signal(
             wait_semaphore_list.semaphores[covered], queue->axis,
             wait_semaphore_list.payload_values[covered])) {
    ++covered;
  }
  return covered == wait_count;
}
// Records on each semaphore in |signal_semaphore_list| that this queue (by
// axis) has a pending submitted signal for the listed payload value. Called
// after GPU work is submitted in fast-path queue operations so that later
// operations on the same queue can use FIFO wait elision.
static void iree_hal_webgpu_queue_mark_signals_submitted(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t signal_semaphore_list) {
  const iree_host_size_t signal_count = signal_semaphore_list.count;
  iree_host_size_t index = 0;
  while (index < signal_count) {
    iree_hal_webgpu_semaphore_mark_submitted_signal(
        signal_semaphore_list.semaphores[index], queue->axis,
        signal_semaphore_list.payload_values[index]);
    ++index;
  }
}
//===----------------------------------------------------------------------===//
// queue_alloca / queue_dealloca
//===----------------------------------------------------------------------===//
// Queue-ordered buffer allocation.
//
// The requested parameters are first validated and coerced by
// |device_allocator|; a stub buffer (no GPU handle yet) is then created with
// the coerced parameters so that it carries the correct memory type and usage
// for when buffer_bind creates the actual GPU buffer.
//
// Fast path (no waits, waits already reached, or waits FIFO-elidable): the
// stub is bound to a GPU buffer immediately, the signal semaphores are
// signaled, and the buffer is returned.
//
// Async path: the stub is returned to the caller right away and a wait is
// submitted; the wait completion callback performs the bind.
//
// On every failure path the signal semaphores are failed and |*out_buffer| is
// NULL on return. |pool| and |flags| are not consulted by this implementation.
iree_status_t iree_hal_webgpu_queue_alloca(
    iree_hal_webgpu_queue_t* queue, iree_hal_allocator_t* device_allocator,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_pool_t* pool, iree_hal_buffer_params_t params,
    iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
  *out_buffer = NULL;

  // Let the allocator validate and coerce the request.
  iree_hal_buffer_params_t compat_params;
  iree_device_size_t compat_allocation_size = allocation_size;
  const iree_hal_buffer_compatibility_t compatibility =
      iree_hal_allocator_query_buffer_compatibility(
          device_allocator, params, allocation_size, &compat_params,
          &compat_allocation_size);
  const bool allocatable = iree_all_bits_set(
      compatibility, IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE);
  if (!allocatable) {
    iree_status_t status = iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "allocator cannot serve the requested buffer parameters");
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  // Stub buffer (handle = 0) carrying the coerced parameters. The GPU buffer
  // is created later by buffer_bind, inline or in the async wait callback.
  iree_hal_buffer_placement_t placement = {
      .device = NULL,
      .queue_affinity = compat_params.queue_affinity,
      .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE,
  };
  iree_hal_buffer_t* stub = NULL;
  iree_status_t status = iree_hal_webgpu_buffer_create_stub(
      placement, compat_params.type, compat_params.access, compat_params.usage,
      compat_allocation_size, queue->host_allocator, &stub);
  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  const bool waits_resolved =
      wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list);
  if (waits_resolved) {
    // Fast path: bind, signal, return.
    const uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);
    status = iree_hal_webgpu_buffer_bind(stub, queue->device_handle);
    if (iree_status_is_ok(status)) {
      iree_async_single_frontier_t frontier_storage;
      const iree_async_frontier_t* frontier =
          iree_hal_webgpu_queue_build_frontier(queue, epoch, &frontier_storage);
      status = iree_hal_semaphore_list_signal(signal_semaphore_list, frontier);
    }
    if (!iree_status_is_ok(status)) {
      iree_hal_semaphore_list_fail(signal_semaphore_list,
                                   iree_status_clone(status));
      iree_hal_webgpu_queue_advance_tracker(queue, epoch);
      iree_hal_buffer_release(stub);
      return status;
    }
    iree_hal_webgpu_queue_advance_tracker(queue, epoch);
    *out_buffer = stub;
    return status;
  }

  // Async path: the wait completion callback binds the GPU buffer.
  iree_hal_webgpu_queue_state_t* state = NULL;
  status = iree_hal_webgpu_queue_state_allocate(
      queue, IREE_HAL_WEBGPU_QUEUE_OP_ALLOCA, wait_semaphore_list,
      signal_semaphore_list, /*trailing_count=*/0,
      /*trailing_element_size=*/0, /*out_trailing_offset=*/NULL, &state);
  if (!iree_status_is_ok(status)) {
    iree_hal_buffer_release(stub);
    return status;
  }

  // The slab takes its own reference; the caller holds the original one via
  // |*out_buffer|. The slab's reference is dropped by release_resources on
  // callback success or by the submit failure cleanup.
  iree_hal_buffer_retain(stub);
  state->alloca_op.buffer = stub;
  *out_buffer = stub;

  status = iree_hal_webgpu_queue_state_submit(state);
  if (iree_status_is_ok(status)) return status;

  // Submission failed: drop the caller's reference too; the buffer is
  // unusable.
  iree_hal_buffer_release(stub);
  *out_buffer = NULL;
  return status;
}
// Queue-ordered buffer deallocation.
//
// When the waits still need resolving, |buffer| is retained in a queue state
// slab and a wait is submitted; the unbind happens in the wait completion
// callback. Otherwise (no waits, waits already reached, or waits
// FIFO-elidable) the buffer is unbound immediately, the signal semaphores are
// signaled, and the result of that signal is returned. |flags| is not
// consulted by this implementation.
iree_status_t iree_hal_webgpu_queue_dealloca(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_buffer_t* buffer, iree_hal_dealloca_flags_t flags) {
  const bool waits_resolved =
      wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list);

  if (!waits_resolved) {
    // Async path: keep the buffer alive until the callback unbinds it.
    iree_hal_webgpu_queue_state_t* state = NULL;
    IREE_RETURN_IF_ERROR(iree_hal_webgpu_queue_state_allocate(
        queue, IREE_HAL_WEBGPU_QUEUE_OP_DEALLOCA, wait_semaphore_list,
        signal_semaphore_list, /*trailing_count=*/0,
        /*trailing_element_size=*/0, /*out_trailing_offset=*/NULL, &state));
    iree_hal_buffer_retain(buffer);
    state->dealloca.buffer = buffer;
    return iree_hal_webgpu_queue_state_submit(state);
  }

  // Fast path: unbind and signal inline.
  const uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);
  iree_hal_webgpu_buffer_unbind(buffer);
  iree_async_single_frontier_t frontier_storage;
  const iree_async_frontier_t* frontier =
      iree_hal_webgpu_queue_build_frontier(queue, epoch, &frontier_storage);
  iree_status_t signal_status =
      iree_hal_semaphore_list_signal(signal_semaphore_list, frontier);
  iree_hal_webgpu_queue_advance_tracker(queue, epoch);
  return signal_status;
}
//===----------------------------------------------------------------------===//
// Queue operations (scratch builder)
//===----------------------------------------------------------------------===//
// Queue-ordered buffer fill.
//
// Fast path (no waits, waits already reached, or waits FIFO-elidable): the
// fill is recorded into the queue's scratch builder and submitted
// synchronously through iree_hal_webgpu_queue_submit_scratch_and_signal.
//
// Async path: the fill parameters (including a copy of the pattern bytes) are
// captured in a queue state slab, |target_buffer| is retained, and a wait is
// submitted. The callback does the scratch build + GPU execute and registers
// an embedded signal for completion.
//
// On the async path a |pattern_length| larger than the slab's inline pattern
// storage is rejected with INVALID_ARGUMENT and the signal semaphores are
// failed. |flags| is not consulted by this implementation.
iree_status_t iree_hal_webgpu_queue_fill(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
    iree_device_size_t length, const void* pattern,
    iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
  // Fast path: waits satisfied or FIFO-elided — execute synchronously.
  if (wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list)) {
    iree_status_t status =
        iree_hal_webgpu_builder_reset(&queue->scratch_builder);
    if (iree_status_is_ok(status)) {
      iree_hal_buffer_ref_t target_ref =
          iree_hal_make_buffer_ref(target_buffer, target_offset, length);
      status = iree_hal_webgpu_builder_fill_buffer(
          &queue->scratch_builder, target_ref, pattern, pattern_length);
    }
    return iree_hal_webgpu_queue_submit_scratch_and_signal(
        queue, signal_semaphore_list, status);
  }

  // Async path: capture params and submit wait.
  iree_hal_webgpu_queue_state_t* state = NULL;

  // The pattern is copied into fixed-size storage inside the slab. Validate
  // its length at runtime (not just with an assert, which may compile out)
  // and do so before anything is reserved or allocated. sizeof does not
  // evaluate its operand, so using the still-NULL |state| here is safe.
  if (pattern_length > sizeof(state->fill.pattern)) {
    iree_status_t status = iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "queue_fill pattern length %" PRIhsz " exceeds maximum %" PRIhsz,
        pattern_length, (iree_host_size_t)sizeof(state->fill.pattern));
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  IREE_RETURN_IF_ERROR(iree_hal_webgpu_queue_state_allocate(
      queue, IREE_HAL_WEBGPU_QUEUE_OP_FILL, wait_semaphore_list,
      signal_semaphore_list, /*trailing_count=*/0,
      /*trailing_element_size=*/0, /*out_trailing_offset=*/NULL, &state));
  iree_hal_buffer_retain(target_buffer);
  state->fill.target_buffer = target_buffer;
  state->fill.target_offset = target_offset;
  state->fill.length = length;
  memcpy(&state->fill.pattern, pattern, pattern_length);
  state->fill.pattern_length = pattern_length;
  return iree_hal_webgpu_queue_state_submit(state);
}
// Queue-ordered host-to-buffer update.
//
// Fast path (no waits, waits already reached, or waits FIFO-elidable): the
// update is recorded into the queue's scratch builder and submitted
// synchronously. The builder copies source data into the instruction stream
// blocks inline, so the caller's |source_buffer| is consumed before this
// function returns.
//
// Async path: the source bytes are copied into a block acquired from the
// queue's block pool (the caller's memory may be stack memory that becomes
// invalid after this function returns), |target_buffer| is retained, and a
// wait is submitted. |length| must fit in the usable portion of one pool
// block; otherwise INVALID_ARGUMENT is returned and the signal semaphores are
// failed. |flags| is not consulted by this implementation.
iree_status_t iree_hal_webgpu_queue_update(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    const void* source_buffer, iree_host_size_t source_offset,
    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
    iree_device_size_t length, iree_hal_update_flags_t flags) {
  // Fast path: waits satisfied or FIFO-elided — execute synchronously.
  if (wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list)) {
    iree_status_t status =
        iree_hal_webgpu_builder_reset(&queue->scratch_builder);
    if (iree_status_is_ok(status)) {
      iree_hal_buffer_ref_t target_ref =
          iree_hal_make_buffer_ref(target_buffer, target_offset, length);
      status = iree_hal_webgpu_builder_update_buffer(
          &queue->scratch_builder, source_buffer, source_offset, target_ref);
    }
    return iree_hal_webgpu_queue_submit_scratch_and_signal(
        queue, signal_semaphore_list, status);
  }

  // Async path: validate that the data fits in a block before acquiring one.
  // The limit is the usable block size: the pool's total block size also
  // covers the block's own bookkeeping footer, so copying up to the total
  // size would write over that footer.
  const iree_host_size_t capture_capacity =
      queue->block_pool.usable_block_size;
  if (length > capture_capacity) {
    iree_status_t status = iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "queue_update length %" PRIdsz " exceeds block capacity %" PRIhsz,
        length, capture_capacity);
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  // Acquire a block and snapshot the caller's data into it.
  iree_arena_block_t* captured_block = NULL;
  void* captured_data = NULL;
  {
    iree_status_t status = iree_arena_block_pool_acquire(
        &queue->block_pool, &captured_block, &captured_data);
    if (!iree_status_is_ok(status)) {
      iree_hal_semaphore_list_fail(signal_semaphore_list,
                                   iree_status_clone(status));
      return status;
    }
  }
  memcpy(captured_data, (const uint8_t*)source_buffer + source_offset,
         (size_t)length);

  iree_hal_webgpu_queue_state_t* state = NULL;
  iree_status_t status = iree_hal_webgpu_queue_state_allocate(
      queue, IREE_HAL_WEBGPU_QUEUE_OP_UPDATE, wait_semaphore_list,
      signal_semaphore_list, /*trailing_count=*/0,
      /*trailing_element_size=*/0, /*out_trailing_offset=*/NULL, &state);
  if (!iree_status_is_ok(status)) {
    // No slab owns the block yet, so return it to the pool here.
    iree_arena_block_pool_release(&queue->block_pool, captured_block,
                                  captured_block);
    return status;
  }

  iree_hal_buffer_retain(target_buffer);
  state->update.target_buffer = target_buffer;
  state->update.target_offset = target_offset;
  state->update.length = length;
  state->update.captured_data = captured_data;
  state->update.captured_block = captured_block;
  return iree_hal_webgpu_queue_state_submit(state);
}
// Queue-ordered buffer-to-buffer copy.
//
// When the waits still need resolving, both buffers are retained in a queue
// state slab together with the copy parameters and a wait is submitted.
// Otherwise (no waits, waits already reached, or waits FIFO-elidable) the copy
// is recorded into the queue's scratch builder and submitted synchronously
// through iree_hal_webgpu_queue_submit_scratch_and_signal. |flags| is not
// consulted by this implementation.
iree_status_t iree_hal_webgpu_queue_copy(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
    iree_device_size_t length, iree_hal_copy_flags_t flags) {
  const bool waits_resolved =
      wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list);

  if (!waits_resolved) {
    // Async path: capture params (retain both buffers) and submit wait.
    iree_hal_webgpu_queue_state_t* state = NULL;
    IREE_RETURN_IF_ERROR(iree_hal_webgpu_queue_state_allocate(
        queue, IREE_HAL_WEBGPU_QUEUE_OP_COPY, wait_semaphore_list,
        signal_semaphore_list, /*trailing_count=*/0,
        /*trailing_element_size=*/0, /*out_trailing_offset=*/NULL, &state));
    iree_hal_buffer_retain(source_buffer);
    state->copy.source_buffer = source_buffer;
    state->copy.source_offset = source_offset;
    iree_hal_buffer_retain(target_buffer);
    state->copy.target_buffer = target_buffer;
    state->copy.target_offset = target_offset;
    state->copy.length = length;
    return iree_hal_webgpu_queue_state_submit(state);
  }

  // Fast path: record into the scratch builder and submit synchronously.
  iree_status_t build_status =
      iree_hal_webgpu_builder_reset(&queue->scratch_builder);
  if (iree_status_is_ok(build_status)) {
    iree_hal_buffer_ref_t source_ref =
        iree_hal_make_buffer_ref(source_buffer, source_offset, length);
    iree_hal_buffer_ref_t target_ref =
        iree_hal_make_buffer_ref(target_buffer, target_offset, length);
    build_status = iree_hal_webgpu_builder_copy_buffer(
        &queue->scratch_builder, source_ref, target_ref);
  }
  return iree_hal_webgpu_queue_submit_scratch_and_signal(
      queue, signal_semaphore_list, build_status);
}
// Performs a file-to-GPU transfer immediately (no wait involved).
//
// The data source is chosen by |storage|: when non-NULL it is mapped for read
// and uploaded from the host pointer; when NULL the file descriptor of
// |source_file| is passed to the zero-copy bridge import (the import_file
// dispatch guarantees that files with NULL storage are webgpu_fd_files).
//
// Afterwards an epoch is reserved and, if the transfer step succeeded, signal
// completion is registered for the GPU queue submission. On success the signal
// semaphores are marked as submitted by this queue; on failure they are failed
// and the tracker is advanced past the reserved epoch.
static iree_status_t iree_hal_webgpu_queue_read_inline(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_file_t* source_file, uint64_t source_offset,
    iree_hal_buffer_t* storage, iree_hal_buffer_t* target_buffer,
    iree_device_size_t target_offset, iree_device_size_t length) {
  // Resolve the destination GPU buffer and absolute offset within it.
  const iree_hal_webgpu_handle_t target_handle = iree_hal_webgpu_buffer_handle(
      iree_hal_buffer_allocated_buffer(target_buffer));
  const uint64_t target_gpu_offset =
      iree_hal_buffer_byte_offset(target_buffer) + target_offset;

  iree_status_t status = iree_ok_status();
  if (!storage) {
    // FD: zero-copy bridge import straight from the file.
    const int fd = iree_hal_webgpu_fd_file_fd(source_file);
    iree_hal_webgpu_import_queue_write_buffer_from_file(
        queue->queue_handle, target_handle, target_gpu_offset, (uint32_t)fd,
        source_offset, length);
  } else {
    // HOST_LOCAL: map the storage buffer and upload from the host pointer.
    iree_hal_buffer_mapping_t mapping = {{0}};
    status = iree_hal_buffer_map_range(storage, IREE_HAL_MAPPING_MODE_SCOPED,
                                       IREE_HAL_MEMORY_ACCESS_READ,
                                       source_offset, length, &mapping);
    if (iree_status_is_ok(status)) {
      iree_hal_webgpu_import_queue_write_buffer(
          queue->queue_handle, target_handle, target_gpu_offset,
          (uint32_t)(uintptr_t)mapping.contents.data, length);
      iree_hal_buffer_unmap_range(&mapping);
    }
  }

  // Register onSubmittedWorkDone for signal completion.
  const uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);
  if (iree_status_is_ok(status)) {
    status = iree_hal_webgpu_queue_register_signal_completion(
        queue, epoch, signal_semaphore_list);
  }

  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    iree_hal_webgpu_queue_advance_tracker(queue, epoch);
    return status;
  }
  iree_hal_webgpu_queue_mark_signals_submitted(queue, signal_semaphore_list);
  return status;
}
// Queue-ordered file-to-buffer read.
//
// The requested range [source_offset, source_offset + length) must lie within
// the file; otherwise OUT_OF_RANGE is returned and, like the other validation
// failures in this file, the signal semaphores are failed. The range check is
// written so that it cannot wrap when source_offset + length exceeds 64 bits.
//
// Fast path (no waits, waits already reached, or waits FIFO-elidable): the
// transfer is performed inline via iree_hal_webgpu_queue_read_inline.
//
// Async path: the parameters are captured in a queue state slab and a wait is
// submitted. The slab retains |target_buffer| and either the file's storage
// buffer (when it has one) or the file itself. |flags| is not consulted by
// this implementation.
iree_status_t iree_hal_webgpu_queue_read(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_file_t* source_file, uint64_t source_offset,
    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
    iree_device_size_t length, iree_hal_read_flags_t flags) {
  // Overflow-safe bounds check: compare against the remaining room instead of
  // forming source_offset + length, which could wrap around.
  const uint64_t file_length = iree_hal_file_length(source_file);
  if ((uint64_t)length > file_length ||
      source_offset > file_length - (uint64_t)length) {
    iree_status_t status = iree_make_status(
        IREE_STATUS_OUT_OF_RANGE,
        "read of %" PRIu64 " bytes at offset %" PRIu64
        " exceeds file length %" PRIu64,
        (uint64_t)length, source_offset, file_length);
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  // Determine the data source: HOST_LOCAL storage buffer or FD.
  iree_hal_buffer_t* storage = iree_hal_file_storage_buffer(source_file);

  // Fast path: waits satisfied or FIFO-elided — transfer inline.
  if (wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list)) {
    return iree_hal_webgpu_queue_read_inline(
        queue, signal_semaphore_list, source_file, source_offset, storage,
        target_buffer, target_offset, length);
  }

  // Async path: capture params and submit wait.
  iree_hal_webgpu_queue_state_t* state = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_queue_state_allocate(
      queue, IREE_HAL_WEBGPU_QUEUE_OP_READ, wait_semaphore_list,
      signal_semaphore_list, /*trailing_count=*/0,
      /*trailing_element_size=*/0, /*out_trailing_offset=*/NULL, &state));
  if (storage) {
    // Only the storage buffer is needed later; keep that alive.
    iree_hal_buffer_retain(storage);
    state->read.storage = storage;
    state->read.source_file = NULL;
  } else {
    // No storage buffer: the file itself must outlive the wait.
    state->read.storage = NULL;
    iree_hal_file_retain(source_file);
    state->read.source_file = source_file;
  }
  state->read.source_offset = source_offset;
  iree_hal_buffer_retain(target_buffer);
  state->read.target_buffer = target_buffer;
  state->read.target_offset = target_offset;
  state->read.length = length;
  return iree_hal_webgpu_queue_state_submit(state);
}
//===----------------------------------------------------------------------===//
// queue_write (GPU → file) — three-phase async
//===----------------------------------------------------------------------===//
//
// GPU readback is always async in WebGPU (no synchronous map for non-mappable
// buffers). The three phases are:
//
// Phase 1 (waits satisfied): Create MAP_READ|COPY_DST staging buffer, encode
// copyBufferToBuffer(source → staging), submit, register
// onSubmittedWorkDone.
//
// Phase 2 (copy complete): mapAsync(staging, MAP_READ), register map
// completion via proactor.
//
// Phase 3 (map complete): Read data from staging into file (HOST_LOCAL:
// buffer_get_mapped_range → host ptr; FD: file_write_from_mapped). Unmap
// and destroy staging, signal semaphores, advance tracker, free state.
// Per-operation state for a queue_write (GPU → file) that is carried across
// the wait callback and the three phase callbacks. Each callback recovers this
// struct from the address of the embedded operation it was invoked with, so
// the embedded operations must stay members of this struct.
typedef struct iree_hal_webgpu_queue_write_state_t {
// Phase 0: wait for input semaphores (if async path).
// Must remain the first member: the wait completion callback casts the
// operation pointer it receives directly to a pointer to this struct.
iree_async_semaphore_wait_operation_t wait_operation;
// Phase 1→2: onSubmittedWorkDone after staging copy.
// Phase 2 recovers the state from this member's address via offsetof.
iree_async_operation_t copy_completion;
// Phase 2→3: mapAsync on staging buffer.
// Phase 3 recovers the state from this member's address via offsetof.
iree_async_operation_t map_completion;
// Queue that owns the operation; provides device/queue handles and proactor.
iree_hal_webgpu_queue_t* queue;
// Epoch used to build the signal frontier and to advance the tracker.
uint64_t epoch;
// Released by the wait completion callback (which then zeroes the count) or
// by the failure cleanup.
iree_hal_semaphore_list_t wait_semaphore_list;
// Signaled at the end of Phase 3, or failed by the failure cleanup.
iree_hal_semaphore_list_t signal_semaphore_list;
// Source GPU buffer and range.
// Phase 1 releases the buffer and sets this to NULL after submitting the copy.
iree_hal_buffer_t* source_buffer; // Retained until Phase 1 submits.
iree_device_size_t source_offset;
// Byte count of the transfer; also the size of the staging buffer.
iree_device_size_t length;
// Target: HOST_LOCAL storage buffer (from file's storage_buffer()) or FD.
iree_hal_buffer_t* target_storage; // Retained if non-NULL.
iree_hal_file_t* target_file; // Retained if target_storage NULL.
uint64_t target_offset;
// MAP_READ|COPY_DST staging buffer created in Phase 1.
// Zero means no staging buffer is currently held.
iree_hal_webgpu_handle_t staging_handle;
// Allocator used to free this state.
iree_allocator_t allocator;
// Trailing: semaphore arrays for wait and signal lists.
} iree_hal_webgpu_queue_write_state_t;
// Forward declarations for the three phase callbacks.
// Phase 1 is called directly with the state. Phases 2 and 3 use the async
// completion callback signature and are installed on the state's embedded
// operations, so they receive the operation pointer and a completion status.
static void iree_hal_webgpu_queue_write_phase1(
iree_hal_webgpu_queue_write_state_t* state);
static void iree_hal_webgpu_queue_write_phase2(
void* user_data, iree_async_operation_t* base_operation,
iree_status_t status, iree_async_completion_flags_t flags);
static void iree_hal_webgpu_queue_write_phase3(
void* user_data, iree_async_operation_t* base_operation,
iree_status_t status, iree_async_completion_flags_t flags);
// Terminal failure path for a queue_write state.
//
// Hands |error| to the signal semaphores, advances the tracker past the
// state's epoch, drops whichever of the source buffer, target storage, and
// target file are still held, destroys the staging buffer if one exists,
// releases both semaphore lists, and frees the state. |state| must not be
// used after this returns.
static void iree_hal_webgpu_queue_write_state_fail(
    iree_hal_webgpu_queue_write_state_t* state, iree_status_t error) {
  // Propagate the failure first, then unblock the tracker.
  iree_hal_semaphore_list_fail(state->signal_semaphore_list, error);
  iree_hal_webgpu_queue_advance_tracker(state->queue, state->epoch);

  // Drop any resources this state still holds.
  if (state->source_buffer != NULL) {
    iree_hal_buffer_release(state->source_buffer);
  }
  if (state->target_storage != NULL) {
    iree_hal_buffer_release(state->target_storage);
  }
  if (state->target_file != NULL) {
    iree_hal_file_release(state->target_file);
  }
  if (state->staging_handle != 0) {
    iree_hal_webgpu_import_buffer_destroy(state->staging_handle);
  }

  // Release the semaphore lists and finally the state itself.
  iree_hal_semaphore_list_release(state->wait_semaphore_list);
  iree_hal_semaphore_list_release(state->signal_semaphore_list);
  iree_allocator_free(state->allocator, state);
}
// Phase 1 of queue_write: stage the source range for readback.
//
// Creates a MAP_READ | COPY_DST staging buffer of the transfer length, encodes
// and submits a copy from the source buffer into it, drops the source buffer
// reference, and registers an onSubmittedWorkDone notification whose
// completion enters Phase 2. Any failure routes through
// iree_hal_webgpu_queue_write_state_fail.
//
// Called either directly (fast path, waits already satisfied) or from the
// wait completion callback (async path).
static void iree_hal_webgpu_queue_write_phase1(
    iree_hal_webgpu_queue_write_state_t* state) {
  iree_hal_webgpu_queue_t* queue = state->queue;

  // Staging buffer that the GPU copies into and the host later maps.
  const iree_hal_webgpu_handle_t staging =
      iree_hal_webgpu_import_device_create_buffer(
          queue->device_handle,
          IREE_HAL_WEBGPU_BUFFER_USAGE_MAP_READ |
              IREE_HAL_WEBGPU_BUFFER_USAGE_COPY_DST,
          state->length, /*mapped_at_creation=*/0);
  state->staging_handle = staging;
  if (staging == 0) {
    iree_hal_webgpu_queue_write_state_fail(
        state, iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
                                "failed to create staging buffer for "
                                "queue_write (%" PRIdsz " bytes)",
                                state->length));
    return;
  }

  // Resolve the source GPU buffer and the absolute offset within it.
  iree_hal_buffer_t* source = state->source_buffer;
  const iree_hal_webgpu_handle_t source_handle = iree_hal_webgpu_buffer_handle(
      iree_hal_buffer_allocated_buffer(source));
  const uint64_t source_gpu_offset =
      iree_hal_buffer_byte_offset(source) + state->source_offset;

  // Encode copyBufferToBuffer(source → staging) and finish the encoder.
  const uint32_t encoder_handle =
      iree_hal_webgpu_import_device_create_command_encoder(
          queue->device_handle);
  iree_hal_webgpu_import_encoder_copy_buffer_to_buffer(
      encoder_handle, source_handle, source_gpu_offset, staging,
      /*dst_offset=*/0, state->length);
  const uint32_t command_buffer_handle =
      iree_hal_webgpu_import_encoder_finish(encoder_handle);

  // Submit, then let go of the source buffer (GPU has its own references).
  iree_hal_webgpu_import_queue_submit(queue->queue_handle,
                                      command_buffer_handle);
  iree_hal_buffer_release(source);
  state->source_buffer = NULL;

  // Arrange for Phase 2 to run when the submitted work is done.
  iree_async_operation_initialize(
      &state->copy_completion, IREE_ASYNC_OPERATION_TYPE_NOP,
      IREE_ASYNC_OPERATION_FLAG_NONE, iree_hal_webgpu_queue_write_phase2,
      /*user_data=*/NULL);
  uint32_t token = UINT32_MAX;
  iree_status_t status = iree_async_proactor_js_submit_external(
      queue->proactor, &state->copy_completion, &token);
  if (iree_status_is_ok(status)) {
    iree_hal_webgpu_import_queue_on_submitted_work_done(queue->queue_handle,
                                                        token);
    return;
  }
  iree_hal_webgpu_queue_write_state_fail(state, status);
}
// Phase 2 of queue_write: the staging copy has completed.
//
// Recovers the write state from the address of its embedded copy_completion
// operation. A failed completion status, or a failure to register the map
// completion with the proactor, routes through
// iree_hal_webgpu_queue_write_state_fail. Otherwise a mapAsync for reading is
// started on the whole staging buffer; its completion enters Phase 3.
// |user_data| and |flags| are unused.
static void iree_hal_webgpu_queue_write_phase2(
    void* user_data, iree_async_operation_t* base_operation,
    iree_status_t status, iree_async_completion_flags_t flags) {
  // Step back from the embedded operation to the start of the state.
  uint8_t* state_base =
      (uint8_t*)base_operation -
      offsetof(iree_hal_webgpu_queue_write_state_t, copy_completion);
  iree_hal_webgpu_queue_write_state_t* state =
      (iree_hal_webgpu_queue_write_state_t*)state_base;

  if (!iree_status_is_ok(status)) {
    iree_hal_webgpu_queue_write_state_fail(state, status);
    return;
  }

  // Register mapAsync completion → Phase 3.
  iree_async_operation_initialize(
      &state->map_completion, IREE_ASYNC_OPERATION_TYPE_NOP,
      IREE_ASYNC_OPERATION_FLAG_NONE, iree_hal_webgpu_queue_write_phase3,
      /*user_data=*/NULL);
  uint32_t token = UINT32_MAX;
  status = iree_async_proactor_js_submit_external(
      state->queue->proactor, &state->map_completion, &token);
  if (iree_status_is_ok(status)) {
    // Initiate mapAsync(staging, MAP_READ).
    iree_hal_webgpu_import_buffer_map_async(state->staging_handle,
                                            /*mode=*/1 /*MAP_READ*/,
                                            /*offset=*/0, state->length, token);
    return;
  }
  iree_hal_webgpu_queue_write_state_fail(state, status);
}
// Phase 3 of queue_write: the staging buffer is mapped.
//
// Recovers the write state from the address of its embedded map_completion
// operation. A failed completion status routes through
// iree_hal_webgpu_queue_write_state_fail. Otherwise the staged bytes are
// delivered to the target: into the mapped HOST_LOCAL storage buffer when one
// is present, or written to the target file's descriptor when not. The
// staging buffer is then unmapped and destroyed, the target references are
// dropped, the signal semaphores are signaled (or failed if mapping the
// target storage failed), the tracker is advanced, and the state is freed.
// |user_data| and |flags| are unused.
static void iree_hal_webgpu_queue_write_phase3(
    void* user_data, iree_async_operation_t* base_operation,
    iree_status_t status, iree_async_completion_flags_t flags) {
  // Step back from the embedded operation to the start of the state.
  uint8_t* state_base =
      (uint8_t*)base_operation -
      offsetof(iree_hal_webgpu_queue_write_state_t, map_completion);
  iree_hal_webgpu_queue_write_state_t* state =
      (iree_hal_webgpu_queue_write_state_t*)state_base;

  if (!iree_status_is_ok(status)) {
    iree_hal_webgpu_queue_write_state_fail(state, status);
    return;
  }

  // Deliver the staged bytes to the target.
  if (!state->target_storage) {
    // FD: write from mapped staging buffer directly to the file object.
    const int fd = iree_hal_webgpu_fd_file_fd(state->target_file);
    iree_hal_webgpu_import_file_write_from_mapped(
        state->staging_handle, /*buffer_offset=*/0, state->length, (uint32_t)fd,
        state->target_offset);
  } else {
    // HOST_LOCAL: map the target storage buffer and copy from staging.
    iree_hal_buffer_mapping_t mapping = {{0}};
    status = iree_hal_buffer_map_range(
        state->target_storage, IREE_HAL_MAPPING_MODE_SCOPED,
        IREE_HAL_MEMORY_ACCESS_WRITE, state->target_offset, state->length,
        &mapping);
    if (iree_status_is_ok(status)) {
      iree_hal_webgpu_import_buffer_get_mapped_range(
          state->staging_handle, /*offset=*/0, state->length,
          (uint32_t)(uintptr_t)mapping.contents.data);
      iree_hal_buffer_unmap_range(&mapping);
    }
  }

  // The staging buffer is no longer needed regardless of the outcome.
  iree_hal_webgpu_import_buffer_unmap(state->staging_handle);
  iree_hal_webgpu_import_buffer_destroy(state->staging_handle);
  state->staging_handle = 0;

  // Drop whichever target reference this state holds.
  if (state->target_storage != NULL) {
    iree_hal_buffer_release(state->target_storage);
    state->target_storage = NULL;
  }
  if (state->target_file != NULL) {
    iree_hal_file_release(state->target_file);
    state->target_file = NULL;
  }

  // Fail the signals if delivery failed; otherwise signal them.
  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(state->signal_semaphore_list, status);
  } else {
    iree_async_single_frontier_t frontier_storage;
    const iree_async_frontier_t* frontier =
        iree_hal_webgpu_queue_build_frontier(state->queue, state->epoch,
                                             &frontier_storage);
    iree_status_ignore(
        iree_hal_semaphore_list_signal(state->signal_semaphore_list, frontier));
  }

  iree_hal_webgpu_queue_advance_tracker(state->queue, state->epoch);
  iree_hal_semaphore_list_release(state->signal_semaphore_list);
  iree_allocator_free(state->allocator, state);
}
// Proactor callback for the async write path, invoked once the semaphore wait
// has resolved. The wait semaphore references are dropped first; the operation
// then either proceeds to Phase 1 or, if the wait failed, is torn down through
// the shared failure helper. |base_operation| is cast directly back to the
// write state. |user_data| and |flags| are unused.
static void iree_hal_webgpu_queue_write_wait_completion(
    void* user_data, iree_async_operation_t* base_operation,
    iree_status_t wait_status, iree_async_completion_flags_t flags) {
  iree_hal_webgpu_queue_write_state_t* write_state =
      (iree_hal_webgpu_queue_write_state_t*)base_operation;

  // The waits are no longer needed whichever way the wait resolved. Zeroing
  // the count keeps the failure helper from releasing them a second time.
  iree_hal_semaphore_list_release(write_state->wait_semaphore_list);
  write_state->wait_semaphore_list.count = 0;

  if (iree_status_is_ok(wait_status)) {
    iree_hal_webgpu_queue_write_phase1(write_state);
  } else {
    iree_hal_webgpu_queue_write_state_fail(write_state, wait_status);
  }
}
// Enqueues a write of |length| bytes from |source_buffer| (starting at
// |source_offset|) into |target_file| at |target_offset|.
//
// The requested range is validated against the file length up front; an
// out-of-range request returns IREE_STATUS_OUT_OF_RANGE without touching the
// semaphores. Otherwise a state slab is allocated that holds clones of both
// semaphore lists and retained references to the source buffer and the target
// (the file's storage buffer when it has one, else the file itself). If the
// waits are already satisfied or can be elided, Phase 1 starts immediately;
// otherwise a semaphore wait operation is submitted to the proactor and Phase 1
// runs from its completion callback. Layout, allocation, and submit failures
// fail the signal semaphores and are returned to the caller. |flags| is unused.
iree_status_t iree_hal_webgpu_queue_write(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
    iree_hal_file_t* target_file, uint64_t target_offset,
    iree_device_size_t length, iree_hal_write_flags_t flags) {
  // Validate the range without computing |target_offset + length| in the
  // comparison: that sum can wrap around in unsigned arithmetic and make an
  // out-of-range request look valid.
  const uint64_t file_length = iree_hal_file_length(target_file);
  if ((uint64_t)length > file_length ||
      target_offset > file_length - (uint64_t)length) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "write range [%" PRIu64 ", %" PRIu64
                            ") exceeds file length %" PRIu64,
                            target_offset, target_offset + length,
                            file_length);
  }

  // Determine the target type: HOST_LOCAL storage buffer or FD.
  iree_hal_buffer_t* target_storage = iree_hal_file_storage_buffer(target_file);

  // Allocate the write state slab with trailing semaphore arrays.
  uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);
  iree_host_size_t total_size = 0;
  iree_host_size_t wait_semaphores_offset = 0;
  iree_host_size_t wait_values_offset = 0;
  iree_host_size_t signal_semaphores_offset = 0;
  iree_host_size_t signal_values_offset = 0;
  iree_status_t status = IREE_STRUCT_LAYOUT(
      sizeof(iree_hal_webgpu_queue_write_state_t), &total_size,
      IREE_STRUCT_FIELD(wait_semaphore_list.count, iree_async_semaphore_t*,
                        &wait_semaphores_offset),
      IREE_STRUCT_FIELD(wait_semaphore_list.count, uint64_t,
                        &wait_values_offset),
      IREE_STRUCT_FIELD(signal_semaphore_list.count, iree_hal_semaphore_t*,
                        &signal_semaphores_offset),
      IREE_STRUCT_FIELD(signal_semaphore_list.count, uint64_t,
                        &signal_values_offset));
  if (!iree_status_is_ok(status)) {
    // The semaphores receive a clone; the original goes back to the caller.
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }
  iree_hal_webgpu_queue_write_state_t* state = NULL;
  status =
      iree_allocator_malloc(queue->host_allocator, total_size, (void**)&state);
  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }
  memset(state, 0, sizeof(*state));
  state->queue = queue;
  state->epoch = epoch;
  state->allocator = queue->host_allocator;

  // Clone semaphore lists into trailing slab.
  state->wait_semaphore_list = iree_hal_webgpu_semaphore_list_at_offsets(
      state, wait_semaphore_list.count, wait_semaphores_offset,
      wait_values_offset);
  iree_hal_webgpu_semaphore_list_clone_into(wait_semaphore_list,
                                            state->wait_semaphore_list);
  state->signal_semaphore_list = iree_hal_webgpu_semaphore_list_at_offsets(
      state, signal_semaphore_list.count, signal_semaphores_offset,
      signal_values_offset);
  iree_hal_webgpu_semaphore_list_clone_into(signal_semaphore_list,
                                            state->signal_semaphore_list);

  // Capture operation parameters. Exactly one of target_storage/target_file is
  // retained and recorded; the other stays NULL from the memset above.
  iree_hal_buffer_retain(source_buffer);
  state->source_buffer = source_buffer;
  state->source_offset = source_offset;
  state->length = length;
  state->target_offset = target_offset;
  if (target_storage) {
    iree_hal_buffer_retain(target_storage);
    state->target_storage = target_storage;
  } else {
    iree_hal_file_retain(target_file);
    state->target_file = target_file;
  }

  // Fast path: waits already satisfied — go directly to Phase 1.
  if (wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list)) {
    iree_hal_semaphore_list_release(state->wait_semaphore_list);
    state->wait_semaphore_list.count = 0;  // Prevent double-release in fail.
    iree_hal_webgpu_queue_write_phase1(state);
    return iree_ok_status();
  }

  // Async path: set up the wait operation and submit to proactor. The wait
  // operation reuses the cloned semaphore and value arrays in the slab.
  state->wait_operation.semaphores =
      (iree_async_semaphore_t**)state->wait_semaphore_list.semaphores;
  state->wait_operation.values = state->wait_semaphore_list.payload_values;
  state->wait_operation.count = wait_semaphore_list.count;
  state->wait_operation.mode = IREE_ASYNC_WAIT_MODE_ALL;
  state->wait_operation.satisfied_index = 0;
  iree_async_operation_initialize(
      &state->wait_operation.base, IREE_ASYNC_OPERATION_TYPE_SEMAPHORE_WAIT,
      IREE_ASYNC_OPERATION_FLAG_NONE,
      iree_hal_webgpu_queue_write_wait_completion, /*user_data=*/NULL);
  status = iree_async_proactor_submit_one(queue->proactor,
                                          &state->wait_operation.base);
  if (!iree_status_is_ok(status)) {
    // The failure helper receives a clone; the original is returned below.
    iree_hal_webgpu_queue_write_state_fail(state, iree_status_clone(status));
  }
  return status;
}
//===----------------------------------------------------------------------===//
// Host call (inline + proactor-driven async)
//===----------------------------------------------------------------------===//
// Runs a host call inline and translates its result into semaphore state.
//
// NON_BLOCKING calls: the signal semaphores are signaled with |frontier|
// before the callback is invoked, the callback is handed an empty signal list,
// and whatever it returns is discarded; the signal status is returned.
//
// Blocking calls, by callback result:
//   OK:       signal every signal semaphore with |frontier|.
//   DEFERRED: the callback has cloned the signal list and signals later; the
//             frontier is not used and OK is returned.
//   error:    fail every signal semaphore with a clone of the error and return
//             the error itself.
static iree_status_t iree_hal_webgpu_queue_execute_host_call(
    iree_hal_device_t* device, iree_hal_queue_affinity_t queue_affinity,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    const iree_async_frontier_t* frontier, iree_hal_host_call_t call,
    const uint64_t args[4], iree_hal_host_call_flags_t flags) {
  if ((flags & IREE_HAL_HOST_CALL_FLAG_NON_BLOCKING) != 0) {
    // Fire-and-forget: signal first, then invoke the callback with no view of
    // the signal list.
    iree_status_t signal_status =
        iree_hal_semaphore_list_signal(signal_semaphore_list, frontier);
    iree_hal_host_call_context_t detached_context = {
        .device = device,
        .queue_affinity = queue_affinity,
        .signal_semaphore_list = iree_hal_semaphore_list_empty(),
    };
    iree_status_ignore(call.fn(call.user_data, args, &detached_context));
    return signal_status;
  }

  iree_hal_host_call_context_t call_context = {
      .device = device,
      .queue_affinity = queue_affinity,
      .signal_semaphore_list = signal_semaphore_list,
  };
  iree_status_t call_status = call.fn(call.user_data, args, &call_context);

  if (iree_status_is_ok(call_status)) {
    return iree_hal_semaphore_list_signal(signal_semaphore_list, frontier);
  }

  if (iree_status_code(call_status) != IREE_STATUS_DEFERRED) {
    // Genuine error: propagate to every signal semaphore and to the caller.
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(call_status));
    return call_status;
  }

  // DEFERRED: signaling is now the callback's responsibility.
  iree_status_ignore(call_status);
  return iree_ok_status();
}
// State for a deferred host call that waits on input semaphores via the
// proactor before executing. Allocated as a single slab:
// [struct | wait iree_async_semaphore_t*[] | wait uint64_t[] |
// signal iree_hal_semaphore_t*[] | signal uint64_t[]]
typedef struct iree_hal_webgpu_host_call_state_t {
// Proactor wait operation. Must remain the first member: the completion
// callback casts the operation pointer it receives directly to this struct.
iree_async_semaphore_wait_operation_t wait_operation;
// Host function (and its user data) to invoke once the waits are satisfied.
iree_hal_host_call_t call;
// Argument values copied from the caller at submit time.
uint64_t args[4];
// Host call flags forwarded unchanged when the call is executed.
iree_hal_host_call_flags_t flags;
// Device passed through to the host call context.
iree_hal_device_t* device;
// Queue used to build the signal frontier and advance the tracker.
iree_hal_webgpu_queue_t* queue;
// Queue affinity passed through to the host call context.
iree_hal_queue_affinity_t queue_affinity;
uint64_t epoch; // Pre-incremented at submit for causal ordering.
// Cloned signal list; its arrays live in the trailing slab storage.
iree_hal_semaphore_list_t signal_semaphore_list;
// Cloned wait list; its arrays live in the trailing slab storage and are also
// referenced by |wait_operation|.
iree_hal_semaphore_list_t wait_semaphore_list;
// Allocator used to free this slab.
iree_allocator_t allocator;
} iree_hal_webgpu_host_call_state_t;
// Proactor callback for a deferred host call, fired once the wait operation
// resolves (all waits satisfied, or the wait failed). On failure the wait
// status is forwarded to the signal semaphores; on success the host call is
// executed, which takes care of signaling or failing them. In both cases the
// tracker is advanced for the reserved epoch and the state slab is torn down.
// |user_data| and |completion_flags| are unused.
static void iree_hal_webgpu_host_call_completion_fn(
    void* user_data, iree_async_operation_t* base_operation,
    iree_status_t wait_status, iree_async_completion_flags_t completion_flags) {
  // The wait operation is the first member of the state struct, so the
  // operation pointer is the state pointer.
  iree_hal_webgpu_host_call_state_t* call_state =
      (iree_hal_webgpu_host_call_state_t*)base_operation;

  if (!iree_status_is_ok(wait_status)) {
    // Semaphore failure propagation: the call never runs.
    iree_hal_semaphore_list_fail(call_state->signal_semaphore_list,
                                 wait_status);
  } else {
    iree_async_single_frontier_t frontier_storage;
    const iree_async_frontier_t* frontier =
        iree_hal_webgpu_queue_build_frontier(call_state->queue,
                                             call_state->epoch,
                                             &frontier_storage);
    // The result is ignored here: the helper has already reflected it onto the
    // signal semaphores.
    iree_status_ignore(iree_hal_webgpu_queue_execute_host_call(
        call_state->device, call_state->queue_affinity,
        call_state->signal_semaphore_list, frontier, call_state->call,
        call_state->args, call_state->flags));
  }

  iree_hal_webgpu_queue_advance_tracker(call_state->queue, call_state->epoch);

  // Drop the cloned semaphore references, then the slab itself.
  iree_hal_semaphore_list_release(call_state->signal_semaphore_list);
  iree_hal_semaphore_list_release(call_state->wait_semaphore_list);
  iree_allocator_free(call_state->allocator, call_state);
}
// Enqueues a host call on |queue|.
//
// When there is nothing to wait on (or every wait already polls as satisfied)
// the call runs inline under a freshly reserved epoch. Otherwise a state slab
// is allocated carrying the call parameters plus clones of both semaphore
// lists, and a semaphore wait operation is submitted to the proactor; the call
// then runs from the wait completion callback. Layout, allocation, and submit
// failures fail the signal semaphores with a clone of the error and return the
// error to the caller.
//
// NOTE(review): on the async failure paths below the reserved epoch is never
// passed to iree_hal_webgpu_queue_advance_tracker, unlike the inline path and
// the completion callback — confirm that is intended.
iree_status_t iree_hal_webgpu_queue_host_call(
    iree_hal_webgpu_queue_t* queue, iree_hal_device_t* device,
    iree_hal_queue_affinity_t queue_affinity,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_host_call_t call, const uint64_t args[4],
    iree_hal_host_call_flags_t flags) {
  // NO FIFO elision — host calls execute on the CPU, not the GPU FIFO.
  const bool waits_satisfied =
      wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list);

  // Both paths reserve exactly one epoch after the readiness check.
  const uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);

  if (waits_satisfied) {
    // Inline path: build the frontier, run the call, advance the tracker.
    iree_async_single_frontier_t frontier_storage;
    const iree_async_frontier_t* frontier =
        iree_hal_webgpu_queue_build_frontier(queue, epoch, &frontier_storage);
    iree_status_t inline_status = iree_hal_webgpu_queue_execute_host_call(
        device, queue_affinity, signal_semaphore_list, frontier, call, args,
        flags);
    iree_hal_webgpu_queue_advance_tracker(queue, epoch);
    return inline_status;
  }

  // Deferred path: compute the slab layout (struct followed by the wait and
  // signal semaphore/value arrays).
  iree_host_size_t slab_size = 0;
  iree_host_size_t wait_ptrs_offset = 0;
  iree_host_size_t wait_vals_offset = 0;
  iree_host_size_t signal_ptrs_offset = 0;
  iree_host_size_t signal_vals_offset = 0;
  iree_status_t status = IREE_STRUCT_LAYOUT(
      sizeof(iree_hal_webgpu_host_call_state_t), &slab_size,
      IREE_STRUCT_FIELD(wait_semaphore_list.count, iree_async_semaphore_t*,
                        &wait_ptrs_offset),
      IREE_STRUCT_FIELD(wait_semaphore_list.count, uint64_t,
                        &wait_vals_offset),
      IREE_STRUCT_FIELD(signal_semaphore_list.count, iree_hal_semaphore_t*,
                        &signal_ptrs_offset),
      IREE_STRUCT_FIELD(signal_semaphore_list.count, uint64_t,
                        &signal_vals_offset));
  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  iree_hal_webgpu_host_call_state_t* call_state = NULL;
  status = iree_allocator_malloc(queue->host_allocator, slab_size,
                                 (void**)&call_state);
  if (!iree_status_is_ok(status)) {
    iree_hal_semaphore_list_fail(signal_semaphore_list,
                                 iree_status_clone(status));
    return status;
  }

  // Capture everything the completion callback needs to run the call.
  call_state->queue = queue;
  call_state->device = device;
  call_state->queue_affinity = queue_affinity;
  call_state->epoch = epoch;
  call_state->allocator = queue->host_allocator;
  call_state->call = call;
  call_state->flags = flags;
  memcpy(call_state->args, args, sizeof(call_state->args));

  // Clone the wait list into the slab. The wait operation aliases the same
  // arrays: WebGPU semaphores have iree_async_semaphore_t at offset 0, so the
  // pointer bits are identical regardless of which type they are cast to.
  call_state->wait_semaphore_list = iree_hal_webgpu_semaphore_list_at_offsets(
      call_state, wait_semaphore_list.count, wait_ptrs_offset,
      wait_vals_offset);
  iree_hal_webgpu_semaphore_list_clone_into(wait_semaphore_list,
                                            call_state->wait_semaphore_list);
  call_state->wait_operation.semaphores =
      (iree_async_semaphore_t**)call_state->wait_semaphore_list.semaphores;
  call_state->wait_operation.values =
      call_state->wait_semaphore_list.payload_values;
  call_state->wait_operation.count = wait_semaphore_list.count;
  call_state->wait_operation.mode = IREE_ASYNC_WAIT_MODE_ALL;
  call_state->wait_operation.satisfied_index = 0;

  // Clone the signal list into the slab.
  call_state->signal_semaphore_list =
      iree_hal_webgpu_semaphore_list_at_offsets(
          call_state, signal_semaphore_list.count, signal_ptrs_offset,
          signal_vals_offset);
  iree_hal_webgpu_semaphore_list_clone_into(signal_semaphore_list,
                                            call_state->signal_semaphore_list);

  // Hand the wait operation to the proactor.
  iree_async_operation_initialize(
      &call_state->wait_operation.base,
      IREE_ASYNC_OPERATION_TYPE_SEMAPHORE_WAIT, IREE_ASYNC_OPERATION_FLAG_NONE,
      iree_hal_webgpu_host_call_completion_fn,
      /*user_data=*/NULL);
  status = iree_async_proactor_submit_one(queue->proactor,
                                          &call_state->wait_operation.base);
  if (iree_status_is_ok(status)) {
    return status;
  }

  // Submit failed: the callback will never run, so unwind everything here.
  iree_hal_semaphore_list_fail(call_state->signal_semaphore_list,
                               iree_status_clone(status));
  iree_hal_semaphore_list_release(call_state->signal_semaphore_list);
  iree_hal_semaphore_list_release(call_state->wait_semaphore_list);
  iree_allocator_free(queue->host_allocator, call_state);
  return status;
}
//===----------------------------------------------------------------------===//
// queue_dispatch
//===----------------------------------------------------------------------===//
// Enqueues a single dispatch of |executable|'s export |export_ordinal|.
//
// If the waits are already satisfied (or can be FIFO-elided) the dispatch is
// encoded into the queue's scratch builder and submitted immediately.
// Otherwise the dispatch parameters are captured in a queue state — retaining
// the executable and every bound buffer — and submitted via
// iree_hal_webgpu_queue_state_submit.
//
// NOTE(review): |constants| and |flags| are not referenced anywhere in this
// function — confirm that dispatch constants are intentionally not plumbed
// through here.
iree_status_t iree_hal_webgpu_queue_dispatch(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_executable_t* executable,
    iree_hal_executable_export_ordinal_t export_ordinal,
    iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
    iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) {
  // Resolve handles at submit time. These are bridge table indices (uint32
  // values) that remain valid as long as the executable lives.
  const iree_hal_webgpu_handle_t pipeline =
      iree_hal_webgpu_executable_pipeline_handle(executable, export_ordinal);
  const iree_hal_webgpu_handle_t bind_group_layout =
      iree_hal_webgpu_executable_bind_group_layout_handle(executable,
                                                          export_ordinal);

  const bool can_run_now =
      wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list);
  if (can_run_now) {
    // Synchronous path: encode into the scratch builder. Any encode error is
    // passed to the submit helper along with the signal list.
    iree_status_t encode_status =
        iree_hal_webgpu_builder_reset(&queue->scratch_builder);
    if (iree_status_is_ok(encode_status)) {
      encode_status = iree_hal_webgpu_builder_dispatch(
          &queue->scratch_builder, pipeline, bind_group_layout,
          config.workgroup_count, bindings);
    }
    return iree_hal_webgpu_queue_submit_scratch_and_signal(
        queue, signal_semaphore_list, encode_status);
  }

  // Deferred path: allocate a queue state with room for a snapshot of the
  // bindings in its trailing storage.
  iree_hal_webgpu_queue_state_t* state = NULL;
  iree_host_size_t bindings_offset = 0;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_queue_state_allocate(
      queue, IREE_HAL_WEBGPU_QUEUE_OP_DISPATCH, wait_semaphore_list,
      signal_semaphore_list, bindings.count, sizeof(iree_hal_buffer_ref_t),
      &bindings_offset, &state));

  state->dispatch.pipeline_handle = pipeline;
  state->dispatch.bind_group_layout_handle = bind_group_layout;
  memcpy(state->dispatch.workgroup_count, config.workgroup_count,
         sizeof(state->dispatch.workgroup_count));

  // Keep the executable alive until the deferred dispatch has run.
  iree_hal_executable_retain(executable);
  state->dispatch.executable = executable;

  // Copy each binding into the slab, retaining the buffers that are present.
  iree_hal_buffer_ref_t* snapshot =
      (iree_hal_buffer_ref_t*)((uint8_t*)state + bindings_offset);
  state->dispatch.bindings = snapshot;
  state->dispatch.binding_count = (uint32_t)bindings.count;
  for (iree_host_size_t i = 0; i < bindings.count; ++i) {
    const iree_hal_buffer_ref_t* source_ref = &bindings.values[i];
    snapshot[i] = *source_ref;
    if (source_ref->buffer) {
      iree_hal_buffer_retain(source_ref->buffer);
    }
  }

  return iree_hal_webgpu_queue_state_submit(state);
}
// Defers a barrier (wait → signal, no GPU work) through the unified queue
// state. Used when polling shows the wait semaphores are not yet satisfied.
// A BARRIER state with no trailing storage is allocated and submitted; once
// the submit succeeds the proactor owns the state, and the wait completion
// callback is responsible for signaling and freeing it.
static iree_status_t iree_hal_webgpu_queue_submit_barrier(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list) {
  iree_hal_webgpu_queue_state_t* barrier_state = NULL;
  // A barrier carries no per-operation payload, so no trailing slab is needed.
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_queue_state_allocate(
      queue, IREE_HAL_WEBGPU_QUEUE_OP_BARRIER, wait_semaphore_list,
      signal_semaphore_list,
      /*trailing_count=*/0,
      /*trailing_element_size=*/0,
      /*out_trailing_offset=*/NULL, &barrier_state));
  return iree_hal_webgpu_queue_state_submit(barrier_state);
}
//===----------------------------------------------------------------------===//
// queue_execute
//===----------------------------------------------------------------------===//
// Executes a reusable command buffer's cached recording. Dynamic bindings from
// the binding table are resolved to GPU buffer handles for the JS processor.
//
// One ISA binding table entry is produced per |binding_table| entry: the
// handle of the underlying allocated buffer plus the base byte offset at which
// the binding starts within that allocation. Entries with no buffer resolve to
// handle 0 / offset 0. Tables of up to 8 entries use stack storage; larger
// tables are heap-allocated from the queue's host allocator and freed before
// returning. Returns IREE_STATUS_INTERNAL if the JS side reports a non-zero
// result code.
static iree_status_t iree_hal_webgpu_queue_execute_recording(
    iree_hal_webgpu_queue_t* queue, iree_hal_webgpu_handle_t recording_handle,
    iree_hal_buffer_binding_table_t binding_table) {
  iree_hal_webgpu_isa_binding_table_entry_t inline_entries[8];
  iree_hal_webgpu_isa_binding_table_entry_t* dynamic_entries = inline_entries;
  if (binding_table.count > IREE_ARRAYSIZE(inline_entries)) {
    IREE_RETURN_IF_ERROR(iree_allocator_malloc_array(
        queue->host_allocator, binding_table.count,
        sizeof(iree_hal_webgpu_isa_binding_table_entry_t),
        (void**)&dynamic_entries));
  }

  for (iree_host_size_t i = 0; i < binding_table.count; ++i) {
    const iree_hal_buffer_binding_t* binding = &binding_table.bindings[i];
    if (binding->buffer) {
      iree_hal_buffer_t* allocated =
          iree_hal_buffer_allocated_buffer(binding->buffer);
      dynamic_entries[i].gpu_buffer_handle =
          iree_hal_webgpu_buffer_handle(allocated);
      // The base offset is the binding buffer's offset within its allocation
      // plus the binding's own offset into that buffer. Dropping the latter
      // would resolve any binding with a non-zero offset to the wrong range.
      // NOTE(review): confirm the JS processor does not apply the binding
      // offset separately.
      dynamic_entries[i].base_offset =
          (uint32_t)(iree_hal_buffer_byte_offset(binding->buffer) +
                     binding->offset);
    } else {
      dynamic_entries[i].gpu_buffer_handle = 0;
      dynamic_entries[i].base_offset = 0;
    }
  }

  // The entry table is passed to JS by its linear-memory address.
  uint32_t result = iree_hal_webgpu_import_execute_recording(
      recording_handle, queue->queue_handle,
      (uint32_t)(uintptr_t)dynamic_entries);

  // Only free the table when it was heap-allocated above.
  if (dynamic_entries != inline_entries) {
    iree_allocator_free(queue->host_allocator, dynamic_entries);
  }
  if (result != 0) {
    return iree_make_status(IREE_STATUS_INTERNAL,
                            "JS execute_recording failed with code %u", result);
  }
  return iree_ok_status();
}
// Executes a ONE_SHOT command buffer's instruction stream directly. Builds the
// binding table from the builder's static slot map and passes it with the
// builtins descriptor to the JS processor.
//
// The table has one entry per builder slot (|total_slots|). Tables of up to 8
// entries use stack storage; larger tables are heap-allocated from the queue's
// host allocator and freed before returning. Only slots named by a static slot
// entry are populated; all other slots are left zeroed. Returns
// IREE_STATUS_INTERNAL if the JS side reports a non-zero result code.
static iree_status_t iree_hal_webgpu_queue_execute_one_shot(
    iree_hal_webgpu_queue_t* queue, iree_hal_command_buffer_t* command_buffer) {
  iree_hal_webgpu_builder_t* builder =
      iree_hal_webgpu_command_buffer_builder(command_buffer);
  uint32_t total_slots = iree_hal_webgpu_builder_total_slot_count(builder);
  uint32_t static_count = iree_hal_webgpu_builder_static_slot_count(builder);

  iree_hal_webgpu_isa_binding_table_entry_t inline_entries[8];
  iree_hal_webgpu_isa_binding_table_entry_t* entries = inline_entries;
  if (total_slots > IREE_ARRAYSIZE(inline_entries)) {
    IREE_RETURN_IF_ERROR(iree_allocator_malloc_array(
        queue->host_allocator, total_slots,
        sizeof(iree_hal_webgpu_isa_binding_table_entry_t), (void**)&entries));
  }

  // Zero the whole table first. The loop below only writes the slots listed in
  // the static slot map, and the stack array is otherwise uninitialized, so any
  // slot not covered there would hand indeterminate bytes to the JS processor.
  memset(entries, 0, (size_t)total_slots * sizeof(*entries));

  const iree_hal_webgpu_builder_slot_entry_t* slot_entries =
      iree_hal_webgpu_builder_static_slot_entries(builder);
  for (uint32_t i = 0; i < static_count; ++i) {
    entries[slot_entries[i].slot].gpu_buffer_handle =
        slot_entries[i].gpu_buffer_handle;
    entries[slot_entries[i].slot].base_offset = 0;
  }

  iree_hal_webgpu_isa_builtins_descriptor_t builtins_descriptor;
  iree_hal_webgpu_builtins_get_descriptor(queue->builtins,
                                          &builtins_descriptor);

  // Pointers are passed to JS as linear-memory addresses.
  uint32_t result = iree_hal_webgpu_import_execute_instructions(
      queue->device_handle, queue->queue_handle,
      (uint32_t)(uintptr_t)iree_hal_webgpu_builder_block_table(builder),
      iree_hal_webgpu_builder_block_count(builder),
      iree_hal_webgpu_builder_block_word_capacity(builder),
      iree_hal_webgpu_builder_last_block_word_count(builder),
      (uint32_t)(uintptr_t)entries, total_slots,
      (uint32_t)(uintptr_t)&builtins_descriptor);

  // Only free the table when it was heap-allocated above.
  if (entries != inline_entries) {
    iree_allocator_free(queue->host_allocator, entries);
  }
  if (result != 0) {
    return iree_make_status(IREE_STATUS_INTERNAL,
                            "JS execute_instructions failed with code %u",
                            result);
  }
  return iree_ok_status();
}
// Enqueues execution of |command_buffer| (or a pure barrier when it is NULL).
//
// Barrier-only: when the waits are satisfied the signal semaphores are
// signaled inline under a freshly reserved epoch; otherwise an async barrier
// is submitted. FIFO elision is never applied to barriers.
//
// With a command buffer: when the waits are satisfied or can be FIFO-elided,
// the cached recording (if the command buffer has one) or the one-shot
// instruction stream is executed immediately and signal completion is
// registered; any failure fails the signal semaphores. Otherwise the command
// buffer and binding table are captured in a queue state and submitted via
// iree_hal_webgpu_queue_state_submit. |flags| is unused.
iree_status_t iree_hal_webgpu_queue_execute(
    iree_hal_webgpu_queue_t* queue,
    const iree_hal_semaphore_list_t wait_semaphore_list,
    const iree_hal_semaphore_list_t signal_semaphore_list,
    iree_hal_command_buffer_t* command_buffer,
    iree_hal_buffer_binding_table_t binding_table,
    iree_hal_execute_flags_t flags) {
  if (!command_buffer) {
    // NO FIFO elision — barrier signals are CPU-side with no GPU FIFO backing,
    // so FIFO ordering does not guarantee the signal is visible to consumers.
    const bool barrier_ready =
        wait_semaphore_list.count == 0 ||
        iree_hal_semaphore_list_poll(wait_semaphore_list);
    if (!barrier_ready) {
      return iree_hal_webgpu_queue_submit_barrier(queue, wait_semaphore_list,
                                                  signal_semaphore_list);
    }
    const uint64_t barrier_epoch = iree_hal_webgpu_queue_reserve_epoch(queue);
    iree_async_single_frontier_t frontier_storage;
    const iree_async_frontier_t* frontier =
        iree_hal_webgpu_queue_build_frontier(queue, barrier_epoch,
                                             &frontier_storage);
    iree_status_t signal_status =
        iree_hal_semaphore_list_signal(signal_semaphore_list, frontier);
    iree_hal_webgpu_queue_advance_tracker(queue, barrier_epoch);
    return signal_status;
  }

  const bool can_run_now =
      wait_semaphore_list.count == 0 ||
      iree_hal_semaphore_list_poll(wait_semaphore_list) ||
      iree_hal_webgpu_queue_can_elide_waits(queue, wait_semaphore_list);
  if (can_run_now) {
    // Synchronous path. A non-zero recording handle selects the cached
    // recording; otherwise the instruction stream is executed directly.
    const uint64_t epoch = iree_hal_webgpu_queue_reserve_epoch(queue);
    const iree_hal_webgpu_handle_t recording_handle =
        iree_hal_webgpu_command_buffer_recording_handle(command_buffer);
    iree_status_t status =
        recording_handle
            ? iree_hal_webgpu_queue_execute_recording(queue, recording_handle,
                                                      binding_table)
            : iree_hal_webgpu_queue_execute_one_shot(queue, command_buffer);
    if (iree_status_is_ok(status)) {
      status = iree_hal_webgpu_queue_register_signal_completion(
          queue, epoch, signal_semaphore_list);
    }
    if (!iree_status_is_ok(status)) {
      // Execution or registration failed: fail the semaphores with a clone
      // and return the original to the caller.
      iree_hal_semaphore_list_fail(signal_semaphore_list,
                                   iree_status_clone(status));
      return status;
    }
    iree_hal_webgpu_queue_mark_signals_submitted(queue, signal_semaphore_list);
    return status;
  }

  // Deferred path: allocate a queue state with room for a snapshot of the
  // binding table in its trailing storage.
  iree_hal_webgpu_queue_state_t* state = NULL;
  iree_host_size_t table_offset = 0;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_queue_state_allocate(
      queue, IREE_HAL_WEBGPU_QUEUE_OP_EXECUTE, wait_semaphore_list,
      signal_semaphore_list, binding_table.count,
      sizeof(iree_hal_buffer_binding_t), &table_offset, &state));

  // Keep the command buffer alive until the deferred execution has run.
  iree_hal_command_buffer_retain(command_buffer);
  state->execute.command_buffer = command_buffer;

  // Copy each binding into the slab, retaining the buffers that are present.
  iree_hal_buffer_binding_t* snapshot =
      (iree_hal_buffer_binding_t*)((uint8_t*)state + table_offset);
  state->execute.binding_table = snapshot;
  state->execute.binding_count = binding_table.count;
  for (iree_host_size_t i = 0; i < binding_table.count; ++i) {
    const iree_hal_buffer_binding_t* source_binding =
        &binding_table.bindings[i];
    snapshot[i] = *source_binding;
    if (source_binding->buffer) {
      iree_hal_buffer_retain(source_binding->buffer);
    }
  }

  return iree_hal_webgpu_queue_state_submit(state);
}
//===----------------------------------------------------------------------===//
// queue_flush
//===----------------------------------------------------------------------===//
// Flushes pending work on |queue|. This is a no-op that always returns OK:
// WebGPU's queue.submit() is not buffered, so commands are already submitted
// by the time this is called.
iree_status_t iree_hal_webgpu_queue_flush(iree_hal_webgpu_queue_t* queue) {
  (void)queue;  // Nothing is buffered per queue.
  return iree_ok_status();
}