// Copyright 2021 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "experimental/webgpu/command_buffer.h"

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>  // memcpy/memset

#include "experimental/webgpu/buffer.h"
#include "experimental/webgpu/executable.h"
#include "experimental/webgpu/pipeline_layout.h"
#include "iree/base/api.h"
#include "iree/base/internal/arena.h"

//===----------------------------------------------------------------------===//
// Segmented submission management
//===----------------------------------------------------------------------===//
// WebGPU - like Metal - has a rather obtuse multi-level recording model with
// the most obtuse design point being that DMA operations happen on the queue
// directly. In trying to model a single command buffer we may need to make
// multiple ordered submissions to the device queue, which is unfortunate as
// the queue submission routine only takes command buffers and we need to
// interleave the command buffer submissions with other queue operations.
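//
// For example, a command buffer recorded as
//   update_buffer -> dispatch -> dispatch -> update_buffer -> dispatch
// may lower into the ordered segment list
//   [WRITE_BUFFER] -> [EXECUTE] -> [WRITE_BUFFER] -> [EXECUTE]
// where each EXECUTE wraps the WGPUCommandBuffer finished from whichever
// encoder was open when a queue-level operation forced a flush. This is an
// illustrative sketch of the ordering, not an exhaustive enumeration.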

typedef enum iree_hal_webgpu_command_segment_action_e {
  // wgpuQueueSubmit of a command buffer.
  IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE,
  // wgpuQueueWriteBuffer for a host->device transfer.
  IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER,
} iree_hal_webgpu_command_segment_action_t;

struct iree_hal_webgpu_command_segment_t;
typedef struct iree_hal_webgpu_command_segment_t {
  struct iree_hal_webgpu_command_segment_t* next_segment;
  iree_hal_webgpu_command_segment_action_t action;
  union {
    struct {
      WGPUCommandBuffer command_buffer;
    } execute;
    struct {
      const void* source_buffer;
      iree_host_size_t source_offset;
      WGPUBuffer target_buffer;
      iree_device_size_t target_offset;
      iree_host_size_t length;
    } write_buffer;
  };
} iree_hal_webgpu_command_segment_t;

typedef struct iree_hal_webgpu_command_segment_list_t {
  iree_hal_webgpu_command_segment_t* head;
  iree_hal_webgpu_command_segment_t* tail;
} iree_hal_webgpu_command_segment_list_t;

static void iree_hal_webgpu_command_segment_list_reset(
    iree_hal_webgpu_command_segment_list_t* list) {
  for (iree_hal_webgpu_command_segment_t* segment = list->head; segment;
       segment = segment->next_segment) {
    switch (segment->action) {
      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE:
        // Release the recorded command buffer owned by the segment.
        iree_wgpuCommandBufferDrop(segment->execute.command_buffer);
        break;
      default:
      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER:
        // Nothing to do: the source memory lives in the arena.
        break;
    }
  }
  memset(list, 0, sizeof(*list));
}

static void iree_hal_webgpu_command_segment_list_push_front(
    iree_hal_webgpu_command_segment_list_t* list,
    iree_hal_webgpu_command_segment_t* segment) {
  segment->next_segment = list->head;
  list->head = segment;
  if (!list->tail) list->tail = segment;
}

static void iree_hal_webgpu_command_segment_list_push_back(
    iree_hal_webgpu_command_segment_list_t* list,
    iree_hal_webgpu_command_segment_t* segment) {
  segment->next_segment = NULL;
  if (list->tail) {
    list->tail->next_segment = segment;
    list->tail = segment;
  } else {
    list->head = list->tail = segment;
  }
}

static void iree_hal_webgpu_command_segment_issue_execute(
    iree_hal_webgpu_command_segment_t* segment, WGPUQueue queue) {
  IREE_TRACE_ZONE_BEGIN(z0);
  wgpuQueueSubmit(queue, 1, &segment->execute.command_buffer);
  IREE_TRACE_ZONE_END(z0);
}

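// Issues a host->device upload directly on the queue. Note that per the
// WebGPU spec wgpuQueueWriteBuffer copies the provided bytes before returning,
// so the arena-backed staging copy referenced by the segment only needs to
// remain live until this call is made.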
static void iree_hal_webgpu_command_segment_issue_write_buffer(
    iree_hal_webgpu_command_segment_t* segment, WGPUQueue queue) {
  IREE_TRACE_ZONE_BEGIN(z0);
  wgpuQueueWriteBuffer(queue, segment->write_buffer.target_buffer,
                       segment->write_buffer.target_offset,
                       ((const uint8_t*)segment->write_buffer.source_buffer) +
                           segment->write_buffer.source_offset,
                       segment->write_buffer.length);
  IREE_TRACE_ZONE_END(z0);
}

//===----------------------------------------------------------------------===//
// iree_hal_webgpu_command_buffer_t
//===----------------------------------------------------------------------===//

typedef struct iree_hal_webgpu_command_buffer_t {
  iree_hal_command_buffer_t base;
  iree_allocator_t host_allocator;
  WGPUDevice device;

  // Shared staging uniform buffer with queue-ordered data. We use this
  // for push constant emulation by recording all of the push constants per
  // dispatch and then updating the buffer prior to issuing the commands using
  // it. This works because there's no out-of-order or overlapping execution in
  // WebGPU (unfortunately) and we know that if we write in queue-order the
  // updates will be visible to the subsequently issued commands.
  iree_hal_webgpu_staging_buffer_t* staging_buffer;

  // Device-shared WGPUBindGroup cache.
  iree_hal_webgpu_bind_group_cache_t* bind_group_cache;

  // Shaders emulating functionality not present in WebGPU.
  // Owned by the parent device.
  iree_hal_webgpu_builtins_t* builtins;

  // Arena used for all allocations; references the shared device block pool.
  iree_arena_allocator_t arena;

  // Linked list of queue submission actions.
  iree_hal_webgpu_command_segment_list_t segments;

  struct {
    // Valid only when recording.
    WGPUCommandEncoder encoder;
    // Currently open pass - NULL if no open pass.
    WGPUComputePassEncoder compute_pass;

    // All available push constants updated each time constants is called.
    // Reset only with the command buffer and otherwise will maintain its
    // values during recording to allow for partial constants updates.
    uint32_t constants[IREE_HAL_WEBGPU_MAX_PUSH_CONSTANT_COUNT];

    // TODO(benvanik): add a constants dirty bit so we know if we need to
    // upload more. Today we'll stage the same values for each dispatch.

    // Snapshot of descriptor sets as populated by push_descriptor_set.
    // Each push_descriptor_set will invalidate the bind group handle and
    // subsequent dispatches will acquire new bind groups from the cache. If
    // future updates are no-ops the same bind group handle can be used.
    struct {
      WGPUBindGroup handle;
      iree_hal_webgpu_bind_group_binding_t
          bindings[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT];
    } bind_groups[IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_COUNT];

    // Bitfield tracking which bind groups are set to an empty group.
    uint64_t bind_groups_empty;
  } state;
} iree_hal_webgpu_command_buffer_t;

extern const iree_hal_command_buffer_vtable_t
    iree_hal_webgpu_command_buffer_vtable;

static iree_hal_webgpu_command_buffer_t* iree_hal_webgpu_command_buffer_cast(
    iree_hal_command_buffer_t* base_value) {
  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_webgpu_command_buffer_vtable);
  return (iree_hal_webgpu_command_buffer_t*)base_value;
}

iree_status_t iree_hal_webgpu_command_buffer_create(
    iree_hal_allocator_t* device_allocator, WGPUDevice device_handle,
    iree_hal_command_buffer_mode_t mode,
    iree_hal_command_category_t command_categories,
    iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
    iree_arena_block_pool_t* block_pool,
    iree_hal_webgpu_staging_buffer_t* staging_buffer,
    iree_hal_webgpu_bind_group_cache_t* bind_group_cache,
    iree_hal_webgpu_builtins_t* builtins, iree_allocator_t host_allocator,
    iree_hal_command_buffer_t** out_command_buffer) {
  IREE_ASSERT_ARGUMENT(device_allocator);
  IREE_ASSERT_ARGUMENT(block_pool);
  IREE_ASSERT_ARGUMENT(staging_buffer);
  IREE_ASSERT_ARGUMENT(bind_group_cache);
  IREE_ASSERT_ARGUMENT(builtins);
  IREE_ASSERT_ARGUMENT(out_command_buffer);
  *out_command_buffer = NULL;

  if (binding_capacity > 0) {
    // TODO(#10144): support indirect command buffers with binding tables.
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                            "indirect command buffers not yet implemented");
  }

  IREE_TRACE_ZONE_BEGIN(z0);

  iree_hal_webgpu_command_buffer_t* command_buffer = NULL;
  iree_status_t status = iree_allocator_malloc(
      host_allocator,
      sizeof(*command_buffer) +
          iree_hal_command_buffer_validation_state_size(mode, binding_capacity),
      (void**)&command_buffer);
  if (iree_status_is_ok(status)) {
    iree_hal_command_buffer_initialize(
        device_allocator, mode, command_categories, queue_affinity,
        binding_capacity, (uint8_t*)command_buffer + sizeof(*command_buffer),
        &iree_hal_webgpu_command_buffer_vtable, &command_buffer->base);
    command_buffer->host_allocator = host_allocator;
    command_buffer->device = device_handle;
    command_buffer->staging_buffer = staging_buffer;
    command_buffer->bind_group_cache = bind_group_cache;
    command_buffer->builtins = builtins;

    iree_arena_initialize(block_pool, &command_buffer->arena);
    iree_hal_webgpu_command_segment_list_reset(&command_buffer->segments);

    *out_command_buffer = &command_buffer->base;
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}

bool iree_hal_webgpu_command_buffer_isa(
    iree_hal_command_buffer_t* command_buffer) {
  return iree_hal_resource_is(&command_buffer->resource,
                              &iree_hal_webgpu_command_buffer_vtable);
}

static void* iree_hal_webgpu_command_buffer_dyn_cast(
    iree_hal_command_buffer_t* command_buffer, const void* vtable) {
  if (vtable == &iree_hal_webgpu_command_buffer_vtable) {
    IREE_HAL_ASSERT_TYPE(command_buffer, vtable);
    return command_buffer;
  }
  return NULL;
}

static void iree_hal_webgpu_command_buffer_reset(
    iree_hal_webgpu_command_buffer_t* command_buffer) {
  IREE_TRACE_ZONE_BEGIN(z0);

  if (command_buffer->state.compute_pass) {
    wgpuComputePassEncoderEnd(command_buffer->state.compute_pass);
  }
  if (command_buffer->state.encoder) {
    const WGPUCommandBufferDescriptor descriptor = {
        .nextInChain = NULL,
        .label = NULL,
    };
    iree_wgpuCommandBufferDrop(
        wgpuCommandEncoderFinish(command_buffer->state.encoder, &descriptor));
    command_buffer->state.encoder = NULL;
  }

  command_buffer->state.bind_groups_empty = 0;

  iree_hal_webgpu_staging_buffer_reset(command_buffer->staging_buffer);
  iree_hal_webgpu_command_segment_list_reset(&command_buffer->segments);
  iree_arena_reset(&command_buffer->arena);

  // Clear bind group tracking: there is no open compute pass at reset time so
  // padding up to IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX with empty bind
  // groups is re-established when the next compute pass is acquired.
  for (iree_host_size_t i = 0;
       i < IREE_ARRAYSIZE(command_buffer->state.bind_groups); ++i) {
    command_buffer->state.bind_groups[i].handle = NULL;
  }

  IREE_TRACE_ZONE_END(z0);
}

static void iree_hal_webgpu_command_buffer_destroy(
    iree_hal_command_buffer_t* base_command_buffer) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
  iree_allocator_t host_allocator = command_buffer->host_allocator;
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_hal_webgpu_command_buffer_reset(command_buffer);
  iree_arena_deinitialize(&command_buffer->arena);
  iree_allocator_free(host_allocator, command_buffer);

  IREE_TRACE_ZONE_END(z0);
}

iree_status_t iree_hal_webgpu_command_buffer_issue(
    iree_hal_command_buffer_t* base_command_buffer, WGPUQueue queue) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
  IREE_ASSERT(command_buffer);
  IREE_TRACE_ZONE_BEGIN(z0);

  for (iree_hal_webgpu_command_segment_t* segment =
           command_buffer->segments.head;
       segment; segment = segment->next_segment) {
    switch (segment->action) {
      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE:
        iree_hal_webgpu_command_segment_issue_execute(segment, queue);
        break;
      case IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER:
        iree_hal_webgpu_command_segment_issue_write_buffer(segment, queue);
        break;
      default:
        break;
    }
  }

  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}

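// Ends any in-flight compute pass, finishes the open WGPUCommandEncoder, and
// records the resulting WGPUCommandBuffer as an EXECUTE segment. Called
// whenever a queue-level operation (such as a staging buffer upload) must be
// ordered relative to the commands encoded so far.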
static iree_status_t iree_hal_webgpu_command_buffer_flush_encoder(
    iree_hal_webgpu_command_buffer_t* command_buffer) {
  if (!command_buffer->state.encoder) return iree_ok_status();

  // End any open compute pass.
  if (command_buffer->state.compute_pass) {
    wgpuComputePassEncoderEnd(command_buffer->state.compute_pass);
    command_buffer->state.compute_pass = NULL;
  }

  // Finalize encoder and produce a command buffer.
  const WGPUCommandBufferDescriptor descriptor = {
      .nextInChain = NULL,
      .label = NULL,
  };
  WGPUCommandBuffer handle =
      wgpuCommandEncoderFinish(command_buffer->state.encoder, &descriptor);
  command_buffer->state.encoder = NULL;

  iree_hal_webgpu_command_segment_t* segment = NULL;
  iree_status_t status = iree_arena_allocate(
      &command_buffer->arena, sizeof(*segment), (void**)&segment);
  if (iree_status_is_ok(status)) {
    // Attach the command buffer segment.
    segment->action = IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_EXECUTE;
    segment->execute.command_buffer = handle;
    iree_hal_webgpu_command_segment_list_push_back(&command_buffer->segments,
                                                   segment);
  } else {
    iree_wgpuCommandBufferDrop(handle);
  }
  return status;
}

static iree_status_t iree_hal_webgpu_command_buffer_acquire_command_encoder(
    iree_hal_webgpu_command_buffer_t* command_buffer,
    WGPUCommandEncoder* out_command_encoder) {
  // Close active compute pass, if any.
  if (command_buffer->state.compute_pass) {
    wgpuComputePassEncoderEnd(command_buffer->state.compute_pass);
    command_buffer->state.compute_pass = NULL;
  }

  // Reuse an open encoder, if any.
  if (command_buffer->state.encoder) {
    *out_command_encoder = command_buffer->state.encoder;
    return iree_ok_status();
  }

  // Open a new encoder.
  const WGPUCommandEncoderDescriptor descriptor = {
      .nextInChain = NULL,
      .label = NULL,
  };
  command_buffer->state.encoder =
      wgpuDeviceCreateCommandEncoder(command_buffer->device, &descriptor);
  *out_command_encoder = command_buffer->state.encoder;

  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_acquire_compute_pass(
    iree_hal_webgpu_command_buffer_t* command_buffer,
    WGPUComputePassEncoder* out_compute_pass) {
  // Reuse an open compute pass, if any.
  if (command_buffer->state.compute_pass) {
    *out_compute_pass = command_buffer->state.compute_pass;
    return iree_ok_status();
  }

  // Open/reuse an encoder.
  WGPUCommandEncoder command_encoder = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_command_encoder(
      command_buffer, &command_encoder));

  // Open a new compute pass.
  const WGPUComputePassDescriptor descriptor = {
      .nextInChain = NULL,
      .label = NULL,
  };
  command_buffer->state.compute_pass =
      wgpuCommandEncoderBeginComputePass(command_encoder, &descriptor);
  *out_compute_pass = command_buffer->state.compute_pass;

  // Reset all device-side state for the compute pass - nothing carries over
  // across passes and we will need to rebind things.
  for (iree_host_size_t i = 0;
       i < IREE_ARRAYSIZE(command_buffer->state.bind_groups); ++i) {
    command_buffer->state.bind_groups[i].handle = NULL;
  }
  command_buffer->state.bind_groups_empty = 0;

  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_flush(
    iree_hal_webgpu_command_buffer_t* command_buffer) {
  // Flush any active encoder as we are beginning a new segment.
  IREE_RETURN_IF_ERROR(
      iree_hal_webgpu_command_buffer_flush_encoder(command_buffer));

  // Flush the staging buffer to get the upload parameters.
  void* source_buffer = NULL;
  WGPUBuffer target_buffer = NULL;
  iree_host_size_t upload_length = 0;
  iree_hal_webgpu_staging_buffer_flush(command_buffer->staging_buffer,
                                       &source_buffer, &target_buffer,
                                       &upload_length);

  // Enqueue new segment.
  uint8_t* storage_base = NULL;
  iree_hal_webgpu_command_segment_t* segment = NULL;
  IREE_RETURN_IF_ERROR(iree_arena_allocate(&command_buffer->arena,
                                           sizeof(*segment) + upload_length,
                                           (void**)&storage_base));

  // Copy the staging upload data into the command buffer so the host staging
  // buffer can be reused immediately. This results in an extra copy but this
  // is mostly small. We could - if executing inline - submit this to the
  // queue immediately without the segment overhead.
  uint8_t* storage_buffer = storage_base + sizeof(*segment);
  memcpy(storage_buffer, source_buffer, upload_length);

  // Attach the write_buffer segment.
  segment = (iree_hal_webgpu_command_segment_t*)storage_base;
  segment->action = IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER;
  segment->write_buffer.source_buffer = storage_buffer;
  segment->write_buffer.source_offset = 0;
  segment->write_buffer.target_buffer = target_buffer;
  segment->write_buffer.target_offset = 0;
  segment->write_buffer.length = upload_length;
  iree_hal_webgpu_command_segment_list_push_back(&command_buffer->segments,
                                                 segment);

  return iree_ok_status();
}

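// Appends a small parameter block (push constants or emulation parameters) to
// the staging uniform buffer and returns its offset for use as a dynamic bind
// group offset on the parameters bind group.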
static iree_status_t iree_hal_webgpu_command_buffer_append_parameters(
    iree_hal_webgpu_command_buffer_t* command_buffer,
    iree_const_byte_span_t source, uint32_t* out_offset) {
  // Try to append the parameters - this may fail if the staging buffer is
  // exhausted and needs to be flushed. If so we flush and then try again.
  iree_status_t try_status = iree_hal_webgpu_staging_buffer_append(
      command_buffer->staging_buffer, source, out_offset);
  if (iree_status_is_ok(try_status) ||
      !iree_status_is_resource_exhausted(try_status)) {
    return try_status;  // NOTE: may be a failure.
  }

  // Flush any pending commands and the current staging buffer state.
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_flush(command_buffer));

  // Try to stage the parameters again. If this fails it's not because it
  // needed a flush.
  return iree_hal_webgpu_staging_buffer_append(command_buffer->staging_buffer,
                                               source, out_offset);
}

static iree_status_t iree_hal_webgpu_command_buffer_begin(
    iree_hal_command_buffer_t* base_command_buffer) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
  iree_hal_webgpu_command_buffer_reset(command_buffer);
  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_end(
    iree_hal_command_buffer_t* base_command_buffer) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);
  return iree_hal_webgpu_command_buffer_flush(command_buffer);
}

static void iree_hal_webgpu_command_buffer_begin_debug_group(
    iree_hal_command_buffer_t* base_command_buffer, iree_string_view_t label,
    iree_hal_label_color_t label_color,
    const iree_hal_label_location_t* location) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  WGPUCommandEncoder command_encoder = NULL;
  iree_status_t status = iree_hal_webgpu_command_buffer_acquire_command_encoder(
      command_buffer, &command_encoder);
  if (!iree_status_is_ok(status)) {
    // TODO(benvanik): mark recording as failed.
    iree_status_ignore(status);
    return;
  }

  // TODO(benvanik): ensure this works right when in a compute pass.
  char label_str[128] = {0};
  memcpy(label_str, label.data, iree_min(sizeof(label_str) - 1, label.size));
  wgpuCommandEncoderPushDebugGroup(command_encoder, label_str);
}

static void iree_hal_webgpu_command_buffer_end_debug_group(
    iree_hal_command_buffer_t* base_command_buffer) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  WGPUCommandEncoder command_encoder = NULL;
  iree_status_t status = iree_hal_webgpu_command_buffer_acquire_command_encoder(
      command_buffer, &command_encoder);
  if (!iree_status_is_ok(status)) {
    // TODO(benvanik): mark recording as failed.
    iree_status_ignore(status);
    return;
  }

  wgpuCommandEncoderPopDebugGroup(command_encoder);
}

static iree_status_t iree_hal_webgpu_command_buffer_execution_barrier(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_execution_stage_t source_stage_mask,
    iree_hal_execution_stage_t target_stage_mask,
    iree_hal_execution_barrier_flags_t flags,
    iree_host_size_t memory_barrier_count,
    const iree_hal_memory_barrier_t* memory_barriers,
    iree_host_size_t buffer_barrier_count,
    const iree_hal_buffer_barrier_t* buffer_barriers) {
  // No-op: barriers are automatic in WebGPU.
  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_signal_event(
    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
    iree_hal_execution_stage_t source_stage_mask) {
  // No-op: no events in WebGPU.
  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_reset_event(
    iree_hal_command_buffer_t* base_command_buffer, iree_hal_event_t* event,
    iree_hal_execution_stage_t source_stage_mask) {
  // No-op: no events in WebGPU.
  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_wait_events(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_host_size_t event_count, const iree_hal_event_t** events,
    iree_hal_execution_stage_t source_stage_mask,
    iree_hal_execution_stage_t target_stage_mask,
    iree_host_size_t memory_barrier_count,
    const iree_hal_memory_barrier_t* memory_barriers,
    iree_host_size_t buffer_barrier_count,
    const iree_hal_buffer_barrier_t* buffer_barriers) {
  // No-op: no events in WebGPU.
  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_discard_buffer(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_buffer_ref_t buffer_ref) {
  // No-op: though maybe it'd be a useful addition to the spec as otherwise
  // false dependencies can creep in.
  return iree_ok_status();
}

// Splats a pattern value of 1, 2, or 4 bytes out to a 4 byte value.
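// For example, a 1-byte pattern 0xAB splats to 0xABABABAB and a 2-byte pattern
// value 0x1234 splats to 0x12341234; 4-byte patterns pass through unchanged.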
static uint32_t iree_hal_webgpu_splat_pattern(const void* pattern,
                                              size_t pattern_length) {
  switch (pattern_length) {
    case 1: {
      uint32_t pattern_value = *(const uint8_t*)(pattern);
      return (pattern_value << 24) | (pattern_value << 16) |
             (pattern_value << 8) | pattern_value;
    }
    case 2: {
      uint32_t pattern_value = *(const uint16_t*)(pattern);
      return (pattern_value << 16) | pattern_value;
    }
    case 4: {
      uint32_t pattern_value = *(const uint32_t*)(pattern);
      return pattern_value;
    }
    default:
      return 0;  // Already verified that this should not be possible.
  }
}

static iree_status_t iree_hal_webgpu_command_buffer_fill_buffer(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_buffer_ref_t target_ref, const void* pattern,
    iree_host_size_t pattern_length) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  iree_hal_webgpu_builtin_fill_buffer_t* builtin =
      &command_buffer->builtins->fill_buffer;
  iree_device_size_t target_offset =
      iree_hal_buffer_byte_offset(target_ref.buffer) + target_ref.offset;

  // TODO(scotttodd): change to using what the vulkan emulation does
  uint32_t dword_pattern =
      iree_hal_webgpu_splat_pattern(pattern, pattern_length);

  // If the pattern is zero and both the offset and length are multiples of 4,
  // we can use the native wgpuCommandEncoderClearBuffer function. Otherwise,
  // we dispatch our own fill emulation shader.
  uint32_t zero_pattern = 0;
  if (memcmp(&dword_pattern, &zero_pattern, pattern_length) == 0 &&
      target_offset % 4 == 0 && target_ref.length % 4 == 0) {
    WGPUCommandEncoder command_encoder = NULL;
    IREE_RETURN_IF_ERROR(
        iree_hal_webgpu_command_buffer_acquire_command_encoder(
            command_buffer, &command_encoder));

    wgpuCommandEncoderClearBuffer(
        command_encoder,
        iree_hal_webgpu_buffer_handle(
            iree_hal_buffer_allocated_buffer(target_ref.buffer)),
        target_offset, target_ref.length);
    return iree_ok_status();
  }

  // TODO: handle offsets and pattern lengths that are not multiples of 4 the
  // same way the Vulkan emulation does.

  // Upload push constant data - this may incur a segment flush if the staging
  // buffer is exhausted.
  const uint32_t params_data[] = {
      /*offset=*/target_offset,
      /*length=*/target_ref.length,
      /*pattern=*/dword_pattern,
  };
  uint32_t params_offset = 0;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_append_parameters(
      command_buffer,
      iree_make_const_byte_span(params_data, sizeof(params_data)),
      &params_offset));

  // Acquire the compute pass we'll encode the dispatch into - this may be
  // fresh or reused from prior commands.
  WGPUComputePassEncoder compute_pass = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_compute_pass(
      command_buffer, &compute_pass));
  wgpuComputePassEncoderSetPipeline(compute_pass, builtin->pipeline);

  // Bind the push constant emulation bind group at the staging buffer relative
  // offset for this dispatch.
  wgpuComputePassEncoderSetBindGroup(compute_pass, /*groupIndex=*/0,
                                     command_buffer->staging_buffer->bind_group,
                                     1, &params_offset);
  command_buffer->state.bind_groups[0].handle = NULL;

  // Grab a (probably uncached) bind group for the target buffer binding.
  const iree_hal_webgpu_bind_group_binding_t buffer_binding = {
      .type = WGPUBufferBindingType_Storage,
      .buffer = iree_hal_webgpu_buffer_handle(
          iree_hal_buffer_allocated_buffer(target_ref.buffer)),
      .offset = 0,
      .length = target_ref.length,
  };
  WGPUBindGroup buffer_group = iree_hal_webgpu_bind_group_cache_acquire(
      command_buffer->bind_group_cache, builtin->buffer_group_layout,
      &buffer_binding, /*binding_mask=*/1);
  wgpuComputePassEncoderSetBindGroup(compute_pass, /*groupIndex=*/1,
                                     buffer_group, 0, NULL);
  command_buffer->state.bind_groups[1].handle = NULL;

  // NOTE: this is not the right way to do this - we need to be tiling inside
  // the fill.
  wgpuComputePassEncoderDispatchWorkgroups(compute_pass, target_ref.length, 1,
                                           1);

  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_update_buffer(
    iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer,
    iree_host_size_t source_offset, iree_hal_buffer_ref_t target_ref) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  // Flush any active encoder as we are beginning a new segment.
  IREE_RETURN_IF_ERROR(
      iree_hal_webgpu_command_buffer_flush_encoder(command_buffer));

  // Enqueue new segment.
  uint8_t* storage_base = NULL;
  iree_hal_webgpu_command_segment_t* segment = NULL;
  iree_status_t status = iree_arena_allocate(
      &command_buffer->arena, sizeof(*segment) + target_ref.length,
      (void**)&storage_base);
  if (iree_status_is_ok(status)) {
    // Copy the update data into the command buffer so the user can change
    // it immediately after this call returns. This results in a double copy
    // because we need to put it in our command buffer and then when issuing
    // copy again into the WebGPU queue. Thankfully these updates are
    // restricted to a handful of KB so that's not really our biggest
    // inefficiency.
    uint8_t* storage_buffer = storage_base + sizeof(*segment);
    memcpy(storage_buffer, (const uint8_t*)source_buffer + source_offset,
           target_ref.length);

    // Attach the write_buffer segment.
    segment = (iree_hal_webgpu_command_segment_t*)storage_base;
    segment->action = IREE_HAL_WEBGPU_COMMAND_SEGMENT_ACTION_WRITE_BUFFER;
    segment->write_buffer.source_buffer = storage_buffer;
    segment->write_buffer.source_offset = 0;
    segment->write_buffer.target_buffer =
        iree_hal_webgpu_buffer_handle(target_ref.buffer);
    segment->write_buffer.target_offset = target_ref.offset;
    segment->write_buffer.length = target_ref.length;
    iree_hal_webgpu_command_segment_list_push_back(&command_buffer->segments,
                                                   segment);
  }
  return status;
}

static iree_status_t iree_hal_webgpu_command_buffer_copy_buffer(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_buffer_ref_t source_ref, iree_hal_buffer_ref_t target_ref) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  WGPUCommandEncoder command_encoder = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_command_encoder(
      command_buffer, &command_encoder));

  wgpuCommandEncoderCopyBufferToBuffer(
      command_encoder, iree_hal_webgpu_buffer_handle(source_ref.buffer),
      source_ref.offset, iree_hal_webgpu_buffer_handle(target_ref.buffer),
      target_ref.offset, target_ref.length);

  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_constants(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_pipeline_layout_t* pipeline_layout, iree_host_size_t offset,
    const void* values, iree_host_size_t values_length) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  if (IREE_UNLIKELY(offset + values_length >
                    sizeof(command_buffer->state.constants))) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "push constant range %" PRIhsz " (length=%" PRIhsz
                            ") out of range",
                            offset, values_length);
  }

  // NOTE: command buffer state change only; enqueues no tasks.
  memcpy((uint8_t*)&command_buffer->state.constants + offset, values,
         values_length);

  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_push_descriptor_set(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_pipeline_layout_t* pipeline_layout, uint32_t set,
    iree_host_size_t binding_count, const iree_hal_buffer_ref_t* bindings) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  // NOTE: we don't check for redundant sets here as the compiler should have
  // done that for us.
  command_buffer->state.bind_groups[set].handle = NULL;
  iree_hal_webgpu_bind_group_binding_t* group_bindings =
      command_buffer->state.bind_groups[set].bindings;
  for (iree_host_size_t i = 0; i < binding_count; ++i) {
    uint32_t ordinal = bindings[i].ordinal;
    if (ordinal >= IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT) {
      return iree_make_status(
          IREE_STATUS_INVALID_ARGUMENT,
          "binding ordinal %d is out of range, must be 0-%d", ordinal,
          IREE_HAL_WEBGPU_MAX_DESCRIPTOR_SET_BINDING_COUNT);
    }
    iree_hal_webgpu_bind_group_binding_t* group_binding =
        &group_bindings[ordinal];

    // TODO(benvanik): lookup binding type from layout. We should also be
    // tagging whether it's dynamic here.
    group_binding->type = WGPUBufferBindingType_Storage;

    group_binding->buffer =
        bindings[i].buffer ? iree_hal_webgpu_buffer_handle(bindings[i].buffer)
                           : NULL;
    group_binding->offset = bindings[i].offset;
    group_binding->length = bindings[i].length;
  }

  return iree_ok_status();
}

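// Prepares a dispatch in the current (or a newly opened) compute pass: stages
// any constant data into the parameters staging buffer, sets the entry point
// pipeline, and binds a cached bind group for the provided buffer bindings.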
static iree_status_t iree_hal_webgpu_command_buffer_prepare_dispatch(
    iree_hal_webgpu_command_buffer_t* command_buffer,
    iree_hal_executable_t* executable, uint32_t ordinal,
    iree_const_byte_span_t constants, iree_hal_buffer_ref_list_t bindings,
    iree_hal_dispatch_flags_t flags, WGPUComputePassEncoder* out_compute_pass) {
  const iree_hal_webgpu_entry_point_t* entry_point =
      iree_hal_webgpu_executable_lookup_entry_point(executable, ordinal);

  // Upload push constant data - this may incur a segment flush if the staging
  // buffer is exhausted.
  uint32_t params_offset = 0;
  if (!iree_const_byte_span_is_empty(constants)) {
    IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_append_parameters(
        command_buffer, constants, &params_offset));
  }

  // Acquire the compute pass we'll encode the dispatch into - this may be
  // fresh or reused from prior commands.
  WGPUComputePassEncoder compute_pass = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_acquire_compute_pass(
      command_buffer, &compute_pass));
  wgpuComputePassEncoderSetPipeline(compute_pass, entry_point->pipeline);

  if (!iree_const_byte_span_is_empty(constants)) {
    // Bind the push constant emulation bind group at the staging buffer
    // relative offset for this dispatch.
    wgpuComputePassEncoderSetBindGroup(
        compute_pass, IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX,
        command_buffer->staging_buffer->bind_group, 1, &params_offset);
  }

  // Set all bindings.
  const iree_hal_webgpu_set_binding_info_t* binding_info =
      iree_hal_webgpu_pipeline_layout_set_binding_info(entry_point->layout);

  // TODO: change the bind group cache to take the bindings list directly and
  // avoid this copy.
  iree_hal_webgpu_bind_group_binding_t* group_bindings =
      (iree_hal_webgpu_bind_group_binding_t*)iree_alloca(
          bindings.count * sizeof(iree_hal_webgpu_bind_group_binding_t));
  iree_hal_webgpu_binding_mask_t binding_mask = 0;
  for (iree_host_size_t i = 0; i < bindings.count; ++i) {
    binding_mask |= 1u << i;
    group_bindings[i].type = WGPUBufferBindingType_Storage;
    group_bindings[i].buffer =
        bindings.values[i].buffer
            ? iree_hal_webgpu_buffer_handle(bindings.values[i].buffer)
            : NULL;
    group_bindings[i].offset = bindings.values[i].offset;
    group_bindings[i].length = bindings.values[i].length;
  }

  // Acquire the bind group to use for the current descriptor set (group 0).
  WGPUBindGroup handle = iree_hal_webgpu_bind_group_cache_acquire(
      command_buffer->bind_group_cache, binding_info->set_layouts[0],
      group_bindings, binding_mask);

  // NOTE: today we don't support dynamic offsets for push descriptor sets.
  // This will be a larger change we'll need to handle in the compiler. If we
  // wanted to improve caching we could make all the bindings dynamic and then
  // always cache the base offsets, however
  // maxDynamicStorageBuffersPerPipelineLayout is minimally 4 and that's not
  // a lot of bindings.
  wgpuComputePassEncoderSetBindGroup(compute_pass, 0, handle, 0, NULL);

  *out_compute_pass = compute_pass;
  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_dispatch(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_executable_t* executable, int32_t entry_point,
    const uint32_t workgroup_count[3], iree_const_byte_span_t constants,
    iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  WGPUComputePassEncoder compute_pass = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_prepare_dispatch(
      command_buffer, executable, entry_point, constants, bindings, flags,
      &compute_pass));
  wgpuComputePassEncoderDispatchWorkgroups(
      compute_pass, workgroup_count[0], workgroup_count[1],
      workgroup_count[2]);

  return iree_ok_status();
}

static iree_status_t iree_hal_webgpu_command_buffer_dispatch_indirect(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_executable_t* executable, int32_t entry_point,
    iree_hal_buffer_ref_t workgroups_ref, iree_const_byte_span_t constants,
    iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) {
  iree_hal_webgpu_command_buffer_t* command_buffer =
      iree_hal_webgpu_command_buffer_cast(base_command_buffer);

  WGPUComputePassEncoder compute_pass = NULL;
  IREE_RETURN_IF_ERROR(iree_hal_webgpu_command_buffer_prepare_dispatch(
      command_buffer, executable, entry_point, constants, bindings, flags,
      &compute_pass));
  wgpuComputePassEncoderDispatchWorkgroupsIndirect(
      compute_pass, iree_hal_webgpu_buffer_handle(workgroups_ref.buffer),
      workgroups_ref.offset);

  return iree_ok_status();
}

const iree_hal_command_buffer_vtable_t iree_hal_webgpu_command_buffer_vtable = {
    .destroy = iree_hal_webgpu_command_buffer_destroy,
    .begin = iree_hal_webgpu_command_buffer_begin,
    .end = iree_hal_webgpu_command_buffer_end,
    .begin_debug_group = iree_hal_webgpu_command_buffer_begin_debug_group,
    .end_debug_group = iree_hal_webgpu_command_buffer_end_debug_group,
    .execution_barrier = iree_hal_webgpu_command_buffer_execution_barrier,
    .signal_event = iree_hal_webgpu_command_buffer_signal_event,
    .reset_event = iree_hal_webgpu_command_buffer_reset_event,
    .wait_events = iree_hal_webgpu_command_buffer_wait_events,
    .discard_buffer = iree_hal_webgpu_command_buffer_discard_buffer,
    .fill_buffer = iree_hal_webgpu_command_buffer_fill_buffer,
    .update_buffer = iree_hal_webgpu_command_buffer_update_buffer,
    .copy_buffer = iree_hal_webgpu_command_buffer_copy_buffer,
    .constants = iree_hal_webgpu_command_buffer_constants,
    .push_descriptor_set = iree_hal_webgpu_command_buffer_push_descriptor_set,
    .dispatch = iree_hal_webgpu_command_buffer_dispatch,
    .dispatch_indirect = iree_hal_webgpu_command_buffer_dispatch_indirect,
};