Polyfill non-4-byte-aligned fills in the Vulkan HAL. (#7317)
This fixes https://github.com/google/iree/issues/7228 by introducing a new `BuiltinExecutables` class under the Vulkan HAL that uses a 1x1x1 dispatch to fill any unaligned bytes within a buffer fill operation. Aligned regions of buffer fills will still use the native `vkCmdFillBuffer`. New CTS test cases provide coverage for each [pattern_size x offset x fill_length] permutation.
Inserting a dispatch into a command buffer (that was expecting to just have a native `vkCmdFillBuffer` transfer operation inside it) requires some special handling:
* Descriptor set layout index 3 is used (Android devices typically support only 4 bound descriptor sets, so this is the last slot), to avoid overlap with descriptor set layouts used by the compiler.
* Push constants are tracked for each command buffer so when this dispatch uses its own push constants it can restore from that backup.
* Command buffers that may go down this polyfill path need to run on a queue compatible with dispatch operations (i.e. not transfer exclusive). We had some ideas for adding a compiler analysis and attribute for "will only contain aligned fills", but without that I'm just defaulting to `command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH` where needed.
---
I have not yet tested this in a real program and there are still a few TODOs for optimal performance and correct synchronization behavior.
diff --git a/iree/hal/cts/command_buffer_test.cc b/iree/hal/cts/command_buffer_test.cc
index 234bc8e..bd9a1cd 100644
--- a/iree/hal/cts/command_buffer_test.cc
+++ b/iree/hal/cts/command_buffer_test.cc
@@ -37,6 +37,64 @@
}
protected:
+ std::vector<uint8_t> RunFillBufferTest(iree_device_size_t buffer_size,
+ iree_device_size_t target_offset,
+ iree_device_size_t fill_length,
+ const void* pattern,
+ iree_host_size_t pattern_length) {
+ iree_hal_command_buffer_t* command_buffer;
+ IREE_CHECK_OK(iree_hal_command_buffer_create(
+ device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+ IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
+ &command_buffer));
+ iree_hal_buffer_t* device_buffer;
+ IREE_CHECK_OK(iree_hal_allocator_allocate_buffer(
+ iree_hal_device_allocator(device_),
+ IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
+ IREE_HAL_BUFFER_USAGE_ALL, buffer_size, &device_buffer));
+
+ IREE_CHECK_OK(iree_hal_command_buffer_begin(command_buffer));
+ // Start with a zero fill on the entire buffer...
+ uint8_t zero_val = 0x0;
+ IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
+ command_buffer, device_buffer, /*target_offset=*/0,
+ /*length=*/buffer_size, &zero_val,
+ /*pattern_length=*/sizeof(zero_val)));
+ // (buffer barrier between the fill operations)
+ iree_hal_buffer_barrier_t buffer_barrier;
+ buffer_barrier.source_scope = IREE_HAL_ACCESS_SCOPE_TRANSFER_WRITE;
+ buffer_barrier.target_scope = IREE_HAL_ACCESS_SCOPE_TRANSFER_WRITE |
+ IREE_HAL_ACCESS_SCOPE_DISPATCH_WRITE;
+ buffer_barrier.buffer = device_buffer;
+ buffer_barrier.offset = 0;
+ buffer_barrier.length = buffer_size;
+ IREE_CHECK_OK(iree_hal_command_buffer_execution_barrier(
+ command_buffer, IREE_HAL_EXECUTION_STAGE_TRANSFER,
+ IREE_HAL_EXECUTION_STAGE_TRANSFER | IREE_HAL_EXECUTION_STAGE_DISPATCH,
+ IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, /*memory_barrier_count=*/0, NULL,
+ /*buffer_barrier_count=*/1, &buffer_barrier));
+ // ... then fill the pattern on top.
+ IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
+ command_buffer, device_buffer,
+ /*target_offset=*/target_offset, /*length=*/fill_length,
+ /*pattern=*/pattern,
+ /*pattern_length=*/pattern_length));
+ IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
+ IREE_CHECK_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_ANY,
+ command_buffer));
+
+ std::vector<uint8_t> actual_data(buffer_size);
+ IREE_CHECK_OK(
+ iree_hal_buffer_read_data(device_buffer, /*source_offset=*/0,
+ /*target_buffer=*/actual_data.data(),
+ /*data_length=*/buffer_size));
+
+ iree_hal_command_buffer_release(command_buffer);
+ iree_hal_buffer_release(device_buffer);
+
+ return actual_data;
+ }
+
static constexpr iree_device_size_t kBufferSize = 4096;
};
@@ -83,67 +141,6 @@
iree_hal_command_buffer_release(command_buffer);
}
-TEST_P(CommandBufferTest, FillBufferWithRepeatedBytes) {
- iree_hal_command_buffer_t* command_buffer;
- IREE_ASSERT_OK(iree_hal_command_buffer_create(
- device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
- IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
- &command_buffer));
-
- iree_hal_buffer_t* device_buffer;
- IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
- device_allocator_,
- IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
- IREE_HAL_BUFFER_USAGE_ALL, kBufferSize, &device_buffer));
-
- std::vector<uint8_t> reference_buffer(kBufferSize);
-
- IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
-
- // Fill the device buffer with segments of different values so that we can
- // test both fill and offset/size.
- uint8_t val1 = 0x07;
- IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
- command_buffer, device_buffer,
- /*target_offset=*/0, /*length=*/kBufferSize / 4, /*pattern=*/&val1,
- /*pattern_length=*/sizeof(val1)));
- std::memset(reference_buffer.data(), val1, kBufferSize / 4);
-
- uint8_t val2 = 0xbe;
- IREE_ASSERT_OK(
- iree_hal_command_buffer_fill_buffer(command_buffer, device_buffer,
- /*target_offset=*/kBufferSize / 4,
- /*length=*/kBufferSize / 4,
- /*pattern=*/&val2,
- /*pattern_length=*/sizeof(val2)));
- std::memset(reference_buffer.data() + kBufferSize / 4, val2, kBufferSize / 4);
-
- uint8_t val3 = 0x54;
- IREE_ASSERT_OK(
- iree_hal_command_buffer_fill_buffer(command_buffer, device_buffer,
- /*target_offset=*/kBufferSize / 2,
- /*length=*/kBufferSize / 2,
- /*pattern=*/&val3,
- /*pattern_length=*/sizeof(val3)));
- std::memset(reference_buffer.data() + kBufferSize / 2, val3, kBufferSize / 2);
-
- IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
-
- IREE_ASSERT_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_TRANSFER,
- command_buffer));
-
- // Read the device buffer and compare.
- std::vector<uint8_t> actual_data(kBufferSize);
- IREE_ASSERT_OK(iree_hal_buffer_read_data(device_buffer, /*source_offset=*/0,
- /*target_buffer=*/actual_data.data(),
- /*data_length=*/kBufferSize));
- EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
-
- // Must release the command buffer before resources used by it.
- iree_hal_command_buffer_release(command_buffer);
- iree_hal_buffer_release(device_buffer);
-}
-
TEST_P(CommandBufferTest, CopyWholeBuffer) {
iree_hal_command_buffer_t* command_buffer;
IREE_ASSERT_OK(iree_hal_command_buffer_create(
@@ -257,6 +254,126 @@
iree_hal_buffer_release(host_buffer);
}
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset0_length1) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 0;
+ iree_device_size_t fill_length = 1;
+ uint8_t pattern = 0x07;
+ std::vector<uint8_t> reference_buffer{0x07, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset0_length3) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 0;
+ iree_device_size_t fill_length = 3;
+ uint8_t pattern = 0x07;
+ std::vector<uint8_t> reference_buffer{0x07, 0x07, 0x07, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset0_length8) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 0;
+ iree_device_size_t fill_length = 8;
+ uint8_t pattern = 0x07;
+ std::vector<uint8_t> reference_buffer{0x07, 0x07, 0x07, 0x07, //
+ 0x07, 0x07, 0x07, 0x07, //
+ 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset2_length8) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 2;
+ iree_device_size_t fill_length = 8;
+ uint8_t pattern = 0x07;
+ std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x07, 0x07, //
+ 0x07, 0x07, 0x07, 0x07, //
+ 0x07, 0x07, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern2_offset0_length8) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 0;
+ iree_device_size_t fill_length = 8;
+ uint16_t pattern = 0xAB23;
+ std::vector<uint8_t> reference_buffer{0x23, 0xAB, 0x23, 0xAB, //
+ 0x23, 0xAB, 0x23, 0xAB, //
+ 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern2_offset0_length10) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 0;
+ iree_device_size_t fill_length = 10;
+ uint16_t pattern = 0xAB23;
+ std::vector<uint8_t> reference_buffer{0x23, 0xAB, 0x23, 0xAB, //
+ 0x23, 0xAB, 0x23, 0xAB, //
+ 0x23, 0xAB, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern2_offset2_length8) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 2;
+ iree_device_size_t fill_length = 8;
+ uint16_t pattern = 0xAB23;
+ std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x23, 0xAB, //
+ 0x23, 0xAB, 0x23, 0xAB, //
+ 0x23, 0xAB, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern4_offset0_length8) {
+ iree_device_size_t buffer_size = 16;
+ iree_device_size_t target_offset = 0;
+ iree_device_size_t fill_length = 8;
+ uint32_t pattern = 0xAB23CD45;
+ std::vector<uint8_t> reference_buffer{0x45, 0xCD, 0x23, 0xAB, //
+ 0x45, 0xCD, 0x23, 0xAB, //
+ 0x00, 0x00, 0x00, 0x00, //
+ 0x00, 0x00, 0x00, 0x00};
+ std::vector<uint8_t> actual_buffer =
+ RunFillBufferTest(buffer_size, target_offset, fill_length,
+ (void*)&pattern, sizeof(pattern));
+ EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
INSTANTIATE_TEST_SUITE_P(
AllDrivers, CommandBufferTest,
::testing::ValuesIn(testing::EnumerateAvailableDrivers()),
diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD
index 97e243a..d531e68 100644
--- a/iree/hal/vulkan/BUILD
+++ b/iree/hal/vulkan/BUILD
@@ -26,6 +26,8 @@
name = "vulkan",
srcs = [
"api.cc",
+ "builtin_executables.cc",
+ "builtin_executables.h",
"command_queue.h",
"debug_reporter.cc",
"debug_reporter.h",
@@ -92,6 +94,7 @@
"//iree/base/internal:synchronization",
"//iree/base/internal/flatcc:parsing",
"//iree/hal",
+ "//iree/hal/vulkan/builtin",
"//iree/hal/vulkan/util:arena",
"//iree/hal/vulkan/util:intrusive_list",
"//iree/hal/vulkan/util:ref_ptr",
diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt
index 8126033..caee945 100644
--- a/iree/hal/vulkan/CMakeLists.txt
+++ b/iree/hal/vulkan/CMakeLists.txt
@@ -23,6 +23,8 @@
"vulkan_driver.h"
SRCS
"api.cc"
+ "builtin_executables.cc"
+ "builtin_executables.h"
"command_queue.h"
"debug_reporter.cc"
"debug_reporter.h"
@@ -82,6 +84,7 @@
iree::base::logging
iree::base::tracing
iree::hal
+ iree::hal::vulkan::builtin
iree::hal::vulkan::util::arena
iree::hal::vulkan::util::intrusive_list
iree::hal::vulkan::util::ref_ptr
diff --git a/iree/hal/vulkan/builtin/BUILD b/iree/hal/vulkan/builtin/BUILD
new file mode 100644
index 0000000..083e92f
--- /dev/null
+++ b/iree/hal/vulkan/builtin/BUILD
@@ -0,0 +1,24 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+
+package(
+ default_visibility = ["//visibility:public"],
+ features = ["layering_check"],
+ licenses = ["notice"], # Apache 2.0
+)
+
+c_embed_data(
+ name = "builtin",
+ srcs = [
+ "fill_unaligned.spv",
+ ],
+ c_file_output = "builtin_shaders_spv.c",
+ flatten = True,
+ h_file_output = "builtin_shaders_spv.h",
+ identifier = "builtin_shaders_spv",
+)
diff --git a/iree/hal/vulkan/builtin/CMakeLists.txt b/iree/hal/vulkan/builtin/CMakeLists.txt
new file mode 100644
index 0000000..2514196
--- /dev/null
+++ b/iree/hal/vulkan/builtin/CMakeLists.txt
@@ -0,0 +1,28 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
+# iree/hal/vulkan/builtin/BUILD #
+# #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
+# CMake-only content. #
+# #
+# To disable autogeneration for this file entirely, delete this header. #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_c_embed_data(
+ NAME
+ builtin
+ SRCS
+ "fill_unaligned.spv"
+ C_FILE_OUTPUT
+ "builtin_shaders_spv.c"
+ H_FILE_OUTPUT
+ "builtin_shaders_spv.h"
+ IDENTIFIER
+ "builtin_shaders_spv"
+ FLATTEN
+ PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/iree/hal/vulkan/builtin/compile_shaders.sh b/iree/hal/vulkan/builtin/compile_shaders.sh
new file mode 100644
index 0000000..fd5f571
--- /dev/null
+++ b/iree/hal/vulkan/builtin/compile_shaders.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Compiles input .glsl files into output .spv binary files. As these files are
+# updated infrequently and their binary sizes are small, we check in both files
+# and don't take a hard dependency on the shader compiler tool.
+#
+# To use, ensure `glslc` is on your PATH (such as by installing the Vulkan SDK
+# or building it from its source at https://github.com/google/shaderc) and run
+# the script. Paths are quoted so a checkout path containing spaces still works.
+
+set -e
+set -x
+
+BUILTIN_DIR="$(dirname "$0")"
+
+glslc \
+ -Os -fshader-stage=compute -mfmt=bin \
+ "${BUILTIN_DIR}/fill_unaligned.glsl" \
+ -o "${BUILTIN_DIR}/fill_unaligned.spv"
diff --git a/iree/hal/vulkan/builtin/fill_unaligned.glsl b/iree/hal/vulkan/builtin/fill_unaligned.glsl
new file mode 100644
index 0000000..9ba434e
--- /dev/null
+++ b/iree/hal/vulkan/builtin/fill_unaligned.glsl
@@ -0,0 +1,64 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#version 450
+
+// Polyfill for buffer fills that are not aligned to 4 byte offsets or lengths.
+// This only implements the unaligned edges of fill operations. vkCmdFillBuffer
+// should be used for the aligned interior (if any).
+// Writes |fill_pattern| (held in a 4 byte uint) into |output_elements|, between
+// |fill_offset_bytes| and |fill_offset_bytes| + |fill_length_bytes|. Each edge
+// word is stored whole: its bytes outside the fill range are written as zero.
+
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 3, binding = 0) buffer OutputBuffer { uint output_elements[]; };
+
+layout(push_constant) uniform Constants {
+ // TODO(scotttodd): low and high for 8 byte pattern
+ uint fill_pattern;
+ uint fill_pattern_width; // should be 1 or 2 (or 8 later on)
+ uint fill_offset_bytes; // must be aligned to pattern width
+ uint fill_length_bytes;
+} input_constants;
+
+void FillBufferUnalignedHelper(uint fill_offset_bytes, uint fill_length_bytes) {
+ uint fill_aligned_offset = fill_offset_bytes % 4;
+ uint fill_aligned_start_bytes = fill_offset_bytes - fill_aligned_offset;
+ uint fill_aligned_start_index = fill_aligned_start_bytes / 4;
+
+ uint shifted_pattern = 0x00000000;
+ if (input_constants.fill_pattern_width == 1) {
+ // Shift the pattern into each segment that is within the fill range.
+ uint fill_start = fill_aligned_offset;
+ uint fill_end = min(4, fill_start + fill_length_bytes);
+ for (uint i = fill_start; i < fill_end; ++i) {
+ shifted_pattern |= input_constants.fill_pattern << (8 * i);
+ }
+ } else if (input_constants.fill_pattern_width == 2) {
+ // Shift the pattern into the only supported segment in the fill range.
+ shifted_pattern = input_constants.fill_pattern << (8 * fill_aligned_offset);
+ }
+ output_elements[fill_aligned_start_index] = shifted_pattern;
+}
+
+void main() {
+ uint start_byte = input_constants.fill_offset_bytes;
+ uint end_byte =
+ input_constants.fill_offset_bytes + input_constants.fill_length_bytes;
+
+ // Unaligned start fill, if needed.
+ if (start_byte % 4 != 0 || input_constants.fill_length_bytes < 4) {
+ FillBufferUnalignedHelper(start_byte, input_constants.fill_length_bytes);
+ }
+ // Unaligned end fill, if needed.
+ if ((end_byte % 4 != 0) &&
+ (start_byte % 4 + input_constants.fill_length_bytes > 4)) {
+ uint end_rounded_down = (end_byte / 4) * 4;
+ uint length_end = end_byte - end_rounded_down;
+ FillBufferUnalignedHelper(end_rounded_down, length_end);
+ }
+}
diff --git a/iree/hal/vulkan/builtin/fill_unaligned.spv b/iree/hal/vulkan/builtin/fill_unaligned.spv
new file mode 100644
index 0000000..d457e5d
--- /dev/null
+++ b/iree/hal/vulkan/builtin/fill_unaligned.spv
Binary files differ
diff --git a/iree/hal/vulkan/builtin_executables.cc b/iree/hal/vulkan/builtin_executables.cc
new file mode 100644
index 0000000..3d6b918
--- /dev/null
+++ b/iree/hal/vulkan/builtin_executables.cc
@@ -0,0 +1,204 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/builtin_executables.h"
+
+#include <cstddef>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/builtin/builtin_shaders_spv.h"
+#include "iree/hal/vulkan/native_descriptor_set.h"
+#include "iree/hal/vulkan/native_descriptor_set_layout.h"
+#include "iree/hal/vulkan/native_executable_layout.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+typedef struct iree_hal_vulkan_builtin_fill_unaligned_constants_t {
+ uint32_t fill_pattern;
+ uint32_t fill_pattern_width;
+ uint32_t fill_offset_bytes;
+ uint32_t fill_length_bytes;
+} iree_hal_vulkan_builtin_fill_unaligned_constants_t;
+
+static_assert(sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t) ==
+ IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT,
+ "push constant count must match struct size");
+
+} // namespace
+
+BuiltinExecutables::BuiltinExecutables(VkDeviceHandle* logical_device)
+ : logical_device_(logical_device) {}
+
+BuiltinExecutables::~BuiltinExecutables() {
+ if (pipeline_ != VK_NULL_HANDLE) {
+ logical_device_->syms()->vkDestroyPipeline(*logical_device_, pipeline_,
+ logical_device_->allocator());
+ }
+
+ if (executable_layout_) {
+ iree_hal_executable_layout_destroy(executable_layout_);
+ }
+
+ for (size_t i = 0; i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT; ++i) {
+ iree_hal_descriptor_set_layout_release(descriptor_set_layouts_[i]);
+ }
+}
+
+iree_status_t BuiltinExecutables::InitializeExecutables() {
+ IREE_TRACE_SCOPE();
+
+ // Create descriptor set layouts for our compute pipeline.
+ // Even though we're just using one set, we still need to create layout
+ // bindings for those preceding it.
+ for (size_t i = 0; i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT; ++i) {
+ iree_hal_descriptor_set_layout_t* layout = NULL;
+ iree_hal_descriptor_set_layout_binding_t layout_binding;
+ layout_binding.binding = 0;
+ layout_binding.type = IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+ layout_binding.access = i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET
+ ? IREE_HAL_MEMORY_ACCESS_NONE
+ : IREE_HAL_MEMORY_ACCESS_WRITE;
+ IREE_RETURN_IF_ERROR(iree_hal_vulkan_native_descriptor_set_layout_create(
+ logical_device_,
+ i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET
+ ? IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE
+ : IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_PUSH_ONLY,
+ /*binding_count=*/1, &layout_binding, &layout));
+ descriptor_set_layouts_[i] = layout;
+ }
+
+ iree_status_t status = iree_ok_status();
+
+ // Create shader module.
+ VkShaderModule fill_unaligned_shader = VK_NULL_HANDLE;
+ if (iree_status_is_ok(status)) {
+ VkShaderModuleCreateInfo shader_create_info;
+ shader_create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+ shader_create_info.pNext = NULL;
+ shader_create_info.flags = 0;
+ shader_create_info.codeSize = builtin_shaders_spv_create()[0].size;
+ shader_create_info.pCode =
+ (const uint32_t*)builtin_shaders_spv_create()[0].data;
+ status = VK_RESULT_TO_STATUS(logical_device_->syms()->vkCreateShaderModule(
+ *logical_device_, &shader_create_info, logical_device_->allocator(),
+ &fill_unaligned_shader));
+ }
+
+ // Create pipeline layout.
+ if (iree_status_is_ok(status)) {
+ status = iree_hal_vulkan_native_executable_layout_create(
+ logical_device_, IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT / 4,
+ IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT, descriptor_set_layouts_,
+ &executable_layout_);
+ }
+
+ // Create pipeline.
+ if (iree_status_is_ok(status)) {
+ VkComputePipelineCreateInfo pipeline_create_info;
+ pipeline_create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+ pipeline_create_info.pNext = NULL;
+ pipeline_create_info.flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
+ pipeline_create_info.layout =
+ iree_hal_vulkan_native_executable_layout_handle(executable_layout_);
+ pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE;
+ pipeline_create_info.basePipelineIndex = 0;
+ VkPipelineShaderStageCreateInfo* stage_create_info =
+ &pipeline_create_info.stage;
+ stage_create_info->sType =
+ VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+ stage_create_info->pNext = NULL;
+ stage_create_info->flags = 0;
+ stage_create_info->stage = VK_SHADER_STAGE_COMPUTE_BIT;
+ stage_create_info->module = fill_unaligned_shader;
+ stage_create_info->pName = "main";
+ stage_create_info->pSpecializationInfo = NULL;
+ status =
+ VK_RESULT_TO_STATUS(logical_device_->syms()->vkCreateComputePipelines(
+ *logical_device_, /*pipeline_cache=*/VK_NULL_HANDLE,
+ /*pipeline_count=*/1, &pipeline_create_info,
+ logical_device_->allocator(), &pipeline_));
+ }
+
+ // Destroy shader module now that the pipeline is created.
+ if (fill_unaligned_shader != VK_NULL_HANDLE) {
+ logical_device_->syms()->vkDestroyShaderModule(
+ *logical_device_, fill_unaligned_shader, logical_device_->allocator());
+ }
+
+ return status;
+}
+
+iree_status_t BuiltinExecutables::FillBufferUnaligned(
+ VkCommandBuffer command_buffer, DescriptorSetArena* descriptor_set_arena,
+ iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+ iree_device_size_t length, const void* pattern,
+ iree_host_size_t pattern_length, const void* push_constants_to_restore) {
+ IREE_TRACE_SCOPE();
+
+ iree_hal_vulkan_builtin_fill_unaligned_constants_t constants;
+ switch (pattern_length) {
+ case 1:
+ constants.fill_pattern = *static_cast<const uint8_t*>(pattern);
+ break;
+ case 2:
+ constants.fill_pattern = *static_cast<const uint16_t*>(pattern);
+ break;
+ case 4:
+ constants.fill_pattern = *static_cast<const uint32_t*>(pattern);
+ break;
+ default:
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+ "pattern length (%" PRIhsz
+ ") is not a power of two or is too large",
+ pattern_length);
+ }
+
+ iree_hal_descriptor_set_binding_t binding;
+ binding.binding = 0;
+ binding.buffer = target_buffer;
+ binding.offset = 0;
+ binding.length = IREE_WHOLE_BUFFER;
+ IREE_RETURN_IF_ERROR(descriptor_set_arena->BindDescriptorSet(
+ command_buffer, executable_layout_,
+ IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET, /*binding_count=*/1, &binding));
+
+ logical_device_->syms()->vkCmdBindPipeline(
+ command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_);
+
+ constants.fill_pattern_width = pattern_length;
+ constants.fill_offset_bytes = target_offset;
+ constants.fill_length_bytes = length;
+ logical_device_->syms()->vkCmdPushConstants(
+ command_buffer,
+ iree_hal_vulkan_native_executable_layout_handle(executable_layout_),
+ VK_SHADER_STAGE_COMPUTE_BIT, /*offset=*/0,
+ sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t), &constants);
+
+ // TODO(scotttodd): insert memory barrier if we need to do dispatch<->dispatch
+ // synchronization. The barriers inserted normally by callers would be for
+ // transfer<->dispatch.
+
+ logical_device_->syms()->vkCmdDispatch(command_buffer, 1, 1, 1);
+
+ // Restore push constants.
+ logical_device_->syms()->vkCmdPushConstants(
+ command_buffer,
+ iree_hal_vulkan_native_executable_layout_handle(executable_layout_),
+ VK_SHADER_STAGE_COMPUTE_BIT, /*offset=*/0,
+ sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t),
+ push_constants_to_restore);
+
+ return iree_ok_status();
+}
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
diff --git a/iree/hal/vulkan/builtin_executables.h b/iree/hal/vulkan/builtin_executables.h
new file mode 100644
index 0000000..ea25102
--- /dev/null
+++ b/iree/hal/vulkan/builtin_executables.h
@@ -0,0 +1,69 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
+#define IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
+
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/descriptor_set_arena.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// The `maxBoundDescriptorSets` limit is 4 on many devices we support and we
+// want to avoid conflicts with what the compiler uses, so we'll expect the
+// compiler to have reserved the index 3 for our exclusive use.
+#define IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT 4
+#define IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET 3
+
+#define IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT 16
+
+class BuiltinExecutables {
+ public:
+ BuiltinExecutables(VkDeviceHandle* logical_device);
+ ~BuiltinExecutables();
+
+ const ref_ptr<DynamicSymbols>& syms() const {
+ return logical_device_->syms();
+ }
+
+ iree_status_t InitializeExecutables();
+
+ // Fills a buffer without 4 byte offset or length requirements.
+ //
+ // This only implements the unaligned edges of fills, vkCmdFillBuffer should
+ // be used for the aligned interior (if any).
+ //
+ // |push_constants_to_restore| will be pushed using vkCmdPushConstants over
+ // the bytes used by this call.
+ iree_status_t FillBufferUnaligned(
+ VkCommandBuffer command_buffer, DescriptorSetArena* descriptor_set_arena,
+ iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+ iree_device_size_t length, const void* pattern,
+ iree_host_size_t pattern_length, const void* push_constants_to_restore);
+
+ private:
+ VkDeviceHandle* logical_device_ = NULL;
+
+ iree_hal_descriptor_set_layout_t*
+ descriptor_set_layouts_[IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT] = {
+ NULL};
+ iree_hal_executable_layout_t* executable_layout_ = NULL;
+ VkPipeline pipeline_ = VK_NULL_HANDLE;
+};
+
+} // namespace vulkan
+} // namespace hal
+} // namespace iree
+
+#endif // IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
diff --git a/iree/hal/vulkan/descriptor_set_arena.cc b/iree/hal/vulkan/descriptor_set_arena.cc
index d605669..cefa6bc 100644
--- a/iree/hal/vulkan/descriptor_set_arena.cc
+++ b/iree/hal/vulkan/descriptor_set_arena.cc
@@ -42,27 +42,32 @@
iree_hal_buffer_allocated_buffer(binding.buffer));
buffer_info.offset =
iree_hal_buffer_byte_offset(binding.buffer) + binding.offset;
- // Round up to a multiple of 32-bit. 32-bit is the most native bitwidth on
- // GPUs; it has the best support compared to other bitwidths. We use VMA to
- // manage GPU memory for us and VMA should already handled proper alignment
- // when performing allocations; here we just need to provide the proper
- // "view" to Vulkan drivers over the allocated memory.
- //
- // Note this is needed because we can see unusal buffers like tensor<3xi8>.
- // Depending on GPU capabilities, this might not always be directly
- // supported by the hardware. Under such circumstances, we need to emulate
- // i8 support with i32. Shader CodeGen takes care of that: the shader will
- // read the buffer as tensor<i32> and perform bit shifts to extract each
- // byte and conduct computations. The extra additional byte is read but
- // not really used by the shader. Here in application we need to match the
- // ABI and provide the buffer as 32-bit aligned, otherwise the whole read by
- // the shader is considered as out of bounds per the Vulkan spec.
- // See https://github.com/google/iree/issues/2022#issuecomment-640617234
- // for more details.
- buffer_info.range = iree_device_align(
- std::min(binding.length,
- iree_hal_buffer_byte_length(binding.buffer) - binding.offset),
- 4);
+ if (binding.length == IREE_WHOLE_BUFFER) {
+ buffer_info.range = VK_WHOLE_SIZE;
+ } else {
+ // Round up to a multiple of 32-bit. 32-bit is the most native bitwidth on
+ // GPUs; it has the best support compared to other bitwidths. We use VMA
+ // to manage GPU memory for us and VMA should already handle proper
+ // alignment when performing allocations; here we just need to provide the
+ // proper "view" to Vulkan drivers over the allocated memory.
+ //
+ // Note this is needed because we can see unusual buffers like
+ // tensor<3xi8>. Depending on GPU capabilities, this might not always be
+ // directly supported by the hardware. Under such circumstances, we need
+ // to emulate i8 support with i32. Shader CodeGen takes care of that: the
+ // shader will read the buffer as tensor<i32> and perform bit shifts to
+ // extract each byte and conduct computations. The extra additional byte
+ // is read but not really used by the shader. Here in application we need
+ // to match the ABI and provide the buffer as 32-bit aligned, otherwise
+ // the whole read by the shader is considered as out of bounds per the
+ // Vulkan spec. See
+ // https://github.com/google/iree/issues/2022#issuecomment-640617234 for
+ // more details.
+ buffer_info.range = iree_device_align(
+ std::min(binding.length, iree_hal_buffer_byte_length(binding.buffer) -
+ binding.offset),
+ 4);
+ }
auto& write_info = write_infos[i];
write_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
diff --git a/iree/hal/vulkan/direct_command_buffer.cc b/iree/hal/vulkan/direct_command_buffer.cc
index 96f8eed..734fac1 100644
--- a/iree/hal/vulkan/direct_command_buffer.cc
+++ b/iree/hal/vulkan/direct_command_buffer.cc
@@ -50,6 +50,15 @@
// This must remain valid until all in-flight submissions of the command
// buffer complete.
DescriptorSetGroup descriptor_set_group;
+
+ BuiltinExecutables* builtin_executables;
+
+ // Shadow copy of push constants used during normal operation, for restoring
+ // after builtin_executables uses vkCmdPushConstants. Size must be greater
+ // than or equal to the push constant memory used by builtin_executables.
+ // TODO(scotttodd): use [maxPushConstantsSize - 16, maxPushConstantsSize]
+ // instead of [0, 16] to reduce frequency of updates
+ uint8_t push_constants_storage[IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT];
} iree_hal_vulkan_direct_command_buffer_t;
extern const iree_hal_command_buffer_vtable_t
@@ -71,6 +80,7 @@
iree_hal_queue_affinity_t queue_affinity,
iree_hal_vulkan_tracing_context_t* tracing_context,
iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
+ iree::hal::vulkan::BuiltinExecutables* builtin_executables,
iree_hal_command_buffer_t** out_command_buffer) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(command_pool);
@@ -109,6 +119,8 @@
DescriptorSetArena(descriptor_pool_cache);
new (&command_buffer->descriptor_set_group) DescriptorSetGroup();
+ command_buffer->builtin_executables = builtin_executables;
+
*out_command_buffer = (iree_hal_command_buffer_t*)command_buffer;
} else {
command_pool->Free(handle);
@@ -512,14 +524,47 @@
VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
iree_hal_buffer_allocated_buffer(target_buffer));
- // Note that fill only accepts 4-byte aligned values so we need to splat out
- // our variable-length pattern.
- target_offset += iree_hal_buffer_byte_offset(target_buffer);
- uint32_t dword_pattern =
- iree_hal_vulkan_splat_pattern(pattern, pattern_length);
- command_buffer->syms->vkCmdFillBuffer(command_buffer->handle,
- target_device_buffer, target_offset,
- length, dword_pattern);
+ // vkCmdFillBuffer requires a 4 byte alignment for the offset, pattern, and
+ // length. We use a polyfill here that fills the unaligned start and end of
+ // fill operations, if needed.
+
+ if (target_offset % 4 != 0 || length % 4 != 0) {
+ // TODO(scotttodd): only restore push constants that have been modified?
+ // (this can pass uninitialized memory right now, which
+ // *should* be safe but is wasteful)
+ IREE_RETURN_IF_ERROR(
+ command_buffer->builtin_executables->FillBufferUnaligned(
+ command_buffer->handle, &(command_buffer->descriptor_set_arena),
+ target_buffer, target_offset, length, pattern, pattern_length,
+ command_buffer->push_constants_storage));
+
+ // Continue using vkCmdFillBuffer below, but only for the inner aligned
+ // portion of the fill operation (empty if it spans no whole 4-byte word).
+ // For example:
+ // original offset 2, length 8
+ // aligned offset 4, length 4
+ // [0x00,0x00,0xAB,0xAB | 0xAB,0xAB,0xAB,0xAB | 0xAB,0xAB,0x00,0x00]
+ // <-------> <---------------------> <------->
+ // unaligned vkCmdFillBuffer unaligned
+ iree_device_size_t aligned_target_offset =
+ iree_device_align(target_offset, 4);
+ iree_device_size_t target_end = target_offset + length;
+ iree_device_size_t rounded_down_target_end = (target_end / 4) * 4;
+ length = rounded_down_target_end > aligned_target_offset
+ ? rounded_down_target_end - aligned_target_offset : 0;
+ target_offset = aligned_target_offset;
+ }
+
+ if (length > 0) {
+ // Note that vkCmdFillBuffer only accepts 4-byte aligned values so we need
+ // to splat out our variable-length pattern.
+ target_offset += iree_hal_buffer_byte_offset(target_buffer);
+ uint32_t dword_pattern =
+ iree_hal_vulkan_splat_pattern(pattern, pattern_length);
+ command_buffer->syms->vkCmdFillBuffer(command_buffer->handle,
+ target_device_buffer, target_offset,
+ length, dword_pattern);
+ }
return iree_ok_status();
}
@@ -584,6 +629,15 @@
iree_hal_vulkan_direct_command_buffer_t* command_buffer =
iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
+ // Mirror the update into the shadow copy, clamping the copy to the space
+ // left in the storage after |offset| so it cannot run past the array.
+ iree_host_size_t storage_size =
+ IREE_ARRAYSIZE(command_buffer->push_constants_storage);
+ if (offset < storage_size) {
+ memcpy(command_buffer->push_constants_storage + offset, values,
+ std::min(values_length, storage_size - offset));
+ }
+
command_buffer->syms->vkCmdPushConstants(
command_buffer->handle,
iree_hal_vulkan_native_executable_layout_handle(executable_layout),
diff --git a/iree/hal/vulkan/direct_command_buffer.h b/iree/hal/vulkan/direct_command_buffer.h
index cc1d097..606e859 100644
--- a/iree/hal/vulkan/direct_command_buffer.h
+++ b/iree/hal/vulkan/direct_command_buffer.h
@@ -9,6 +9,7 @@
#include "iree/base/api.h"
#include "iree/hal/api.h"
+#include "iree/hal/vulkan/builtin_executables.h"
#include "iree/hal/vulkan/descriptor_pool_cache.h"
#include "iree/hal/vulkan/handle_util.h"
#include "iree/hal/vulkan/tracing.h"
@@ -26,6 +27,7 @@
iree_hal_queue_affinity_t queue_affinity,
iree_hal_vulkan_tracing_context_t* tracing_context,
iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
+ iree::hal::vulkan::BuiltinExecutables* builtin_executables,
iree_hal_command_buffer_t** out_command_buffer);
// Returns the native Vulkan VkCommandBuffer handle.
diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc
index 953abba..c5a7873 100644
--- a/iree/hal/vulkan/vulkan_device.cc
+++ b/iree/hal/vulkan/vulkan_device.cc
@@ -14,6 +14,7 @@
#include "iree/base/internal/math.h"
#include "iree/base/tracing.h"
#include "iree/hal/vulkan/api.h"
+#include "iree/hal/vulkan/builtin_executables.h"
#include "iree/hal/vulkan/command_queue.h"
#include "iree/hal/vulkan/descriptor_pool_cache.h"
#include "iree/hal/vulkan/direct_command_buffer.h"
@@ -363,6 +364,8 @@
// Used only for emulated timeline semaphores.
TimePointSemaphorePool* semaphore_pool;
TimePointFencePool* fence_pool;
+
+ BuiltinExecutables* builtin_executables;
} iree_hal_vulkan_device_t;
extern const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable;
@@ -622,6 +625,12 @@
}
if (iree_status_is_ok(status)) {
+ device->builtin_executables =
+ new BuiltinExecutables(device->logical_device);
+ status = device->builtin_executables->InitializeExecutables();
+ }
+
+ if (iree_status_is_ok(status)) {
*out_device = (iree_hal_device_t*)device;
} else {
iree_hal_device_destroy((iree_hal_device_t*)device);
@@ -647,6 +656,7 @@
// Now that no commands are outstanding we can release all resources that may
// have been in use.
+ delete device->builtin_executables;
delete device->descriptor_pool_cache;
delete device->semaphore_pool;
delete device->fence_pool;
@@ -930,6 +940,12 @@
iree_hal_vulkan_device_t* device,
iree_hal_command_category_t command_categories,
iree_hal_queue_affinity_t queue_affinity) {
+ // TODO(scotttodd): revisit queue selection logic and remove this
+ // * the unaligned buffer fill polyfill and tracing timestamp queries may
+ // both insert dispatches into command buffers that at compile time are
+ // expected to only contain transfer commands
+ // * we could set a bit at recording time if emulation or tracing is used
+ // and submit to the right queue based on that
command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
// TODO(benvanik): meaningful heuristics for affinity. We don't generate
@@ -949,6 +965,12 @@
iree_hal_command_buffer_t** out_command_buffer) {
iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
+ // TODO(scotttodd): revisit queue selection logic and remove this
+ // * the unaligned buffer fill polyfill and tracing timestamp queries may
+ // both insert dispatches into command buffers that at compile time are
+ // expected to only contain transfer commands
+ // * we could set a bit at recording time if emulation or tracing is used
+ // and submit to the right queue based on that
command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
// Select the command pool to used based on the types of commands used.
@@ -974,7 +996,7 @@
return iree_hal_vulkan_direct_command_buffer_allocate(
device->logical_device, command_pool, mode, command_categories,
queue_affinity, queue->tracing_context(), device->descriptor_pool_cache,
- out_command_buffer);
+ device->builtin_executables, out_command_buffer);
}
static iree_status_t iree_hal_vulkan_device_create_descriptor_set(