Polyfill non-4-byte-aligned fills in the Vulkan HAL. (#7317)

This fixes https://github.com/google/iree/issues/7228 by introducing a new `BuiltinExecutables` class under the Vulkan HAL that uses a 1x1x1 dispatch to fill any unaligned bytes within a buffer fill operation. Aligned regions of buffer fills will still use the native `vkCmdFillBuffer`. New CTS test cases provide coverage for each [pattern_size x offset x fill_length] permutation.

Inserting a dispatch into a command buffer (that was expecting to just have a native `vkCmdFillBuffer` transfer operation inside it) requires some special handling:

* Descriptor set layout index 3 is used (Android devices typically support only 4 bound descriptor sets, so this is the last slot) to avoid overlap with descriptor set layouts used by the compiler.
* Push constants are tracked for each command buffer so when this dispatch uses its own push constants it can restore from that backup.
* Command buffers that may go down this polyfill path need to run on a queue compatible with dispatch operations (i.e. not transfer exclusive). We had some ideas for adding a compiler analysis and attribute for "will only contain aligned fills", but without that I'm just defaulting to `command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH` where needed.

---

I have not yet tested this in a real program and there are still a few TODOs for optimal performance and correct synchronization behavior.
diff --git a/iree/hal/cts/command_buffer_test.cc b/iree/hal/cts/command_buffer_test.cc
index 234bc8e..bd9a1cd 100644
--- a/iree/hal/cts/command_buffer_test.cc
+++ b/iree/hal/cts/command_buffer_test.cc
@@ -37,6 +37,64 @@
   }
 
  protected:
+  std::vector<uint8_t> RunFillBufferTest(iree_device_size_t buffer_size,
+                                         iree_device_size_t target_offset,
+                                         iree_device_size_t fill_length,
+                                         const void* pattern,
+                                         iree_host_size_t pattern_length) {
+    iree_hal_command_buffer_t* command_buffer;
+    IREE_CHECK_OK(iree_hal_command_buffer_create(
+        device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+        IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
+        &command_buffer));
+    iree_hal_buffer_t* device_buffer;
+    IREE_CHECK_OK(iree_hal_allocator_allocate_buffer(
+        iree_hal_device_allocator(device_),
+        IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
+        IREE_HAL_BUFFER_USAGE_ALL, buffer_size, &device_buffer));
+
+    IREE_CHECK_OK(iree_hal_command_buffer_begin(command_buffer));
+    // Start with a zero fill on the entire buffer...
+    uint8_t zero_val = 0x0;
+    IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
+        command_buffer, device_buffer, /*target_offset=*/0,
+        /*length=*/buffer_size, &zero_val,
+        /*pattern_length=*/sizeof(zero_val)));
+    // (buffer barrier between the fill operations)
+    iree_hal_buffer_barrier_t buffer_barrier;
+    buffer_barrier.source_scope = IREE_HAL_ACCESS_SCOPE_TRANSFER_WRITE;
+    buffer_barrier.target_scope = IREE_HAL_ACCESS_SCOPE_TRANSFER_WRITE |
+                                  IREE_HAL_ACCESS_SCOPE_DISPATCH_WRITE;
+    buffer_barrier.buffer = device_buffer;
+    buffer_barrier.offset = 0;
+    buffer_barrier.length = buffer_size;
+    IREE_CHECK_OK(iree_hal_command_buffer_execution_barrier(
+        command_buffer, IREE_HAL_EXECUTION_STAGE_TRANSFER,
+        IREE_HAL_EXECUTION_STAGE_TRANSFER | IREE_HAL_EXECUTION_STAGE_DISPATCH,
+        IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, /*memory_barrier_count=*/0, NULL,
+        /*buffer_barrier_count=*/1, &buffer_barrier));
+    // ... then fill the pattern on top.
+    IREE_CHECK_OK(iree_hal_command_buffer_fill_buffer(
+        command_buffer, device_buffer,
+        /*target_offset=*/target_offset, /*length=*/fill_length,
+        /*pattern=*/pattern,
+        /*pattern_length=*/pattern_length));
+    IREE_CHECK_OK(iree_hal_command_buffer_end(command_buffer));
+    IREE_CHECK_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_ANY,
+                                             command_buffer));
+
+    std::vector<uint8_t> actual_data(buffer_size);
+    IREE_CHECK_OK(
+        iree_hal_buffer_read_data(device_buffer, /*source_offset=*/0,
+                                  /*target_buffer=*/actual_data.data(),
+                                  /*data_length=*/buffer_size));
+
+    iree_hal_command_buffer_release(command_buffer);
+    iree_hal_buffer_release(device_buffer);
+
+    return actual_data;
+  }
+
   static constexpr iree_device_size_t kBufferSize = 4096;
 };
 
@@ -83,67 +141,6 @@
   iree_hal_command_buffer_release(command_buffer);
 }
 
-TEST_P(CommandBufferTest, FillBufferWithRepeatedBytes) {
-  iree_hal_command_buffer_t* command_buffer;
-  IREE_ASSERT_OK(iree_hal_command_buffer_create(
-      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
-      IREE_HAL_COMMAND_CATEGORY_TRANSFER, IREE_HAL_QUEUE_AFFINITY_ANY,
-      &command_buffer));
-
-  iree_hal_buffer_t* device_buffer;
-  IREE_ASSERT_OK(iree_hal_allocator_allocate_buffer(
-      device_allocator_,
-      IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE,
-      IREE_HAL_BUFFER_USAGE_ALL, kBufferSize, &device_buffer));
-
-  std::vector<uint8_t> reference_buffer(kBufferSize);
-
-  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
-
-  // Fill the device buffer with segments of different values so that we can
-  // test both fill and offset/size.
-  uint8_t val1 = 0x07;
-  IREE_ASSERT_OK(iree_hal_command_buffer_fill_buffer(
-      command_buffer, device_buffer,
-      /*target_offset=*/0, /*length=*/kBufferSize / 4, /*pattern=*/&val1,
-      /*pattern_length=*/sizeof(val1)));
-  std::memset(reference_buffer.data(), val1, kBufferSize / 4);
-
-  uint8_t val2 = 0xbe;
-  IREE_ASSERT_OK(
-      iree_hal_command_buffer_fill_buffer(command_buffer, device_buffer,
-                                          /*target_offset=*/kBufferSize / 4,
-                                          /*length=*/kBufferSize / 4,
-                                          /*pattern=*/&val2,
-                                          /*pattern_length=*/sizeof(val2)));
-  std::memset(reference_buffer.data() + kBufferSize / 4, val2, kBufferSize / 4);
-
-  uint8_t val3 = 0x54;
-  IREE_ASSERT_OK(
-      iree_hal_command_buffer_fill_buffer(command_buffer, device_buffer,
-                                          /*target_offset=*/kBufferSize / 2,
-                                          /*length=*/kBufferSize / 2,
-                                          /*pattern=*/&val3,
-                                          /*pattern_length=*/sizeof(val3)));
-  std::memset(reference_buffer.data() + kBufferSize / 2, val3, kBufferSize / 2);
-
-  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
-
-  IREE_ASSERT_OK(SubmitCommandBufferAndWait(IREE_HAL_COMMAND_CATEGORY_TRANSFER,
-                                            command_buffer));
-
-  // Read the device buffer and compare.
-  std::vector<uint8_t> actual_data(kBufferSize);
-  IREE_ASSERT_OK(iree_hal_buffer_read_data(device_buffer, /*source_offset=*/0,
-                                           /*target_buffer=*/actual_data.data(),
-                                           /*data_length=*/kBufferSize));
-  EXPECT_THAT(actual_data, ContainerEq(reference_buffer));
-
-  // Must release the command buffer before resources used by it.
-  iree_hal_command_buffer_release(command_buffer);
-  iree_hal_buffer_release(device_buffer);
-}
-
 TEST_P(CommandBufferTest, CopyWholeBuffer) {
   iree_hal_command_buffer_t* command_buffer;
   IREE_ASSERT_OK(iree_hal_command_buffer_create(
@@ -257,6 +254,126 @@
   iree_hal_buffer_release(host_buffer);
 }
 
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset0_length1) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 1;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset0_length3) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 3;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07, 0x07, 0x07, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset0_length8) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 8;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x07, 0x07, 0x07, 0x07,  //
+                                        0x07, 0x07, 0x07, 0x07,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern1_offset2_length8) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 2;
+  iree_device_size_t fill_length = 8;
+  uint8_t pattern = 0x07;
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x07, 0x07,  //
+                                        0x07, 0x07, 0x07, 0x07,  //
+                                        0x07, 0x07, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern2_offset0_length8) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 8;
+  uint16_t pattern = 0xAB23;
+  std::vector<uint8_t> reference_buffer{0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern2_offset0_length10) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 10;
+  uint16_t pattern = 0xAB23;
+  std::vector<uint8_t> reference_buffer{0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern2_offset2_length8) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 2;
+  iree_device_size_t fill_length = 8;
+  uint16_t pattern = 0xAB23;
+  std::vector<uint8_t> reference_buffer{0x00, 0x00, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x23, 0xAB,  //
+                                        0x23, 0xAB, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
+TEST_P(CommandBufferTest, FillBuffer_pattern4_offset0_length8) {
+  iree_device_size_t buffer_size = 16;
+  iree_device_size_t target_offset = 0;
+  iree_device_size_t fill_length = 8;
+  uint32_t pattern = 0xAB23CD45;
+  std::vector<uint8_t> reference_buffer{0x45, 0xCD, 0x23, 0xAB,  //
+                                        0x45, 0xCD, 0x23, 0xAB,  //
+                                        0x00, 0x00, 0x00, 0x00,  //
+                                        0x00, 0x00, 0x00, 0x00};
+  std::vector<uint8_t> actual_buffer =
+      RunFillBufferTest(buffer_size, target_offset, fill_length,
+                        (void*)&pattern, sizeof(pattern));
+  EXPECT_THAT(actual_buffer, ContainerEq(reference_buffer));
+}
+
 INSTANTIATE_TEST_SUITE_P(
     AllDrivers, CommandBufferTest,
     ::testing::ValuesIn(testing::EnumerateAvailableDrivers()),
diff --git a/iree/hal/vulkan/BUILD b/iree/hal/vulkan/BUILD
index 97e243a..d531e68 100644
--- a/iree/hal/vulkan/BUILD
+++ b/iree/hal/vulkan/BUILD
@@ -26,6 +26,8 @@
     name = "vulkan",
     srcs = [
         "api.cc",
+        "builtin_executables.cc",
+        "builtin_executables.h",
         "command_queue.h",
         "debug_reporter.cc",
         "debug_reporter.h",
@@ -92,6 +94,7 @@
         "//iree/base/internal:synchronization",
         "//iree/base/internal/flatcc:parsing",
         "//iree/hal",
+        "//iree/hal/vulkan/builtin",
         "//iree/hal/vulkan/util:arena",
         "//iree/hal/vulkan/util:intrusive_list",
         "//iree/hal/vulkan/util:ref_ptr",
diff --git a/iree/hal/vulkan/CMakeLists.txt b/iree/hal/vulkan/CMakeLists.txt
index 8126033..caee945 100644
--- a/iree/hal/vulkan/CMakeLists.txt
+++ b/iree/hal/vulkan/CMakeLists.txt
@@ -23,6 +23,8 @@
     "vulkan_driver.h"
   SRCS
     "api.cc"
+    "builtin_executables.cc"
+    "builtin_executables.h"
     "command_queue.h"
     "debug_reporter.cc"
     "debug_reporter.h"
@@ -82,6 +84,7 @@
     iree::base::logging
     iree::base::tracing
     iree::hal
+    iree::hal::vulkan::builtin
     iree::hal::vulkan::util::arena
     iree::hal::vulkan::util::intrusive_list
     iree::hal::vulkan::util::ref_ptr
diff --git a/iree/hal/vulkan/builtin/BUILD b/iree/hal/vulkan/builtin/BUILD
new file mode 100644
index 0000000..083e92f
--- /dev/null
+++ b/iree/hal/vulkan/builtin/BUILD
@@ -0,0 +1,24 @@
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+load("//build_tools/embed_data:build_defs.bzl", "c_embed_data")
+
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["layering_check"],
+    licenses = ["notice"],  # Apache 2.0
+)
+
+c_embed_data(
+    name = "builtin",
+    srcs = [
+        "fill_unaligned.spv",
+    ],
+    c_file_output = "builtin_shaders_spv.c",
+    flatten = True,
+    h_file_output = "builtin_shaders_spv.h",
+    identifier = "builtin_shaders_spv",
+)
diff --git a/iree/hal/vulkan/builtin/CMakeLists.txt b/iree/hal/vulkan/builtin/CMakeLists.txt
new file mode 100644
index 0000000..2514196
--- /dev/null
+++ b/iree/hal/vulkan/builtin/CMakeLists.txt
@@ -0,0 +1,28 @@
+################################################################################
+# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
+# iree/hal/vulkan/builtin/BUILD                                                #
+#                                                                              #
+# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
+# CMake-only content.                                                          #
+#                                                                              #
+# To disable autogeneration for this file entirely, delete this header.        #
+################################################################################
+
+iree_add_all_subdirs()
+
+iree_c_embed_data(
+  NAME
+    builtin
+  SRCS
+    "fill_unaligned.spv"
+  C_FILE_OUTPUT
+    "builtin_shaders_spv.c"
+  H_FILE_OUTPUT
+    "builtin_shaders_spv.h"
+  IDENTIFIER
+    "builtin_shaders_spv"
+  FLATTEN
+  PUBLIC
+)
+
+### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/iree/hal/vulkan/builtin/compile_shaders.sh b/iree/hal/vulkan/builtin/compile_shaders.sh
new file mode 100644
index 0000000..fd5f571
--- /dev/null
+++ b/iree/hal/vulkan/builtin/compile_shaders.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright 2021 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Compiles input .glsl files into output .spv binary files. As these files are
+# updated infrequently and their binary sizes are small, we check in both files
+# and don't take a hard dependency on the shader compiler tool.
+#
+# To use, ensure `glslc` is on your PATH (such as by installing the Vulkan SDK
+# or building it from its source at https://github.com/google/shaderc) and run
+# the script.
+
+set -e
+set -x
+
+BUILTIN_DIR="$(dirname $0)"
+
+glslc \
+  -Os -fshader-stage=compute -mfmt=bin \
+  ${BUILTIN_DIR}/fill_unaligned.glsl \
+  -o ${BUILTIN_DIR}/fill_unaligned.spv
diff --git a/iree/hal/vulkan/builtin/fill_unaligned.glsl b/iree/hal/vulkan/builtin/fill_unaligned.glsl
new file mode 100644
index 0000000..9ba434e
--- /dev/null
+++ b/iree/hal/vulkan/builtin/fill_unaligned.glsl
@@ -0,0 +1,64 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#version 450
+
+// Polyfill for buffer fills that are not aligned to 4 byte offsets or lengths.
+// This only implements the unaligned edges of fill operations. vkCmdFillBuffer
+// should be used for the aligned interior (if any).
+//
+// Repeats the 4 byte value |fill_pattern| into |output_elements|, between
+// |fill_offset_bytes| and |fill_offset_bytes| + |fill_length_bytes|.
+
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+layout(set = 3, binding = 0) buffer OutputBuffer { uint output_elements[]; };
+
+layout(push_constant) uniform Constants {
+  // TODO(scotttodd): low and high for 8 byte pattern
+  uint fill_pattern;
+  uint fill_pattern_width;  // should be 1 or 2 (or 8 later on)
+  uint fill_offset_bytes;   // must be aligned to pattern width
+  uint fill_length_bytes;
+} input_constants;
+
+void FillBufferUnalignedHelper(uint fill_offset_bytes, uint fill_length_bytes) {
+  uint fill_aligned_offset = fill_offset_bytes % 4;
+  uint fill_aligned_start_bytes = fill_offset_bytes - fill_aligned_offset;
+  uint fill_aligned_start_index = fill_aligned_start_bytes / 4;
+
+  uint shifted_pattern = 0x00000000;
+  if (input_constants.fill_pattern_width == 1) {
+    // Shift the pattern into each segment that is within the fill range.
+    uint fill_start = fill_aligned_offset;
+    uint fill_end = min(4, fill_start + fill_length_bytes);
+    for (uint i = fill_start; i < fill_end; ++i) {
+      shifted_pattern |= input_constants.fill_pattern << (8 * i);
+    }
+  } else if (input_constants.fill_pattern_width == 2) {
+    // Shift the pattern into the only supported segment in the fill range.
+    shifted_pattern = input_constants.fill_pattern << (8 * fill_aligned_offset);
+  }
+  output_elements[fill_aligned_start_index] = shifted_pattern;
+}
+
+void main() {
+  uint start_byte = input_constants.fill_offset_bytes;
+  uint end_byte =
+      input_constants.fill_offset_bytes + input_constants.fill_length_bytes;
+
+  // Unaligned start fill, if needed.
+  if (start_byte % 4 != 0 || input_constants.fill_length_bytes < 4) {
+    FillBufferUnalignedHelper(start_byte, input_constants.fill_length_bytes);
+  }
+  // Unaligned end fill, if needed.
+  if ((end_byte % 4 != 0) &&
+      (start_byte % 4 + input_constants.fill_length_bytes > 4)) {
+    uint end_rounded_down = (end_byte / 4) * 4;
+    uint length_end = end_byte - end_rounded_down;
+    FillBufferUnalignedHelper(end_rounded_down, length_end);
+  }
+}
diff --git a/iree/hal/vulkan/builtin/fill_unaligned.spv b/iree/hal/vulkan/builtin/fill_unaligned.spv
new file mode 100644
index 0000000..d457e5d
--- /dev/null
+++ b/iree/hal/vulkan/builtin/fill_unaligned.spv
Binary files differ
diff --git a/iree/hal/vulkan/builtin_executables.cc b/iree/hal/vulkan/builtin_executables.cc
new file mode 100644
index 0000000..3d6b918
--- /dev/null
+++ b/iree/hal/vulkan/builtin_executables.cc
@@ -0,0 +1,204 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/hal/vulkan/builtin_executables.h"
+
+#include <cstddef>
+
+#include "iree/base/tracing.h"
+#include "iree/hal/vulkan/builtin/builtin_shaders_spv.h"
+#include "iree/hal/vulkan/native_descriptor_set.h"
+#include "iree/hal/vulkan/native_descriptor_set_layout.h"
+#include "iree/hal/vulkan/native_executable_layout.h"
+#include "iree/hal/vulkan/status_util.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+namespace {
+
+typedef struct iree_hal_vulkan_builtin_fill_unaligned_constants_t {
+  uint32_t fill_pattern;
+  uint32_t fill_pattern_width;
+  uint32_t fill_offset_bytes;
+  uint32_t fill_length_bytes;
+} iree_hal_vulkan_builtin_fill_unaligned_constants_t;
+
+static_assert(sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t) ==
+                  IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT,
+              "push constant count must match struct size");
+
+}  // namespace
+
+BuiltinExecutables::BuiltinExecutables(VkDeviceHandle* logical_device)
+    : logical_device_(logical_device) {}
+
+BuiltinExecutables::~BuiltinExecutables() {
+  if (pipeline_ != VK_NULL_HANDLE) {
+    logical_device_->syms()->vkDestroyPipeline(*logical_device_, pipeline_,
+                                               logical_device_->allocator());
+  }
+
+  if (executable_layout_) {
+    iree_hal_executable_layout_destroy(executable_layout_);
+  }
+
+  for (size_t i = 0; i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT; ++i) {
+    iree_hal_descriptor_set_layout_release(descriptor_set_layouts_[i]);
+  }
+}
+
+iree_status_t BuiltinExecutables::InitializeExecutables() {
+  IREE_TRACE_SCOPE();
+
+  // Create descriptor set layouts for our compute pipeline.
+  // Even though we're just using one set, we still need to create layout
+  // bindings for those preceding it.
+  for (size_t i = 0; i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT; ++i) {
+    iree_hal_descriptor_set_layout_t* layout = NULL;
+    iree_hal_descriptor_set_layout_binding_t layout_binding;
+    layout_binding.binding = 0;
+    layout_binding.type = IREE_HAL_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    layout_binding.access = i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET
+                                ? IREE_HAL_MEMORY_ACCESS_NONE
+                                : IREE_HAL_MEMORY_ACCESS_WRITE;
+    IREE_RETURN_IF_ERROR(iree_hal_vulkan_native_descriptor_set_layout_create(
+        logical_device_,
+        i < IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET
+            ? IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_IMMUTABLE
+            : IREE_HAL_DESCRIPTOR_SET_LAYOUT_USAGE_TYPE_PUSH_ONLY,
+        /*binding_count=*/1, &layout_binding, &layout));
+    descriptor_set_layouts_[i] = layout;
+  }
+
+  iree_status_t status = iree_ok_status();
+
+  // Create shader module.
+  VkShaderModule fill_unaligned_shader = VK_NULL_HANDLE;
+  if (iree_status_is_ok(status)) {
+    VkShaderModuleCreateInfo shader_create_info;
+    shader_create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+    shader_create_info.pNext = NULL;
+    shader_create_info.flags = 0;
+    shader_create_info.codeSize = builtin_shaders_spv_create()[0].size;
+    shader_create_info.pCode =
+        (const uint32_t*)builtin_shaders_spv_create()[0].data;
+    status = VK_RESULT_TO_STATUS(logical_device_->syms()->vkCreateShaderModule(
+        *logical_device_, &shader_create_info, logical_device_->allocator(),
+        &fill_unaligned_shader));
+  }
+
+  // Create pipeline layout.
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_native_executable_layout_create(
+        logical_device_, IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT / 4,
+        IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT, descriptor_set_layouts_,
+        &executable_layout_);
+  }
+
+  // Create pipeline.
+  if (iree_status_is_ok(status)) {
+    VkComputePipelineCreateInfo pipeline_create_info;
+    pipeline_create_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipeline_create_info.pNext = NULL;
+    pipeline_create_info.flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT;
+    pipeline_create_info.layout =
+        iree_hal_vulkan_native_executable_layout_handle(executable_layout_);
+    pipeline_create_info.basePipelineHandle = VK_NULL_HANDLE;
+    pipeline_create_info.basePipelineIndex = 0;
+    VkPipelineShaderStageCreateInfo* stage_create_info =
+        &pipeline_create_info.stage;
+    stage_create_info->sType =
+        VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    stage_create_info->pNext = NULL;
+    stage_create_info->flags = 0;
+    stage_create_info->stage = VK_SHADER_STAGE_COMPUTE_BIT;
+    stage_create_info->module = fill_unaligned_shader;
+    stage_create_info->pName = "main";
+    stage_create_info->pSpecializationInfo = NULL;
+    status =
+        VK_RESULT_TO_STATUS(logical_device_->syms()->vkCreateComputePipelines(
+            *logical_device_, /*pipeline_cache=*/VK_NULL_HANDLE,
+            /*pipeline_count=*/1, &pipeline_create_info,
+            logical_device_->allocator(), &pipeline_));
+  }
+
+  // Destroy shader module now that the pipeline is created.
+  if (fill_unaligned_shader != VK_NULL_HANDLE) {
+    logical_device_->syms()->vkDestroyShaderModule(
+        *logical_device_, fill_unaligned_shader, logical_device_->allocator());
+  }
+
+  return status;
+}
+
+iree_status_t BuiltinExecutables::FillBufferUnaligned(
+    VkCommandBuffer command_buffer, DescriptorSetArena* descriptor_set_arena,
+    iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+    iree_device_size_t length, const void* pattern,
+    iree_host_size_t pattern_length, const void* push_constants_to_restore) {
+  IREE_TRACE_SCOPE();
+
+  iree_hal_vulkan_builtin_fill_unaligned_constants_t constants;
+  switch (pattern_length) {
+    case 1:
+      constants.fill_pattern = *static_cast<const uint8_t*>(pattern);
+      break;
+    case 2:
+      constants.fill_pattern = *static_cast<const uint16_t*>(pattern);
+      break;
+    case 4:
+      constants.fill_pattern = *static_cast<const uint32_t*>(pattern);
+      break;
+    default:
+      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                              "pattern length (%" PRIhsz
+                              ") is not a power of two or is too large",
+                              pattern_length);
+  }
+
+  iree_hal_descriptor_set_binding_t binding;
+  binding.binding = 0;
+  binding.buffer = target_buffer;
+  binding.offset = 0;
+  binding.length = IREE_WHOLE_BUFFER;
+  IREE_RETURN_IF_ERROR(descriptor_set_arena->BindDescriptorSet(
+      command_buffer, executable_layout_,
+      IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET, /*binding_count=*/1, &binding));
+
+  logical_device_->syms()->vkCmdBindPipeline(
+      command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_);
+
+  constants.fill_pattern_width = pattern_length;
+  constants.fill_offset_bytes = target_offset;
+  constants.fill_length_bytes = length;
+  logical_device_->syms()->vkCmdPushConstants(
+      command_buffer,
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout_),
+      VK_SHADER_STAGE_COMPUTE_BIT, /*offset=*/0,
+      sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t), &constants);
+
+  // TODO(scotttodd): insert memory barrier if we need to do dispatch<->dispatch
+  //   synchronization. The barriers inserted normally by callers would be for
+  //   transfer<->dispatch.
+
+  logical_device_->syms()->vkCmdDispatch(command_buffer, 1, 1, 1);
+
+  // Restore push constants.
+  logical_device_->syms()->vkCmdPushConstants(
+      command_buffer,
+      iree_hal_vulkan_native_executable_layout_handle(executable_layout_),
+      VK_SHADER_STAGE_COMPUTE_BIT, /*offset=*/0,
+      sizeof(iree_hal_vulkan_builtin_fill_unaligned_constants_t),
+      push_constants_to_restore);
+
+  return iree_ok_status();
+}
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
diff --git a/iree/hal/vulkan/builtin_executables.h b/iree/hal/vulkan/builtin_executables.h
new file mode 100644
index 0000000..ea25102
--- /dev/null
+++ b/iree/hal/vulkan/builtin_executables.h
@@ -0,0 +1,69 @@
+// Copyright 2021 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
+#define IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
+
+#include <vector>
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/vulkan/descriptor_set_arena.h"
+#include "iree/hal/vulkan/dynamic_symbols.h"
+#include "iree/hal/vulkan/handle_util.h"
+#include "iree/hal/vulkan/util/ref_ptr.h"
+
+namespace iree {
+namespace hal {
+namespace vulkan {
+
+// The `maxBoundDescriptorSets` limit is 4 on many devices we support and we
+// want to avoid conflicts with what the compiler uses, so we'll expect the
+// compiler to have reserved the index 3 for our exclusive use.
+#define IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT 4
+#define IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET 3
+
+#define IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT 16
+
+class BuiltinExecutables {
+ public:
+  BuiltinExecutables(VkDeviceHandle* logical_device);
+  ~BuiltinExecutables();
+
+  const ref_ptr<DynamicSymbols>& syms() const {
+    return logical_device_->syms();
+  }
+
+  iree_status_t InitializeExecutables();
+
+  // Fills a buffer without 4 byte offset or length requirements.
+  //
+  // This only implements the unaligned edges of fills, vkCmdFillBuffer should
+  // be used for the aligned interior (if any).
+  //
+  // |push_constants_to_restore| will be pushed using vkCmdPushConstants over
+  // the bytes used by this call.
+  iree_status_t FillBufferUnaligned(
+      VkCommandBuffer command_buffer, DescriptorSetArena* descriptor_set_arena,
+      iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
+      iree_device_size_t length, const void* pattern,
+      iree_host_size_t pattern_length, const void* push_constants_to_restore);
+
+ private:
+  VkDeviceHandle* logical_device_ = NULL;
+
+  iree_hal_descriptor_set_layout_t*
+      descriptor_set_layouts_[IREE_HAL_VULKAN_BUILTIN_DESCRIPTOR_SET_COUNT] = {
+          NULL};
+  iree_hal_executable_layout_t* executable_layout_ = NULL;
+  VkPipeline pipeline_ = VK_NULL_HANDLE;
+};
+
+}  // namespace vulkan
+}  // namespace hal
+}  // namespace iree
+
+#endif  // IREE_HAL_VULKAN_BUILTIN_EXECUTABLES_H_
diff --git a/iree/hal/vulkan/descriptor_set_arena.cc b/iree/hal/vulkan/descriptor_set_arena.cc
index d605669..cefa6bc 100644
--- a/iree/hal/vulkan/descriptor_set_arena.cc
+++ b/iree/hal/vulkan/descriptor_set_arena.cc
@@ -42,27 +42,32 @@
         iree_hal_buffer_allocated_buffer(binding.buffer));
     buffer_info.offset =
         iree_hal_buffer_byte_offset(binding.buffer) + binding.offset;
-    // Round up to a multiple of 32-bit. 32-bit is the most native bitwidth on
-    // GPUs; it has the best support compared to other bitwidths. We use VMA to
-    // manage GPU memory for us and VMA should already handled proper alignment
-    // when performing allocations; here we just need to provide the proper
-    // "view" to Vulkan drivers over the allocated memory.
-    //
-    // Note this is needed because we can see unusal buffers like tensor<3xi8>.
-    // Depending on GPU capabilities, this might not always be directly
-    // supported by the hardware. Under such circumstances, we need to emulate
-    // i8 support with i32. Shader CodeGen takes care of that: the shader will
-    // read the buffer as tensor<i32> and perform bit shifts to extract each
-    // byte and conduct computations. The extra additional byte is read but
-    // not really used by the shader. Here in application we need to match the
-    // ABI and provide the buffer as 32-bit aligned, otherwise the whole read by
-    // the shader is considered as out of bounds per the Vulkan spec.
-    // See https://github.com/google/iree/issues/2022#issuecomment-640617234
-    // for more details.
-    buffer_info.range = iree_device_align(
-        std::min(binding.length,
-                 iree_hal_buffer_byte_length(binding.buffer) - binding.offset),
-        4);
+    if (binding.length == IREE_WHOLE_BUFFER) {
+      buffer_info.range = VK_WHOLE_SIZE;
+    } else {
+      // Round up to a multiple of 32-bit. 32-bit is the most native bitwidth on
+      // GPUs; it has the best support compared to other bitwidths. We use VMA
+      // to manage GPU memory for us and VMA should already have handled proper
+      // alignment when performing allocations; here we just need to provide the
+      // proper "view" to Vulkan drivers over the allocated memory.
+      //
+      // Note this is needed because we can see unusual buffers like
+      // tensor<3xi8>. Depending on GPU capabilities, this might not always be
+      // directly supported by the hardware. Under such circumstances, we need
+      // to emulate i8 support with i32. Shader CodeGen takes care of that: the
+      // shader will read the buffer as tensor<i32> and perform bit shifts to
+      // extract each byte and conduct computations. The extra additional byte
+      // is read but not really used by the shader. Here in application we need
+      // to match the ABI and provide the buffer as 32-bit aligned, otherwise
+      // the whole read by the shader is considered as out of bounds per the
+      // Vulkan spec. See
+      // https://github.com/google/iree/issues/2022#issuecomment-640617234 for
+      // more details.
+      buffer_info.range = iree_device_align(
+          std::min(binding.length, iree_hal_buffer_byte_length(binding.buffer) -
+                                       binding.offset),
+          4);
+    }
 
     auto& write_info = write_infos[i];
     write_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
diff --git a/iree/hal/vulkan/direct_command_buffer.cc b/iree/hal/vulkan/direct_command_buffer.cc
index 96f8eed..734fac1 100644
--- a/iree/hal/vulkan/direct_command_buffer.cc
+++ b/iree/hal/vulkan/direct_command_buffer.cc
@@ -50,6 +50,15 @@
   // This must remain valid until all in-flight submissions of the command
   // buffer complete.
   DescriptorSetGroup descriptor_set_group;
+
+  BuiltinExecutables* builtin_executables;
+
+  // Shadow copy of push constants used during normal operation, for restoring
+  // after builtin_executables uses vkCmdPushConstants. Size must be greater
+  // than or equal to the push constant memory used by builtin_executables.
+  // TODO(scotttodd): use [maxPushConstantsSize - 16, maxPushConstantsSize]
+  //                  instead of [0, 16] to reduce frequency of updates
+  uint8_t push_constants_storage[IREE_HAL_VULKAN_BUILTIN_PUSH_CONSTANT_COUNT];
 } iree_hal_vulkan_direct_command_buffer_t;
 
 extern const iree_hal_command_buffer_vtable_t
@@ -71,6 +80,7 @@
     iree_hal_queue_affinity_t queue_affinity,
     iree_hal_vulkan_tracing_context_t* tracing_context,
     iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
+    iree::hal::vulkan::BuiltinExecutables* builtin_executables,
     iree_hal_command_buffer_t** out_command_buffer) {
   IREE_ASSERT_ARGUMENT(logical_device);
   IREE_ASSERT_ARGUMENT(command_pool);
@@ -109,6 +119,8 @@
         DescriptorSetArena(descriptor_pool_cache);
     new (&command_buffer->descriptor_set_group) DescriptorSetGroup();
 
+    command_buffer->builtin_executables = builtin_executables;
+
     *out_command_buffer = (iree_hal_command_buffer_t*)command_buffer;
   } else {
     command_pool->Free(handle);
@@ -512,14 +524,47 @@
   VkBuffer target_device_buffer = iree_hal_vulkan_vma_buffer_handle(
       iree_hal_buffer_allocated_buffer(target_buffer));
 
-  // Note that fill only accepts 4-byte aligned values so we need to splat out
-  // our variable-length pattern.
-  target_offset += iree_hal_buffer_byte_offset(target_buffer);
-  uint32_t dword_pattern =
-      iree_hal_vulkan_splat_pattern(pattern, pattern_length);
-  command_buffer->syms->vkCmdFillBuffer(command_buffer->handle,
-                                        target_device_buffer, target_offset,
-                                        length, dword_pattern);
+  // vkCmdFillBuffer requires a 4 byte alignment for the offset, pattern, and
+  // length. We use a polyfill here that fills the unaligned start and end of
+  // fill operations, if needed.
+
+  if (target_offset % 4 != 0 || length % 4 != 0) {
+    // TODO(scotttodd): only restore push constants that have been modified?
+    //                  (this can pass uninitialized memory right now, which
+    //                   *should* be safe but is wasteful)
+    IREE_RETURN_IF_ERROR(
+        command_buffer->builtin_executables->FillBufferUnaligned(
+            command_buffer->handle, &(command_buffer->descriptor_set_arena),
+            target_buffer, target_offset, length, pattern, pattern_length,
+            command_buffer->push_constants_storage));
+
+    // Continue using vkCmdFillBuffer below, but only for the inner aligned
+    // portion of the fill operation. That portion is empty when the fill sits
+    // within a single 4 byte word (such as offset 1, length 2). For example:
+    //   original offset 2, length 8
+    //   aligned  offset 4, length 4
+    // [0x00,0x00,0xAB,0xAB | 0xAB,0xAB,0xAB,0xAB | 0xAB,0xAB,0x00,0x00]
+    //            <-------> <---------------------> <------->
+    //            unaligned     vkCmdFillBuffer     unaligned
+    iree_device_size_t aligned_target_offset =
+        iree_device_align(target_offset, 4);
+    iree_device_size_t aligned_target_end = ((target_offset + length) / 4) * 4;
+    length = aligned_target_end > aligned_target_offset
+                 ? aligned_target_end - aligned_target_offset
+                 : 0;
+    target_offset = aligned_target_offset;
+  }
+
+  if (length > 0) {
+    // Note that vkCmdFillBuffer only accepts 4-byte aligned values so we need
+    // to splat out our variable-length pattern.
+    target_offset += iree_hal_buffer_byte_offset(target_buffer);
+    uint32_t dword_pattern =
+        iree_hal_vulkan_splat_pattern(pattern, pattern_length);
+    command_buffer->syms->vkCmdFillBuffer(command_buffer->handle,
+                                          target_device_buffer, target_offset,
+                                          length, dword_pattern);
+  }
 
   return iree_ok_status();
 }
@@ -584,6 +629,13 @@
   iree_hal_vulkan_direct_command_buffer_t* command_buffer =
       iree_hal_vulkan_direct_command_buffer_cast(base_command_buffer);
 
+  iree_host_size_t storage_size =
+      IREE_ARRAYSIZE(command_buffer->push_constants_storage);
+  if (offset < storage_size) {  // Shadow only the bytes that fit in storage.
+    memcpy(command_buffer->push_constants_storage + offset, values,
+           std::min(values_length, storage_size - offset));
+  }
+
   command_buffer->syms->vkCmdPushConstants(
       command_buffer->handle,
       iree_hal_vulkan_native_executable_layout_handle(executable_layout),
diff --git a/iree/hal/vulkan/direct_command_buffer.h b/iree/hal/vulkan/direct_command_buffer.h
index cc1d097..606e859 100644
--- a/iree/hal/vulkan/direct_command_buffer.h
+++ b/iree/hal/vulkan/direct_command_buffer.h
@@ -9,6 +9,7 @@
 
 #include "iree/base/api.h"
 #include "iree/hal/api.h"
+#include "iree/hal/vulkan/builtin_executables.h"
 #include "iree/hal/vulkan/descriptor_pool_cache.h"
 #include "iree/hal/vulkan/handle_util.h"
 #include "iree/hal/vulkan/tracing.h"
@@ -26,6 +27,7 @@
     iree_hal_queue_affinity_t queue_affinity,
     iree_hal_vulkan_tracing_context_t* tracing_context,
     iree::hal::vulkan::DescriptorPoolCache* descriptor_pool_cache,
+    iree::hal::vulkan::BuiltinExecutables* builtin_executables,
     iree_hal_command_buffer_t** out_command_buffer);
 
 // Returns the native Vulkan VkCommandBuffer handle.
diff --git a/iree/hal/vulkan/vulkan_device.cc b/iree/hal/vulkan/vulkan_device.cc
index 953abba..c5a7873 100644
--- a/iree/hal/vulkan/vulkan_device.cc
+++ b/iree/hal/vulkan/vulkan_device.cc
@@ -14,6 +14,7 @@
 #include "iree/base/internal/math.h"
 #include "iree/base/tracing.h"
 #include "iree/hal/vulkan/api.h"
+#include "iree/hal/vulkan/builtin_executables.h"
 #include "iree/hal/vulkan/command_queue.h"
 #include "iree/hal/vulkan/descriptor_pool_cache.h"
 #include "iree/hal/vulkan/direct_command_buffer.h"
@@ -363,6 +364,8 @@
   // Used only for emulated timeline semaphores.
   TimePointSemaphorePool* semaphore_pool;
   TimePointFencePool* fence_pool;
+
+  BuiltinExecutables* builtin_executables;
 } iree_hal_vulkan_device_t;
 
 extern const iree_hal_device_vtable_t iree_hal_vulkan_device_vtable;
@@ -622,6 +625,12 @@
   }
 
   if (iree_status_is_ok(status)) {
+    device->builtin_executables =
+        new BuiltinExecutables(device->logical_device);
+    status = device->builtin_executables->InitializeExecutables();
+  }
+
+  if (iree_status_is_ok(status)) {
     *out_device = (iree_hal_device_t*)device;
   } else {
     iree_hal_device_destroy((iree_hal_device_t*)device);
@@ -647,6 +656,7 @@
 
   // Now that no commands are outstanding we can release all resources that may
   // have been in use.
+  delete device->builtin_executables;
   delete device->descriptor_pool_cache;
   delete device->semaphore_pool;
   delete device->fence_pool;
@@ -930,6 +940,12 @@
     iree_hal_vulkan_device_t* device,
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity) {
+  // TODO(scotttodd): revisit queue selection logic and remove this
+  //   * the unaligned buffer fill polyfill and tracing timestamp queries may
+  //     both insert dispatches into command buffers that at compile time are
+  //     expected to only contain transfer commands
+  //   * we could set a bit at recording time if emulation or tracing is used
+  //     and submit to the right queue based on that
   command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
 
   // TODO(benvanik): meaningful heuristics for affinity. We don't generate
@@ -949,6 +965,12 @@
     iree_hal_command_buffer_t** out_command_buffer) {
   iree_hal_vulkan_device_t* device = iree_hal_vulkan_device_cast(base_device);
 
+  // TODO(scotttodd): revisit queue selection logic and remove this
+  //   * the unaligned buffer fill polyfill and tracing timestamp queries may
+  //     both insert dispatches into command buffers that at compile time are
+  //     expected to only contain transfer commands
+  //   * we could set a bit at recording time if emulation or tracing is used
+  //     and submit to the right queue based on that
   command_categories |= IREE_HAL_COMMAND_CATEGORY_DISPATCH;
 
   // Select the command pool to used based on the types of commands used.
@@ -974,7 +996,7 @@
   return iree_hal_vulkan_direct_command_buffer_allocate(
       device->logical_device, command_pool, mode, command_categories,
       queue_affinity, queue->tracing_context(), device->descriptor_pool_cache,
-      out_command_buffer);
+      device->builtin_executables, out_command_buffer);
 }
 
 static iree_status_t iree_hal_vulkan_device_create_descriptor_set(