Adding native (non-VMA) Vulkan allocator behind a flag. (#14389)

`--vulkan_vma_allocator=false` can be used to disable VMA. It is strongly
recommended to pair this with an allocator shim such as
`--device_allocator=caching`, as native Vulkan API allocations are slow and
limited in number.
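For embedders that configure the device programmatically instead of via flags, the sketch below shows the equivalent using the options API touched by this patch; it assumes `iree_hal_vulkan_device_options_initialize` is declared alongside the options struct in `iree/hal/drivers/vulkan/api.h`.

```cpp
#include "iree/hal/drivers/vulkan/api.h"

// Equivalent of --vulkan_vma_allocator=false: clear the (default-on) VMA bit
// so device creation selects the new native Vulkan allocator. Pairing this
// with a caching device allocator shim is recommended because raw
// vkAllocateMemory calls are slow and limited in number.
void configure_native_allocator(
    iree_hal_vulkan_device_options_t* out_options) {
  iree_hal_vulkan_device_options_initialize(out_options);
  out_options->flags &= ~IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR;
}
```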
diff --git a/runtime/src/iree/hal/allocator.h b/runtime/src/iree/hal/allocator.h
index 2757d8d..2d13fa0 100644
--- a/runtime/src/iree/hal/allocator.h
+++ b/runtime/src/iree/hal/allocator.h
@@ -621,8 +621,8 @@
 }
 
 #else
-#define iree_hal_allocator_statistics_record_alloc(...)
-#define iree_hal_allocator_statistics_record_free(...)
+#define iree_hal_allocator_statistics_record_alloc(statistics, ...)
+#define iree_hal_allocator_statistics_record_free(statistics, ...)
 #endif  // IREE_STATISTICS_ENABLE
 
 #ifdef __cplusplus
diff --git a/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel b/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
index b65c24c..96e51b7 100644
--- a/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/vulkan/BUILD.bazel
@@ -36,6 +36,8 @@
         "extensibility_util.cc",
         "extensibility_util.h",
         "handle_util.h",
+        "native_allocator.cc",
+        "native_allocator.h",
         "native_buffer.cc",
         "native_buffer.h",
         "native_event.cc",
diff --git a/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt b/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
index 93cb257..f9893db 100644
--- a/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/vulkan/CMakeLists.txt
@@ -37,6 +37,8 @@
     "extensibility_util.cc"
     "extensibility_util.h"
     "handle_util.h"
+    "native_allocator.cc"
+    "native_allocator.h"
     "native_buffer.cc"
     "native_buffer.h"
     "native_event.cc"
diff --git a/runtime/src/iree/hal/drivers/vulkan/api.h b/runtime/src/iree/hal/drivers/vulkan/api.h
index e243899..3c8a919 100644
--- a/runtime/src/iree/hal/drivers/vulkan/api.h
+++ b/runtime/src/iree/hal/drivers/vulkan/api.h
@@ -194,6 +194,11 @@
   // IREE execution to run asynchronously with the graphics workloads.
   // See: https://gpuopen.com/learn/concurrent-execution-asynchronous-queues/
   IREE_HAL_VULKAN_DEVICE_FLAG_DEDICATED_COMPUTE_QUEUE = 1u << 0,
+
+  // Whether to use the VMA allocator instead of native Vulkan API memory
+  // allocations.
+  // NOTE: this is temporary and VMA is slated for removal in the future.
+  IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR = 1u << 1,
 };
 typedef uint32_t iree_hal_vulkan_device_flags_t;
 
@@ -205,7 +210,7 @@
   // size of a large heap block allocation. This effectively specifies the
   // minimum amount of memory required and will always allocate at least this
   // much.
-  // NOTE: this is temporary and likely to get removed in the future.
+  // NOTE: this is temporary and VMA is slated for removal in the future.
   iree_device_size_t large_heap_block_size;
 } iree_hal_vulkan_device_options_t;
 
@@ -307,10 +312,14 @@
 //===----------------------------------------------------------------------===//
 
 // EXPERIMENTAL: until VMA is removed this is doing a shady reinterpret cast.
+//
 // TODO(benvanik): make this safer (dyn_cast-like, lookup allocated buffer).
 // Returns the backing device memory and logical buffer handle of a HAL buffer
 // managed by the Vulkan HAL. Invalid to call on any buffer but a base allocated
 // Vulkan HAL buffer.
+//
+// NOTE: |out_memory| will be VK_NULL_HANDLE in cases where sparse residency is
+// used.
 IREE_API_EXPORT iree_status_t iree_hal_vulkan_allocated_buffer_handle(
     iree_hal_buffer_t* allocated_buffer, VkDeviceMemory* out_memory,
     VkBuffer* out_handle);
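Given the new sparse-residency caveat, interop code that exports handles should treat a `VK_NULL_HANDLE` memory result as "no single backing allocation". A hedged sketch (the buffer is assumed to be a base allocated Vulkan HAL buffer, as the API above requires):

```cpp
#include "iree/base/api.h"
#include "iree/hal/drivers/vulkan/api.h"

// Fetches the raw Vulkan handles for a base allocated Vulkan HAL buffer and
// rejects sparse-resident buffers that have no single VkDeviceMemory.
static iree_status_t export_buffer_handles(iree_hal_buffer_t* allocated_buffer,
                                           VkDeviceMemory* out_memory,
                                           VkBuffer* out_handle) {
  IREE_RETURN_IF_ERROR(iree_hal_vulkan_allocated_buffer_handle(
      allocated_buffer, out_memory, out_handle));
  if (*out_memory == VK_NULL_HANDLE) {
    return iree_make_status(
        IREE_STATUS_UNAVAILABLE,
        "buffer is sparse-resident and has no single backing device memory");
  }
  return iree_ok_status();
}
```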
diff --git a/runtime/src/iree/hal/drivers/vulkan/base_buffer.c b/runtime/src/iree/hal/drivers/vulkan/base_buffer.c
index 4270619..cf8eb12 100644
--- a/runtime/src/iree/hal/drivers/vulkan/base_buffer.c
+++ b/runtime/src/iree/hal/drivers/vulkan/base_buffer.c
@@ -36,6 +36,74 @@
          !iree_all_bits_set(flags, VK_MEMORY_PROPERTY_PROTECTED_BIT);
 }
 
+iree_status_t iree_hal_vulkan_find_memory_type(
+    const VkPhysicalDeviceProperties* device_props,
+    const VkPhysicalDeviceMemoryProperties* memory_props,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    uint32_t* out_memory_type_index) {
+  *out_memory_type_index = 0;
+
+  VkMemoryPropertyFlags require_flags = 0;
+  VkMemoryPropertyFlags prefer_flags = 0;
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) {
+    if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) {
+      // Device-local, host-visible.
+      require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+      prefer_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+    } else {
+      // Device-local only.
+      require_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
+    }
+  } else {
+    if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+      // Host-local, device-visible.
+      require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+    } else {
+      // Host-local only.
+      require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+    }
+  }
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_CACHED)) {
+    require_flags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+  }
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) {
+    require_flags |= VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+  }
+  if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_MAPPING)) {
+    require_flags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+  }
+
+  int most_bits_count = 0;
+  int most_bits_idx = -1;
+  for (uint32_t i = 0; i < memory_props->memoryTypeCount; ++i) {
+    VkMemoryPropertyFlags flags = memory_props->memoryTypes[i].propertyFlags;
+    if (!iree_all_bits_set(flags, require_flags) ||
+        !iree_hal_vulkan_is_memory_type_usable(flags)) {
+      // Excluded (required bits missing or memory type is not usable).
+      continue;
+    }
+    // When all required bits are satisfied try to find the memory type that
+    // has the most preferred bits set.
+    int bit_count = iree_math_count_ones_u32(flags & prefer_flags);
+    if (most_bits_idx == -1) {
+      most_bits_count = bit_count;
+      most_bits_idx = (int)i;
+    } else if (bit_count > most_bits_count) {
+      most_bits_count = bit_count;
+      most_bits_idx = (int)i;
+    }
+  }
+  if (most_bits_idx == -1) {
+    // No valid memory type found.
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "no memory type available that satisfies the required flags");
+  }
+
+  *out_memory_type_index = (uint32_t)most_bits_idx;
+  return iree_ok_status();
+}
+
 static void iree_hal_vulkan_populate_dispatch_memory_types(
     const VkPhysicalDeviceProperties* device_props,
     const VkPhysicalDeviceMemoryProperties* memory_props,
diff --git a/runtime/src/iree/hal/drivers/vulkan/base_buffer.h b/runtime/src/iree/hal/drivers/vulkan/base_buffer.h
index 10fe137..c9a0d34 100644
--- a/runtime/src/iree/hal/drivers/vulkan/base_buffer.h
+++ b/runtime/src/iree/hal/drivers/vulkan/base_buffer.h
@@ -64,6 +64,15 @@
   int indices[5];
 } iree_hal_vulkan_memory_types_t;
 
+// Finds the memory type that satisfies the required and preferred buffer
+// |params| and returns it in |out_memory_type_index|. Fails if no memory type
+// satisfies the requirements.
+iree_status_t iree_hal_vulkan_find_memory_type(
+    const VkPhysicalDeviceProperties* device_props,
+    const VkPhysicalDeviceMemoryProperties* memory_props,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    uint32_t* out_memory_type_index);
+
 // Queries the underlying Vulkan implementation to decide which memory type
 // should be used for particular operations.
 iree_status_t iree_hal_vulkan_populate_memory_types(
@@ -88,6 +97,8 @@
 // to get access to the API VkBuffer handle.
 typedef struct iree_hal_vulkan_base_buffer_t {
   iree_hal_buffer_t base;
+  // NOTE: may be VK_NULL_HANDLE if sparse residency is used to back the buffer
+  // with multiple device memory allocations.
   VkDeviceMemory device_memory;
   VkBuffer handle;
 } iree_hal_vulkan_base_buffer_t;
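To make the selection policy above concrete, here is a hedged, test-style sketch using only the internal declarations from this header: both memory types satisfy the required host-visible bit, and the one that also carries the preferred device-local bit wins.

```cpp
#include <assert.h>

#include "iree/hal/drivers/vulkan/base_buffer.h"

static void find_memory_type_example(void) {
  VkPhysicalDeviceProperties device_props = {};
  VkPhysicalDeviceMemoryProperties memory_props = {};
  memory_props.memoryTypeCount = 2;
  // Type 0: host-visible staging-style memory.
  memory_props.memoryTypes[0].propertyFlags =
      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
      VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
  // Type 1: additionally device-local (e.g. resizable BAR-style memory).
  memory_props.memoryTypes[1].propertyFlags =
      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
      VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;

  // Device-local + host-visible params: host visibility is required while
  // device locality is only preferred, so type 1 is selected over type 0.
  iree_hal_buffer_params_t params = {};
  params.type =
      IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
  params.usage = IREE_HAL_BUFFER_USAGE_MAPPING;
  uint32_t memory_type_index = 0;
  iree_status_t status = iree_hal_vulkan_find_memory_type(
      &device_props, &memory_props, &params, &memory_type_index);
  assert(iree_status_is_ok(status) && memory_type_index == 1);
  iree_status_ignore(status);
  (void)memory_type_index;
}
```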
diff --git a/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc
new file mode 100644
index 0000000..70e2ac4
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc
@@ -0,0 +1,368 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <cstddef>
+#include <cstring>
+
+#include "iree/base/api.h"
+#include "iree/hal/drivers/vulkan/base_buffer.h"
+#include "iree/hal/drivers/vulkan/dynamic_symbols.h"
+#include "iree/hal/drivers/vulkan/native_buffer.h"
+#include "iree/hal/drivers/vulkan/status_util.h"
+
+using namespace iree::hal::vulkan;
+
+#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+static const char* IREE_HAL_VULKAN_NATIVE_ALLOCATOR_ID = "Vulkan/Native";
+#endif  // IREE_TRACING_FEATURE_ALLOCATION_TRACKING
+
+typedef struct iree_hal_vulkan_native_allocator_t {
+  iree_hal_resource_t resource;
+  VkDeviceHandle* logical_device;
+  iree_hal_device_t* device;  // unretained to avoid cycles
+  iree_allocator_t host_allocator;
+
+  // Cached from the API to avoid additional queries in hot paths.
+  VkPhysicalDeviceProperties device_props;
+  VkPhysicalDeviceMemoryProperties memory_props;
+
+  // Used to quickly look up the memory type index used for a particular usage.
+  iree_hal_vulkan_memory_types_t memory_types;
+
+  IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;)
+} iree_hal_vulkan_native_allocator_t;
+
+namespace {
+extern const iree_hal_allocator_vtable_t
+    iree_hal_vulkan_native_allocator_vtable;
+}  // namespace
+
+static iree_hal_vulkan_native_allocator_t*
+iree_hal_vulkan_native_allocator_cast(iree_hal_allocator_t* base_value) {
+  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_vulkan_native_allocator_vtable);
+  return (iree_hal_vulkan_native_allocator_t*)base_value;
+}
+
+static void iree_hal_vulkan_native_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator);
+
+extern "C" iree_status_t iree_hal_vulkan_native_allocator_create(
+    const iree_hal_vulkan_device_options_t* options, VkInstance instance,
+    VkPhysicalDevice physical_device, VkDeviceHandle* logical_device,
+    iree_hal_device_t* device, iree_hal_allocator_t** out_allocator) {
+  IREE_ASSERT_ARGUMENT(instance);
+  IREE_ASSERT_ARGUMENT(physical_device);
+  IREE_ASSERT_ARGUMENT(logical_device);
+  IREE_ASSERT_ARGUMENT(device);
+  IREE_ASSERT_ARGUMENT(out_allocator);
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_t host_allocator = logical_device->host_allocator();
+  iree_hal_vulkan_native_allocator_t* allocator = NULL;
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0, iree_allocator_malloc(host_allocator, sizeof(*allocator),
+                                (void**)&allocator));
+  iree_hal_resource_initialize(&iree_hal_vulkan_native_allocator_vtable,
+                               &allocator->resource);
+  allocator->logical_device = logical_device;
+  allocator->device = device;
+  allocator->host_allocator = host_allocator;
+
+  const auto& syms = logical_device->syms();
+  syms->vkGetPhysicalDeviceProperties(physical_device,
+                                      &allocator->device_props);
+  syms->vkGetPhysicalDeviceMemoryProperties(physical_device,
+                                            &allocator->memory_props);
+  iree_status_t status = iree_hal_vulkan_populate_memory_types(
+      &allocator->device_props, &allocator->memory_props,
+      &allocator->memory_types);
+
+  if (iree_status_is_ok(status)) {
+    *out_allocator = (iree_hal_allocator_t*)allocator;
+  } else {
+    iree_hal_vulkan_native_allocator_destroy((iree_hal_allocator_t*)allocator);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void iree_hal_vulkan_native_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_vulkan_native_allocator_t* allocator =
+      iree_hal_vulkan_native_allocator_cast(base_allocator);
+  iree_allocator_t host_allocator = allocator->host_allocator;
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(host_allocator, allocator);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+static iree_allocator_t iree_hal_vulkan_native_allocator_host_allocator(
+    const iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  iree_hal_vulkan_native_allocator_t* allocator =
+      (iree_hal_vulkan_native_allocator_t*)base_allocator;
+  return allocator->host_allocator;
+}
+
+static iree_status_t iree_hal_vulkan_native_allocator_trim(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator) {
+  return iree_ok_status();
+}
+
+static void iree_hal_vulkan_native_allocator_query_statistics(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) {
+  IREE_STATISTICS({
+    iree_hal_vulkan_native_allocator_t* allocator =
+        iree_hal_vulkan_native_allocator_cast(base_allocator);
+    memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics));
+  });
+}
+
+static iree_status_t iree_hal_vulkan_native_allocator_query_memory_heaps(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_host_size_t capacity,
+    iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps,
+    iree_host_size_t* IREE_RESTRICT out_count) {
+  iree_hal_vulkan_native_allocator_t* allocator =
+      iree_hal_vulkan_native_allocator_cast(base_allocator);
+  return iree_hal_vulkan_query_memory_heaps(
+      &allocator->device_props, &allocator->memory_props,
+      &allocator->memory_types, capacity, heaps, out_count);
+}
+
+static iree_hal_buffer_compatibility_t
+iree_hal_vulkan_native_allocator_query_buffer_compatibility(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t* IREE_RESTRICT allocation_size) {
+  // TODO(benvanik): check to ensure the allocator can serve the memory type.
+
+  // All buffers can be allocated on the heap.
+  iree_hal_buffer_compatibility_t compatibility =
+      IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE;
+
+  if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+    compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER;
+  }
+
+  // Buffers can only be used on the queue if they are device visible.
+  if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) {
+    if (iree_any_bit_set(params->usage,
+                         IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
+      compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH;
+    }
+  }
+
+  // We are now optimal.
+  params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL;
+
+  // Guard against the corner case where the requested buffer size is 0. A
+  // 0-byte buffer is rarely useful to the application, but it does occur in
+  // real-world use cases, so at minimum we should not crash.
+  if (*allocation_size == 0) *allocation_size = 4;
+
+  // Align allocation sizes to 4 bytes so shaders operating on 32-bit types can
+  // act safely even on buffer ranges that are not naturally aligned.
+  *allocation_size = iree_host_align(*allocation_size, 4);
+
+  return compatibility;
+}
+
+static void iree_hal_vulkan_native_allocator_native_buffer_release(
+    void* user_data, iree::hal::vulkan::VkDeviceHandle* logical_device,
+    VkDeviceMemory device_memory, VkBuffer handle) {
+  IREE_TRACE_FREE_NAMED(IREE_HAL_VULKAN_NATIVE_ALLOCATOR_ID, (void*)handle);
+  logical_device->syms()->vkDestroyBuffer(*logical_device, handle,
+                                          logical_device->allocator());
+  logical_device->syms()->vkFreeMemory(*logical_device, device_memory,
+                                       logical_device->allocator());
+}
+
+static iree_status_t iree_hal_vulkan_native_allocator_allocate_internal(
+    iree_hal_vulkan_native_allocator_t* IREE_RESTRICT allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  VkDeviceHandle* logical_device = allocator->logical_device;
+
+  // TODO(benvanik): if on a unified memory system and initial data is present
+  // we could set the mapping bit and ensure a much more efficient upload.
+
+  // Allocate the device memory we'll attach the buffer to.
+  VkMemoryAllocateInfo allocate_info = {};
+  allocate_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+  allocate_info.pNext = NULL;
+  allocate_info.memoryTypeIndex = 0;
+  allocate_info.allocationSize = allocation_size;
+  IREE_RETURN_IF_ERROR(iree_hal_vulkan_find_memory_type(
+      &allocator->device_props, &allocator->memory_props, params,
+      &allocate_info.memoryTypeIndex));
+  VkDeviceMemory device_memory = VK_NULL_HANDLE;
+  VK_RETURN_IF_ERROR(logical_device->syms()->vkAllocateMemory(
+                         *logical_device, &allocate_info,
+                         logical_device->allocator(), &device_memory),
+                     "vkAllocateMemory");
+
+  // Create an initially unbound buffer handle.
+  VkBufferCreateInfo buffer_create_info = {};
+  buffer_create_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+  buffer_create_info.pNext = NULL;
+  buffer_create_info.flags = 0;
+  buffer_create_info.size = allocation_size;
+  buffer_create_info.usage = 0;
+  if (iree_all_bits_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) {
+    buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+    buffer_create_info.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+  }
+  if (iree_all_bits_set(params->usage,
+                        IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) {
+    buffer_create_info.usage |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+    buffer_create_info.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
+    buffer_create_info.usage |= VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT;
+  }
+  buffer_create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+  buffer_create_info.queueFamilyIndexCount = 0;
+  buffer_create_info.pQueueFamilyIndices = NULL;
+  VkBuffer handle = VK_NULL_HANDLE;
+  iree_status_t status =
+      VK_RESULT_TO_STATUS(logical_device->syms()->vkCreateBuffer(
+                              *logical_device, &buffer_create_info,
+                              logical_device->allocator(), &handle),
+                          "vkCreateBuffer");
+
+  iree_hal_vulkan_native_buffer_release_callback_t release_callback = {0};
+  release_callback.fn = iree_hal_vulkan_native_allocator_native_buffer_release;
+  release_callback.user_data = NULL;
+  iree_hal_buffer_t* buffer = NULL;
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_vulkan_native_buffer_wrap(
+        (iree_hal_allocator_t*)allocator, params->type, params->access,
+        params->usage, allocation_size,
+        /*byte_offset=*/0,
+        /*byte_length=*/allocation_size, logical_device, device_memory, handle,
+        release_callback, &buffer);
+  }
+  if (!iree_status_is_ok(status)) {
+    // Creation or wrapping failed: clean up the raw handle and allocation here.
+    // After a successful wrap, releasing the HAL buffer performs this cleanup.
+    if (handle) {
+      logical_device->syms()->vkDestroyBuffer(*logical_device, handle,
+                                              logical_device->allocator());
+    }
+    if (device_memory) {
+      logical_device->syms()->vkFreeMemory(*logical_device, device_memory,
+                                           logical_device->allocator());
+    }
+    return status;
+  }
+
+  IREE_TRACE_ALLOC_NAMED(IREE_HAL_VULKAN_NATIVE_ALLOCATOR_ID, (void*)handle,
+                         allocation_size);
+
+  // Bind the memory to the buffer.
+  if (iree_status_is_ok(status)) {
+    status = VK_RESULT_TO_STATUS(
+        logical_device->syms()->vkBindBufferMemory(
+            *logical_device, handle, device_memory, /*memoryOffset=*/0),
+        "vkBindBufferMemory");
+  }
+
+  // Copy the initial contents into the buffer. This may require staging.
+  if (iree_status_is_ok(status) &&
+      !iree_const_byte_span_is_empty(initial_data)) {
+    status = iree_hal_device_transfer_range(
+        allocator->device,
+        iree_hal_make_host_transfer_buffer_span((void*)initial_data.data,
+                                                initial_data.data_length),
+        0, iree_hal_make_device_transfer_buffer(buffer), 0,
+        initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT,
+        iree_infinite_timeout());
+  }
+
+  if (iree_status_is_ok(status)) {
+    iree_hal_allocator_statistics_record_alloc(
+        &allocator->statistics, params->type, buffer->allocation_size);
+    *out_buffer = buffer;
+  } else {
+    iree_hal_buffer_release(buffer);
+  }
+  return status;
+}
+
+static iree_status_t iree_hal_vulkan_native_allocator_allocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_device_size_t allocation_size, iree_const_byte_span_t initial_data,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  iree_hal_vulkan_native_allocator_t* allocator =
+      iree_hal_vulkan_native_allocator_cast(base_allocator);
+
+  // Coerce options into those required by the current device.
+  iree_hal_buffer_params_t compat_params = *params;
+  if (!iree_all_bits_set(
+          iree_hal_vulkan_native_allocator_query_buffer_compatibility(
+              base_allocator, &compat_params, &allocation_size),
+          IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) {
+    return iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "allocator cannot allocate a buffer with the given parameters");
+  }
+
+  return iree_hal_vulkan_native_allocator_allocate_internal(
+      allocator, &compat_params, allocation_size, initial_data, out_buffer);
+}
+
+static void iree_hal_vulkan_native_allocator_deallocate_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT base_buffer) {
+  iree_hal_vulkan_native_allocator_t* allocator =
+      iree_hal_vulkan_native_allocator_cast(base_buffer->device_allocator);
+  (void)allocator;
+  iree_hal_allocator_statistics_record_free(&allocator->statistics,
+                                            base_buffer->memory_type,
+                                            base_buffer->allocation_size);
+  iree_hal_buffer_destroy(base_buffer);
+}
+
+static iree_status_t iree_hal_vulkan_native_allocator_import_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    const iree_hal_buffer_params_t* IREE_RESTRICT params,
+    iree_hal_external_buffer_t* IREE_RESTRICT external_buffer,
+    iree_hal_buffer_release_callback_t release_callback,
+    iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
+  // TODO(#7242): use VK_EXT_external_memory_host to import memory.
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "importing from external buffers not supported");
+}
+
+static iree_status_t iree_hal_vulkan_native_allocator_export_buffer(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator,
+    iree_hal_buffer_t* IREE_RESTRICT buffer,
+    iree_hal_external_buffer_type_t requested_type,
+    iree_hal_external_buffer_flags_t requested_flags,
+    iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) {
+  return iree_make_status(IREE_STATUS_UNAVAILABLE,
+                          "exporting to external buffers not supported");
+}
+
+namespace {
+const iree_hal_allocator_vtable_t iree_hal_vulkan_native_allocator_vtable = {
+    /*.destroy=*/iree_hal_vulkan_native_allocator_destroy,
+    /*.host_allocator=*/iree_hal_vulkan_native_allocator_host_allocator,
+    /*.trim=*/iree_hal_vulkan_native_allocator_trim,
+    /*.query_statistics=*/iree_hal_vulkan_native_allocator_query_statistics,
+    /*.query_memory_heaps=*/iree_hal_vulkan_native_allocator_query_memory_heaps,
+    /*.query_buffer_compatibility=*/
+    iree_hal_vulkan_native_allocator_query_buffer_compatibility,
+    /*.allocate_buffer=*/iree_hal_vulkan_native_allocator_allocate_buffer,
+    /*.deallocate_buffer=*/iree_hal_vulkan_native_allocator_deallocate_buffer,
+    /*.import_buffer=*/iree_hal_vulkan_native_allocator_import_buffer,
+    /*.export_buffer=*/iree_hal_vulkan_native_allocator_export_buffer,
+};
+}  // namespace
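The allocator transfers ownership of the raw `VkBuffer`/`VkDeviceMemory` to the wrapped buffer through the release callback above. As a hedged sketch (the callback typedef and `iree_hal_vulkan_native_buffer_wrap` live in `native_buffer.h`, which is not part of this diff, so the exact signature is assumed to mirror the release function above), a no-op callback could be used when wrapping handles that some other subsystem owns and destroys:

```cpp
#include "iree/hal/drivers/vulkan/native_buffer.h"

// A release callback that intentionally does nothing: the VkBuffer and
// VkDeviceMemory are owned elsewhere and the HAL buffer is a non-owning view.
static void noop_native_buffer_release(
    void* user_data, iree::hal::vulkan::VkDeviceHandle* logical_device,
    VkDeviceMemory device_memory, VkBuffer handle) {
  (void)user_data;
  (void)logical_device;
  (void)device_memory;
  (void)handle;
}

// Populated the same way the allocator does above before calling
// iree_hal_vulkan_native_buffer_wrap.
static iree_hal_vulkan_native_buffer_release_callback_t
make_noop_release_callback(void) {
  iree_hal_vulkan_native_buffer_release_callback_t release_callback = {0};
  release_callback.fn = noop_native_buffer_release;
  release_callback.user_data = NULL;
  return release_callback;
}
```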
diff --git a/runtime/src/iree/hal/drivers/vulkan/native_allocator.h b/runtime/src/iree/hal/drivers/vulkan/native_allocator.h
new file mode 100644
index 0000000..9d6e524
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/vulkan/native_allocator.h
@@ -0,0 +1,30 @@
+// Copyright 2023 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_VULKAN_NATIVE_ALLOCATOR_H_
+#define IREE_HAL_DRIVERS_VULKAN_NATIVE_ALLOCATOR_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+#include "iree/hal/drivers/vulkan/handle_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Creates a native Vulkan API-based allocator that directly allocates memory
+// from the underlying implementation with no pooling or suballocation.
+iree_status_t iree_hal_vulkan_native_allocator_create(
+    const iree_hal_vulkan_device_options_t* options, VkInstance instance,
+    VkPhysicalDevice physical_device,
+    iree::hal::vulkan::VkDeviceHandle* logical_device,
+    iree_hal_device_t* device, iree_hal_allocator_t** out_allocator);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_VULKAN_NATIVE_ALLOCATOR_H_
diff --git a/runtime/src/iree/hal/drivers/vulkan/registration/driver_module.cc b/runtime/src/iree/hal/drivers/vulkan/registration/driver_module.cc
index 514d302..a7182c7 100644
--- a/runtime/src/iree/hal/drivers/vulkan/registration/driver_module.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/registration/driver_module.cc
@@ -36,6 +36,9 @@
 IREE_FLAG(
     bool, vulkan_dedicated_compute_queue, false,
     "Use a dedicated queue with VK_QUEUE_COMPUTE_BIT for dispatch workloads.");
+IREE_FLAG(bool, vulkan_vma_allocator, true,
+          "Whether to use the VMA allocator instead of native Vulkan API "
+          "memory allocations.");
 IREE_FLAG(
     int64_t, vulkan_large_heap_block_size, 0,
     "Preferred allocator block size for large allocations in bytes. Sets the\n"
@@ -82,6 +85,13 @@
     driver_options.device_options.flags |=
         IREE_HAL_VULKAN_DEVICE_FLAG_DEDICATED_COMPUTE_QUEUE;
   }
+  if (FLAG_vulkan_vma_allocator) {
+    driver_options.device_options.flags |=
+        IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR;
+  } else {
+    driver_options.device_options.flags &=
+        ~IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR;
+  }
   if (FLAG_vulkan_large_heap_block_size) {
     driver_options.device_options.large_heap_block_size =
         FLAG_vulkan_large_heap_block_size;
diff --git a/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc b/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc
index ad41e98..3a62218 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vma_allocator.cc
@@ -257,6 +257,9 @@
 
 #endif  // IREE_STATISTICS_ENABLE
 
+static void iree_hal_vulkan_vma_allocator_destroy(
+    iree_hal_allocator_t* IREE_RESTRICT base_allocator);
+
 iree_status_t iree_hal_vulkan_vma_allocator_create(
     const iree_hal_vulkan_device_options_t* options, VkInstance instance,
     VkPhysicalDevice physical_device, VkDeviceHandle* logical_device,
@@ -341,7 +344,7 @@
   if (iree_status_is_ok(status)) {
     *out_allocator = (iree_hal_allocator_t*)allocator;
   } else {
-    vmaDestroyAllocator(vma);
+    iree_hal_vulkan_vma_allocator_destroy((iree_hal_allocator_t*)allocator);
   }
 
   IREE_TRACE_ZONE_END(z0);
diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
index 0047243..10a6d6d 100644
--- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
+++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc
@@ -22,6 +22,7 @@
 #include "iree/hal/drivers/vulkan/dynamic_symbols.h"
 #include "iree/hal/drivers/vulkan/extensibility_util.h"
 #include "iree/hal/drivers/vulkan/handle_util.h"
+#include "iree/hal/drivers/vulkan/native_allocator.h"
 #include "iree/hal/drivers/vulkan/native_event.h"
 #include "iree/hal/drivers/vulkan/native_pipeline_layout.h"
 #include "iree/hal/drivers/vulkan/native_semaphore.h"
@@ -513,7 +514,7 @@
 IREE_API_EXPORT void iree_hal_vulkan_device_options_initialize(
     iree_hal_vulkan_device_options_t* out_options) {
   memset(out_options, 0, sizeof(*out_options));
-  out_options->flags = 0;
+  out_options->flags = IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR;
   out_options->large_heap_block_size = 64 * 1024 * 1024;
 }
 
@@ -711,9 +712,17 @@
 
   // Create the device memory allocator that will service all buffer
   // allocation requests.
-  iree_status_t status = iree_hal_vulkan_vma_allocator_create(
-      options, instance, physical_device, logical_device,
-      (iree_hal_device_t*)device, &device->device_allocator);
+  iree_status_t status = iree_ok_status();
+  if (iree_all_bits_set(options->flags,
+                        IREE_HAL_VULKAN_DEVICE_FLAG_VMA_ALLOCATOR)) {
+    status = iree_hal_vulkan_vma_allocator_create(
+        options, instance, physical_device, logical_device,
+        (iree_hal_device_t*)device, &device->device_allocator);
+  } else {
+    status = iree_hal_vulkan_native_allocator_create(
+        options, instance, physical_device, logical_device,
+        (iree_hal_device_t*)device, &device->device_allocator);
+  }
 
   // Create command pools for each queue family. If we don't have a transfer
   // queue then we'll ignore that one and just use the dispatch pool.