| // Copyright 2023 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "experimental/cuda2/cuda_allocator.h" |
| |
| #include <stddef.h> |
| |
| #include "experimental/cuda2/cuda_buffer.h" |
| #include "experimental/cuda2/cuda_dynamic_symbols.h" |
| #include "experimental/cuda2/cuda_status_util.h" |
| #include "iree/base/api.h" |
| |
| #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_ALLOCATION_TRACKING |
| static const char* IREE_HAL_CUDA_ALLOCATOR_ID = "CUDA2 unpooled"; |
| #endif // IREE_TRACING_FEATURE_ALLOCATION_TRACKING |
| |
| typedef struct iree_hal_cuda2_allocator_t { |
| // Abstract resource used for injecting reference counting and vtable; |
| // must be at offset 0. |
| iree_hal_resource_t resource; |
| |
| // The device that this allocator allocates memory from. |
| iree_hal_device_t* base_device; |
| CUdevice device; |
| |
| // The CUDA stream that allocations should be used in. |
| CUstream stream; |
| |
| // NOTE: optional depending on device support. |
| iree_hal_cuda2_memory_pools_t* pools; |
| |
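| // Dynamically loaded CUDA driver API symbols used for all driver calls made
| // by this allocator.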
| const iree_hal_cuda2_dynamic_symbols_t* symbols; |
| |
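| // The host allocator used for allocating this structure.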
| iree_allocator_t host_allocator; |
| |
| // Whether the GPU and CPU can concurrently access CUDA managed data in a
| // coherent way. If not, explicit flushing and invalidation between the GPU
| // and CPU would be required.
| bool supports_concurrent_managed_access; |
| |
| IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) |
| } iree_hal_cuda2_allocator_t; |
| |
| static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable; |
| |
| static iree_hal_cuda2_allocator_t* iree_hal_cuda2_allocator_cast( |
| iree_hal_allocator_t* base_value) { |
| IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_allocator_vtable); |
| return (iree_hal_cuda2_allocator_t*)base_value; |
| } |
| |
| iree_status_t iree_hal_cuda2_allocator_create( |
| iree_hal_device_t* base_device, |
| const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, CUdevice device, |
| CUstream stream, iree_hal_cuda2_memory_pools_t* pools, |
| iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { |
| IREE_ASSERT_ARGUMENT(base_device); |
| IREE_ASSERT_ARGUMENT(cuda_symbols); |
| IREE_ASSERT_ARGUMENT(pools); |
| IREE_ASSERT_ARGUMENT(out_allocator); |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| // To support device-local + host-visible memory we need concurrent managed |
| // access indicating that the host and devices can concurrently access the |
| // device memory. If we don't have this feature then we fall back to forcing |
| // all device-local + host-visible memory into host-local + device-visible |
| // page-locked memory. The compiler avoids requesting this memory type for
| // high-traffic buffers and generally only uses it for readback staging
| // buffers.
| int supports_concurrent_managed_access = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, IREE_CURESULT_TO_STATUS( |
| cuda_symbols, |
| cuDeviceGetAttribute( |
| &supports_concurrent_managed_access, |
| CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, device), |
| "cuDeviceGetAttribute")); |
| |
| IREE_TRACE_ZONE_APPEND_TEXT( |
| z0, supports_concurrent_managed_access |
| ? "has CONCURRENT_MANAGED_ACCESS" |
| : "no CONCURRENT_MANAGED_ACCESS (expect slow accesses on " |
| "device-local + host-visible memory)"); |
| |
| iree_hal_cuda2_allocator_t* allocator = NULL; |
| iree_status_t status = iree_allocator_malloc( |
| host_allocator, sizeof(*allocator), (void**)&allocator); |
| if (iree_status_is_ok(status)) { |
| iree_hal_resource_initialize(&iree_hal_cuda2_allocator_vtable, |
| &allocator->resource); |
| allocator->base_device = base_device; |
| allocator->device = device; |
| allocator->stream = stream; |
| allocator->pools = pools; |
| allocator->symbols = cuda_symbols; |
| allocator->host_allocator = host_allocator; |
| allocator->supports_concurrent_managed_access = |
| supports_concurrent_managed_access != 0; |
| *out_allocator = (iree_hal_allocator_t*)allocator; |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
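|
| // Example (sketch): a CUDA2 device implementation would be expected to create
| // its device allocator roughly as follows, after resolving its symbols,
| // device, stream, and memory pools (variable names here are illustrative):
| //   iree_hal_allocator_t* device_allocator = NULL;
| //   IREE_RETURN_IF_ERROR(iree_hal_cuda2_allocator_create(
| //       base_device, cuda_symbols, cu_device, cu_stream, &memory_pools,
| //       host_allocator, &device_allocator));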
| |
| static void iree_hal_cuda2_allocator_destroy( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator) { |
| IREE_ASSERT_ARGUMENT(base_allocator); |
| iree_hal_cuda2_allocator_t* allocator = |
| iree_hal_cuda2_allocator_cast(base_allocator); |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| iree_allocator_free(allocator->host_allocator, allocator); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| static iree_allocator_t iree_hal_cuda2_allocator_host_allocator( |
| const iree_hal_allocator_t* IREE_RESTRICT base_allocator) { |
| iree_hal_cuda2_allocator_t* allocator = |
| (iree_hal_cuda2_allocator_t*)base_allocator; |
| return allocator->host_allocator; |
| } |
| |
| static iree_status_t iree_hal_cuda2_allocator_trim( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator) { |
| return iree_ok_status(); |
| } |
| |
| static void iree_hal_cuda2_allocator_query_statistics( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator, |
| iree_hal_allocator_statistics_t* IREE_RESTRICT out_statistics) { |
| IREE_STATISTICS({ |
| iree_hal_cuda2_allocator_t* allocator = |
| iree_hal_cuda2_allocator_cast(base_allocator); |
| memcpy(out_statistics, &allocator->statistics, sizeof(*out_statistics)); |
| if (allocator->pools) { |
| iree_hal_cuda2_memory_pools_merge_statistics(allocator->pools, |
| out_statistics); |
| } |
| }); |
| } |
| |
| static iree_status_t iree_hal_cuda2_allocator_query_memory_heaps( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator, |
| iree_host_size_t capacity, |
| iree_hal_allocator_memory_heap_t* IREE_RESTRICT heaps, |
| iree_host_size_t* IREE_RESTRICT out_count) { |
| IREE_ASSERT_ARGUMENT(base_allocator); |
| IREE_ASSERT_ARGUMENT(heaps); |
| IREE_ASSERT_ARGUMENT(out_count); |
| |
| iree_hal_cuda2_allocator_t* allocator = |
| iree_hal_cuda2_allocator_cast(base_allocator); |
| |
| // TODO(benvanik): check CU_DEVICE_ATTRIBUTE_INTEGRATED and return a unified |
| // set of heaps (likely still cached and uncached variants, at minimum).
| iree_host_size_t count = 3; |
| if (allocator->supports_concurrent_managed_access) { |
| ++count; // device-local | host-visible |
| } |
| if (out_count) *out_count = count; |
| if (capacity < count) { |
| // NOTE: return a lightweight status as this path is hit during normal
| // pre-sizing queries.
| return iree_status_from_code(IREE_STATUS_OUT_OF_RANGE); |
| } |
| |
| // There does not appear to be a query for these limits. The max allocation
| // size may be much smaller for certain memory types (such as page-locked
| // memory) and it would be good to enforce that.
| const iree_device_size_t max_allocation_size = ~(iree_device_size_t)0; |
| const iree_device_size_t min_alignment = 64; |
| |
| iree_host_size_t i = 0;
| |
| // Device-local memory (dispatch resources): |
| heaps[i++] = (iree_hal_allocator_memory_heap_t){ |
| .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL, |
| .allowed_usage = |
| IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_DISPATCH, |
| .max_allocation_size = max_allocation_size, |
| .min_alignment = min_alignment, |
| }; |
| |
| if (allocator->supports_concurrent_managed_access) { |
| // Device-local managed memory with host mapping support: |
| heaps[i++] = (iree_hal_allocator_memory_heap_t){ |
| .type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | |
| IREE_HAL_MEMORY_TYPE_HOST_VISIBLE | |
| IREE_HAL_MEMORY_TYPE_HOST_COHERENT, |
| .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | |
| IREE_HAL_BUFFER_USAGE_DISPATCH | |
| IREE_HAL_BUFFER_USAGE_MAPPING, |
| .max_allocation_size = max_allocation_size, |
| .min_alignment = min_alignment, |
| }; |
| } |
| |
| // Write-combined page-locked host-local memory (upload): |
| heaps[i++] = (iree_hal_allocator_memory_heap_t){ |
| .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | |
| IREE_HAL_MEMORY_TYPE_HOST_LOCAL | |
| IREE_HAL_MEMORY_TYPE_HOST_COHERENT, |
| .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | |
| IREE_HAL_BUFFER_USAGE_DISPATCH | |
| IREE_HAL_BUFFER_USAGE_MAPPING, |
| .max_allocation_size = max_allocation_size, |
| .min_alignment = min_alignment, |
| }; |
| |
| // Cached page-locked host-local memory (download): |
| heaps[i++] = (iree_hal_allocator_memory_heap_t){ |
| .type = IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE | |
| IREE_HAL_MEMORY_TYPE_HOST_LOCAL | |
| IREE_HAL_MEMORY_TYPE_HOST_COHERENT | |
| IREE_HAL_MEMORY_TYPE_HOST_CACHED, |
| .allowed_usage = IREE_HAL_BUFFER_USAGE_TRANSFER | |
| IREE_HAL_BUFFER_USAGE_DISPATCH | |
| IREE_HAL_BUFFER_USAGE_MAPPING, |
| .max_allocation_size = max_allocation_size, |
| .min_alignment = min_alignment, |
| }; |
| |
| IREE_ASSERT(i == count); |
| return iree_ok_status(); |
| } |
| |
| static iree_hal_buffer_compatibility_t |
| iree_hal_cuda2_allocator_query_buffer_compatibility( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator, |
| iree_hal_buffer_params_t* IREE_RESTRICT params, |
| iree_device_size_t* IREE_RESTRICT allocation_size) { |
| iree_hal_cuda2_allocator_t* allocator = |
| iree_hal_cuda2_allocator_cast(base_allocator); |
| |
| // All buffers can be allocated on the heap. |
| iree_hal_buffer_compatibility_t compatibility = |
| IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; |
| |
| // Buffers are importable in CUDA in most cases, though performance may
| // vary wildly. We don't fully verify that the buffer parameters are |
| // self-consistent and just look at whether we can get a device pointer. |
| if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { |
| compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE; |
| } |
| |
| // Buffers can only be used on the queue if they are device visible. |
| if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { |
| if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { |
| compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; |
| } |
| if (iree_any_bit_set(params->usage, |
| IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { |
| compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; |
| } |
| } |
| |
| // If concurrent managed access is not supported then make device-local + |
| // host-visible allocations fall back to host-local + device-visible |
| // page-locked memory. This will be significantly slower for the device to |
| // access but the compiler only uses this type for readback staging buffers,
| // and it's better to function slowly than to not function at all.
| if (!allocator->supports_concurrent_managed_access && |
| iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | |
| IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { |
| compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_LOW_PERFORMANCE; |
| params->type &= ~(IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | |
| IREE_HAL_MEMORY_TYPE_HOST_VISIBLE); |
| params->type |= |
| IREE_HAL_MEMORY_TYPE_HOST_LOCAL | IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE; |
| } |
| |
| // We are now optimal. |
| params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; |
| |
| // Guard against the corner case where the requested buffer size is 0. The
| // application is unlikely to do anything useful with a 0-byte buffer, but
| // such requests do happen in real-world use; at minimum we should not crash.
| if (*allocation_size == 0) *allocation_size = 4; |
| |
| return compatibility; |
| } |
| |
| static void iree_hal_cuda2_buffer_free( |
| const iree_hal_cuda2_dynamic_symbols_t* cuda_symbols, |
| iree_hal_cuda2_buffer_type_t buffer_type, CUdeviceptr device_ptr, |
| void* host_ptr) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| switch (buffer_type) { |
| case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: { |
| IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFree"); |
| IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFree(device_ptr)); |
| break; |
| } |
| case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { |
| IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemFreeHost"); |
| IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemFreeHost(host_ptr)); |
| break; |
| } |
| case IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED: { |
| IREE_TRACE_ZONE_APPEND_TEXT(z0, "cuMemHostUnregister"); |
| IREE_CUDA_IGNORE_ERROR(cuda_symbols, cuMemHostUnregister(host_ptr)); |
| break; |
| } |
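| // Async (stream-ordered) allocations are expected to be released elsewhere
| // via queue-ordered deallocation; nothing to free here.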
| case IREE_HAL_CUDA_BUFFER_TYPE_ASYNC: { |
| IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; async)"); |
| break; |
| } |
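| // Externally-owned allocations are released by their owner (e.g. via the
| // buffer release callback); nothing to free here.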
| case IREE_HAL_CUDA_BUFFER_TYPE_EXTERNAL: { |
| IREE_TRACE_ZONE_APPEND_TEXT(z0, "(ignored; external)"); |
| break; |
| } |
| } |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| static iree_status_t iree_hal_cuda2_allocator_allocate_buffer( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator, |
| const iree_hal_buffer_params_t* IREE_RESTRICT params, |
| iree_device_size_t allocation_size, iree_const_byte_span_t initial_data, |
| iree_hal_buffer_t** IREE_RESTRICT out_buffer) { |
| IREE_ASSERT_ARGUMENT(base_allocator); |
| IREE_ASSERT_ARGUMENT(params); |
| IREE_ASSERT_ARGUMENT(out_buffer); |
| iree_hal_cuda2_allocator_t* allocator = |
| iree_hal_cuda2_allocator_cast(base_allocator); |
| |
| // Coerce options into those required by the current device. |
| iree_hal_buffer_params_t compat_params = *params; |
| iree_hal_buffer_compatibility_t compatibility = |
| iree_hal_cuda2_allocator_query_buffer_compatibility( |
| base_allocator, &compat_params, &allocation_size); |
| if (!iree_all_bits_set(compatibility, |
| IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { |
| #if IREE_STATUS_MODE |
| iree_bitfield_string_temp_t temp0, temp1, temp2; |
| iree_string_view_t memory_type_str = |
| iree_hal_memory_type_format(params->type, &temp0); |
| iree_string_view_t usage_str = |
| iree_hal_buffer_usage_format(params->usage, &temp1); |
| iree_string_view_t compatibility_str = |
| iree_hal_buffer_compatibility_format(compatibility, &temp2); |
| return iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "allocator cannot allocate a buffer with the given parameters; " |
| "memory_type=%.*s, usage=%.*s, compatibility=%.*s", |
| (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, |
| usage_str.data, (int)compatibility_str.size, compatibility_str.data); |
| #else |
| return iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "allocator cannot allocate a buffer with the given parameters"); |
| #endif // IREE_STATUS_MODE |
| } |
| |
| iree_status_t status = iree_ok_status(); |
| iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; |
| void* host_ptr = NULL; |
| CUdeviceptr device_ptr = 0; |
| IREE_TRACE_ZONE_BEGIN_NAMED(z0, "iree_hal_cuda2_buffer_allocate"); |
| IREE_TRACE_ZONE_APPEND_VALUE(z0, allocation_size); |
| if (iree_all_bits_set(compat_params.type, |
| IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { |
| if (iree_all_bits_set(compat_params.type, |
| IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { |
| // Device local + host visible. |
| buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; |
| status = IREE_CURESULT_TO_STATUS( |
| allocator->symbols, cuMemAllocManaged(&device_ptr, allocation_size, |
| CU_MEM_ATTACH_GLOBAL)); |
| if (iree_status_is_ok(status) && |
| allocator->supports_concurrent_managed_access) { |
| // Prefetch the buffer to the GPU stream. |
| status = IREE_CURESULT_TO_STATUS( |
| allocator->symbols, |
| cuMemPrefetchAsync(device_ptr, allocation_size, allocator->device, |
| allocator->stream)); |
| } |
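| // Managed memory is addressable from both the host and the device through
| // the same pointer value.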
| host_ptr = (void*)device_ptr; |
| } else { |
| // Device only. |
| buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; |
| status = IREE_CURESULT_TO_STATUS( |
| allocator->symbols, cuMemAlloc(&device_ptr, allocation_size)); |
| } |
| } else { |
| // Host local cases. |
| buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST; |
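| // Allocate page-locked host memory that is mapped into the device address
| // space. Unless the caller requests cached host access the memory is
| // write-combined, which is efficient for host writes and device reads
| // (uploads) but slow for host reads.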
| unsigned int flags = CU_MEMHOSTALLOC_DEVICEMAP; |
| if (!iree_all_bits_set(compat_params.type, |
| IREE_HAL_MEMORY_TYPE_HOST_CACHED)) { |
| flags |= CU_MEMHOSTALLOC_WRITECOMBINED; |
| } |
| status = IREE_CURESULT_TO_STATUS( |
| allocator->symbols, cuMemHostAlloc(&host_ptr, allocation_size, flags)); |
| if (iree_status_is_ok(status)) { |
| status = IREE_CURESULT_TO_STATUS( |
| allocator->symbols, |
| cuMemHostGetDevicePointer(&device_ptr, host_ptr, /*flags=*/0)); |
| } |
| } |
| IREE_TRACE_ZONE_END(z0); |
| |
| iree_hal_buffer_t* buffer = NULL; |
| if (iree_status_is_ok(status)) { |
| status = iree_hal_cuda2_buffer_wrap( |
| base_allocator, compat_params.type, compat_params.access, |
| compat_params.usage, allocation_size, |
| /*byte_offset=*/0, |
| /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, |
| iree_hal_buffer_release_callback_null(), |
| iree_hal_allocator_host_allocator(base_allocator), &buffer); |
| } |
| |
| // Copy the initial contents into the buffer. This may require staging. |
| if (iree_status_is_ok(status) && |
| !iree_const_byte_span_is_empty(initial_data)) { |
| status = iree_hal_device_transfer_range( |
| allocator->base_device, |
| iree_hal_make_host_transfer_buffer_span((void*)initial_data.data, |
| initial_data.data_length), |
| 0, iree_hal_make_device_transfer_buffer(buffer), 0, |
| initial_data.data_length, IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, |
| iree_infinite_timeout()); |
| } |
| |
| if (iree_status_is_ok(status)) { |
| IREE_TRACE_ALLOC_NAMED(IREE_HAL_CUDA_ALLOCATOR_ID, |
| (void*)iree_hal_cuda2_buffer_device_pointer(buffer), |
| allocation_size); |
| IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( |
| &allocator->statistics, compat_params.type, allocation_size)); |
| *out_buffer = buffer; |
| } else { |
| if (!buffer) { |
| iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, |
| host_ptr); |
| } else { |
| iree_hal_buffer_release(buffer); |
| } |
| } |
| return status; |
| } |
| |
| static void iree_hal_cuda2_allocator_deallocate_buffer( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator, |
| iree_hal_buffer_t* IREE_RESTRICT base_buffer) { |
| IREE_ASSERT_ARGUMENT(base_allocator); |
| IREE_ASSERT_ARGUMENT(base_buffer); |
| iree_hal_cuda2_allocator_t* allocator = |
| iree_hal_cuda2_allocator_cast(base_allocator); |
| |
| const iree_hal_cuda2_buffer_type_t buffer_type = |
| iree_hal_cuda2_buffer_type(base_buffer); |
| |
| // WARNING: we may be called from a random thread and need to ensure that we |
| // have an active CUDA context. Unfortunately CUDA is CUDA and trying to |
| // change the context here will result in full device synchronization. In the |
| // future we'll need to do something fairly complex such as having a dedicated |
| // thread with a persistently bound context that does nothing but free |
| // buffers. The load on this will be lighter when queue-ordered allocations |
| // are used or any sort of pooling policy is applied. |
| // |
| // WARNING: with CUDA's lazy error propagation it's possible that by the time |
| // this code is running something else has triggered device loss and we can't |
| // actually use the context. In that case we can't perform the frees and want |
| // to silently ignore them: whatever the user tries to do next will fail in |
| // the same way and if we were deallocating this buffer as part of a tear-down |
| // on failure we don't want to end up dying during cleanup. |
| iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, |
| iree_hal_cuda2_buffer_device_pointer(base_buffer), |
| iree_hal_cuda2_buffer_host_pointer(base_buffer)); |
| |
| switch (buffer_type) { |
| case IREE_HAL_CUDA_BUFFER_TYPE_DEVICE: |
| case IREE_HAL_CUDA_BUFFER_TYPE_HOST: { |
| IREE_TRACE_FREE_NAMED( |
| IREE_HAL_CUDA_ALLOCATOR_ID, |
| (void*)iree_hal_cuda2_buffer_device_pointer(base_buffer)); |
| IREE_STATISTICS(iree_hal_allocator_statistics_record_free( |
| &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), |
| iree_hal_buffer_allocation_size(base_buffer))); |
| break; |
| } |
| default: |
| // Buffer type not tracked. |
| break; |
| } |
| |
| iree_hal_buffer_destroy(base_buffer); |
| } |
| |
| static iree_status_t iree_hal_cuda2_allocator_import_buffer( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator, |
| const iree_hal_buffer_params_t* IREE_RESTRICT params, |
| iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, |
| iree_hal_buffer_release_callback_t release_callback, |
| iree_hal_buffer_t** IREE_RESTRICT out_buffer) { |
| IREE_ASSERT_ARGUMENT(base_allocator); |
| IREE_ASSERT_ARGUMENT(params); |
| IREE_ASSERT_ARGUMENT(external_buffer); |
| IREE_ASSERT_ARGUMENT(out_buffer); |
| iree_hal_cuda2_allocator_t* allocator = |
| iree_hal_cuda2_allocator_cast(base_allocator); |
| |
| // Coerce options into those required by the current device. |
| iree_hal_buffer_params_t compat_params = *params; |
| iree_device_size_t allocation_size = external_buffer->size; |
| iree_hal_buffer_compatibility_t compatibility = |
| iree_hal_cuda2_allocator_query_buffer_compatibility( |
| base_allocator, &compat_params, &allocation_size); |
| if (!iree_all_bits_set(compatibility, |
| IREE_HAL_BUFFER_COMPATIBILITY_IMPORTABLE)) { |
| #if IREE_STATUS_MODE |
| iree_bitfield_string_temp_t temp0, temp1, temp2; |
| iree_string_view_t memory_type_str = |
| iree_hal_memory_type_format(params->type, &temp0); |
| iree_string_view_t usage_str = |
| iree_hal_buffer_usage_format(params->usage, &temp1); |
| iree_string_view_t compatibility_str = |
| iree_hal_buffer_compatibility_format(compatibility, &temp2); |
| return iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "allocator cannot import a buffer with the given parameters; " |
| "memory_type=%.*s, usage=%.*s, compatibility=%.*s", |
| (int)memory_type_str.size, memory_type_str.data, (int)usage_str.size, |
| usage_str.data, (int)compatibility_str.size, compatibility_str.data); |
| #else |
| return iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "allocator cannot import a buffer with the given parameters"); |
| #endif // IREE_STATUS_MODE |
| } |
| |
| iree_status_t status = iree_ok_status(); |
| iree_hal_cuda2_buffer_type_t buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_DEVICE; |
| void* host_ptr = NULL; |
| CUdeviceptr device_ptr = 0; |
| |
| switch (external_buffer->type) { |
| case IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION: { |
| if (iree_all_bits_set(compat_params.type, |
| IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL)) { |
| return iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "unable to register host allocations as device-local memory"); |
| } |
| buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_HOST_REGISTERED; |
| host_ptr = external_buffer->handle.host_allocation.ptr; |
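| // Select registration flags: read-only access allows the driver to register
| // the pages as read-only on the device, while any dispatch usage requires
| // mapping the registration into the device address space.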
| uint32_t register_flags = 0; |
| if (compat_params.access == IREE_HAL_MEMORY_ACCESS_READ) { |
| register_flags = CU_MEMHOSTREGISTER_READ_ONLY; |
| } |
| if (iree_any_bit_set(compat_params.usage, |
| IREE_HAL_BUFFER_USAGE_DISPATCH_INDIRECT_PARAMS | |
| IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ | |
| IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE | |
| IREE_HAL_BUFFER_USAGE_DISPATCH_IMAGE)) { |
| register_flags |= CU_MEMHOSTREGISTER_DEVICEMAP;
| } |
| status = IREE_CURESULT_TO_STATUS( |
| allocator->symbols, |
| cuMemHostRegister(host_ptr, external_buffer->size, register_flags), |
| "cuMemHostRegister"); |
| if (iree_status_is_ok(status)) { |
| status = IREE_CURESULT_TO_STATUS( |
| allocator->symbols, |
| cuMemHostGetDevicePointer(&device_ptr, host_ptr, 0), |
| "cuMemHostGetDevicePointer"); |
| } |
| break; |
| } |
| case IREE_HAL_EXTERNAL_BUFFER_TYPE_DEVICE_ALLOCATION: { |
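| // Wrap the caller-provided device pointer without taking ownership; it is
| // treated as external and will not be freed by the allocator.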
| buffer_type = IREE_HAL_CUDA_BUFFER_TYPE_EXTERNAL; |
| device_ptr = (CUdeviceptr)external_buffer->handle.device_allocation.ptr; |
| break; |
| } |
| case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_FD: |
| case IREE_HAL_EXTERNAL_BUFFER_TYPE_OPAQUE_WIN32: |
| return iree_make_status(IREE_STATUS_UNIMPLEMENTED, |
| "handle-based imports not yet implemented"); |
| default: |
| return iree_make_status(IREE_STATUS_UNIMPLEMENTED, |
| "external buffer type not supported"); |
| } |
| |
| iree_hal_buffer_t* buffer = NULL; |
| if (iree_status_is_ok(status)) { |
| status = iree_hal_cuda2_buffer_wrap( |
| base_allocator, compat_params.type, compat_params.access, |
| compat_params.usage, external_buffer->size, /*byte_offset=*/0, |
| /*byte_length=*/external_buffer->size, buffer_type, device_ptr, |
| host_ptr, release_callback, |
| iree_hal_allocator_host_allocator(base_allocator), &buffer); |
| } |
| |
| if (iree_status_is_ok(status)) { |
| *out_buffer = buffer; |
| } else { |
| if (!buffer) { |
| iree_hal_cuda2_buffer_free(allocator->symbols, buffer_type, device_ptr, |
| host_ptr); |
| } else { |
| iree_hal_buffer_release(buffer); |
| } |
| } |
| return status; |
| } |
| |
| static iree_status_t iree_hal_cuda2_allocator_export_buffer( |
| iree_hal_allocator_t* IREE_RESTRICT base_allocator, |
| iree_hal_buffer_t* IREE_RESTRICT buffer, |
| iree_hal_external_buffer_type_t requested_type, |
| iree_hal_external_buffer_flags_t requested_flags, |
| iree_hal_external_buffer_t* IREE_RESTRICT out_external_buffer) { |
| return iree_make_status(IREE_STATUS_UNIMPLEMENTED, |
| "exporting to external buffers not supported"); |
| } |
| |
| static const iree_hal_allocator_vtable_t iree_hal_cuda2_allocator_vtable = { |
| .destroy = iree_hal_cuda2_allocator_destroy, |
| .host_allocator = iree_hal_cuda2_allocator_host_allocator, |
| .trim = iree_hal_cuda2_allocator_trim, |
| .query_statistics = iree_hal_cuda2_allocator_query_statistics, |
| .query_memory_heaps = iree_hal_cuda2_allocator_query_memory_heaps, |
| .query_buffer_compatibility = |
| iree_hal_cuda2_allocator_query_buffer_compatibility, |
| .allocate_buffer = iree_hal_cuda2_allocator_allocate_buffer, |
| .deallocate_buffer = iree_hal_cuda2_allocator_deallocate_buffer, |
| .import_buffer = iree_hal_cuda2_allocator_import_buffer, |
| .export_buffer = iree_hal_cuda2_allocator_export_buffer, |
| }; |