// Copyright 2023 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <stdint.h>
#include <string.h>

#include "experimental/cuda2/api.h"
#include "experimental/cuda2/cuda_device.h"
#include "experimental/cuda2/cuda_dynamic_symbols.h"
#include "experimental/cuda2/cuda_status_util.h"
#include "experimental/cuda2/nccl_dynamic_symbols.h"
#include "experimental/cuda2/nccl_status_util.h"
#include "iree/base/api.h"
#include "iree/hal/api.h"

// Maximum device name length supported by the CUDA HAL driver.
#define IREE_HAL_CUDA_MAX_DEVICE_NAME_LENGTH 128

// Utility macros to convert between CUDevice and iree_hal_device_id_t.
#define IREE_CUDEVICE_TO_DEVICE_ID(device) (iree_hal_device_id_t)((device) + 1)
#define IREE_DEVICE_ID_TO_CUDEVICE(device_id) (CUdevice)((device_id)-1)

typedef struct iree_hal_cuda2_driver_t {
  // Abstract resource used for injecting reference counting and vtable;
  // must be at offset 0.
  iree_hal_resource_t resource;

  iree_allocator_t host_allocator;

  // Identifier used for registering the driver in the IREE driver registry.
  iree_string_view_t identifier;
  // CUDA driver API dynamic symbols to interact with the CUDA system.
  iree_hal_cuda2_dynamic_symbols_t cuda_symbols;
  // NCCL API dynamic symbols to interact with the CUDA system.
  iree_hal_cuda2_nccl_dynamic_symbols_t nccl_symbols;

  // The default parameters for creating devices using this driver.
  iree_hal_cuda2_device_params_t device_params;

  // The index of the default CUDA device to use if multiple ones are available.
  int default_device_index;
} iree_hal_cuda2_driver_t;

static const iree_hal_driver_vtable_t iree_hal_cuda2_driver_vtable;

static iree_hal_cuda2_driver_t* iree_hal_cuda2_driver_cast(
    iree_hal_driver_t* base_value) {
  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_cuda2_driver_vtable);
  return (iree_hal_cuda2_driver_t*)base_value;
}

IREE_API_EXPORT void iree_hal_cuda2_driver_options_initialize(
    iree_hal_cuda2_driver_options_t* out_options) {
  IREE_ASSERT_ARGUMENT(out_options);
  memset(out_options, 0, sizeof(*out_options));
  out_options->default_device_index = 0;
}

static iree_status_t iree_hal_cuda2_driver_create_internal(
    iree_string_view_t identifier,
    const iree_hal_cuda2_driver_options_t* options,
    const iree_hal_cuda2_device_params_t* device_params,
    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
  iree_hal_cuda2_driver_t* driver = NULL;
  iree_host_size_t total_size = iree_sizeof_struct(*driver) + identifier.size;
  IREE_RETURN_IF_ERROR(
      iree_allocator_malloc(host_allocator, total_size, (void**)&driver));

  iree_hal_resource_initialize(&iree_hal_cuda2_driver_vtable,
                               &driver->resource);
  driver->host_allocator = host_allocator;
  iree_string_view_append_to_buffer(
      identifier, &driver->identifier,
      (char*)driver + iree_sizeof_struct(*driver));
  driver->default_device_index = options->default_device_index;

  iree_status_t status = iree_hal_cuda2_dynamic_symbols_initialize(
      host_allocator, &driver->cuda_symbols);

  if (iree_status_is_ok(status)) {
    // Try to dynamically load NCCL. This will fail if NCCL is unavailable or
    // incompatible. We only fail on unavailability when the user tries to
    // create a channel and otherwise defer reporting.
    status = iree_hal_cuda2_nccl_dynamic_symbols_initialize(
        host_allocator, &driver->cuda_symbols, &driver->nccl_symbols);
    if (iree_status_is_unavailable(status)) status = iree_status_ignore(status);
  }

  memcpy(&driver->device_params, device_params, sizeof(driver->device_params));

  if (iree_status_is_ok(status)) {
    *out_driver = (iree_hal_driver_t*)driver;
  } else {
    iree_hal_driver_release((iree_hal_driver_t*)driver);
  }
  return status;
}

IREE_API_EXPORT iree_status_t iree_hal_cuda2_driver_create(
    iree_string_view_t identifier,
    const iree_hal_cuda2_driver_options_t* options,
    const iree_hal_cuda2_device_params_t* device_params,
    iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) {
  IREE_ASSERT_ARGUMENT(options);
  IREE_ASSERT_ARGUMENT(device_params);
  IREE_ASSERT_ARGUMENT(out_driver);
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_status_t status = iree_hal_cuda2_driver_create_internal(
      identifier, options, device_params, host_allocator, out_driver);

  IREE_TRACE_ZONE_END(z0);
  return status;
}

static void iree_hal_cuda2_driver_destroy(iree_hal_driver_t* base_driver) {
  IREE_ASSERT_ARGUMENT(base_driver);

  iree_hal_cuda2_driver_t* driver = iree_hal_cuda2_driver_cast(base_driver);
  iree_allocator_t host_allocator = driver->host_allocator;
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_hal_cuda2_nccl_dynamic_symbols_deinitialize(&driver->nccl_symbols);
  iree_hal_cuda2_dynamic_symbols_deinitialize(&driver->cuda_symbols);
  iree_allocator_free(host_allocator, driver);

  IREE_TRACE_ZONE_END(z0);
}

// Initializes the CUDA system.
static iree_status_t iree_hal_cuda2_init(iree_hal_cuda2_driver_t* driver) {
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_status_t status =
      IREE_CURESULT_TO_STATUS(&driver->cuda_symbols, cuInit(0), "cuInit");
  IREE_TRACE_ZONE_END(z0);
  return status;
}

// Populates device information from the given CUDA physical device handle.
// |out_device_info| must point to valid memory and additional data will be
// appended to |buffer_ptr| and the new pointer is returned.
static iree_status_t iree_hal_cuda2_populate_device_info(
    CUdevice device, iree_hal_cuda2_dynamic_symbols_t* syms,
    uint8_t* buffer_ptr, uint8_t** out_buffer_ptr,
    iree_hal_device_info_t* out_device_info) {
  *out_buffer_ptr = buffer_ptr;

  char device_name[IREE_HAL_CUDA_MAX_DEVICE_NAME_LENGTH];
  IREE_CUDA_RETURN_IF_ERROR(
      syms, cuDeviceGetName(device_name, sizeof(device_name), device),
      "cuDeviceGetName");
  memset(out_device_info, 0, sizeof(*out_device_info));
  out_device_info->device_id = IREE_CUDEVICE_TO_DEVICE_ID(device);

  // This matches the output of `nvidia-smi -L`.
  CUuuid device_uuid;
  IREE_CUDA_RETURN_IF_ERROR(syms, cuDeviceGetUuid(&device_uuid, device),
                            "cuDeviceGetUuid");
  char device_path_str[4 + 36 + 1] = {0};
  snprintf(device_path_str, sizeof(device_path_str),
           "GPU-"
           "%02x%02x%02x%02x-"
           "%02x%02x-"
           "%02x%02x-"
           "%02x%02x-"
           "%02x%02x%02x%02x%02x%02x",
           (uint8_t)device_uuid.bytes[0], (uint8_t)device_uuid.bytes[1],
           (uint8_t)device_uuid.bytes[2], (uint8_t)device_uuid.bytes[3],
           (uint8_t)device_uuid.bytes[4], (uint8_t)device_uuid.bytes[5],
           (uint8_t)device_uuid.bytes[6], (uint8_t)device_uuid.bytes[7],
           (uint8_t)device_uuid.bytes[8], (uint8_t)device_uuid.bytes[9],
           (uint8_t)device_uuid.bytes[10], (uint8_t)device_uuid.bytes[11],
           (uint8_t)device_uuid.bytes[12], (uint8_t)device_uuid.bytes[13],
           (uint8_t)device_uuid.bytes[14], (uint8_t)device_uuid.bytes[15]);
  buffer_ptr += iree_string_view_append_to_buffer(
      iree_make_string_view(device_path_str,
                            IREE_ARRAYSIZE(device_path_str) - 1),
      &out_device_info->path, (char*)buffer_ptr);

  iree_string_view_t device_name_str =
      iree_make_string_view(device_name, strlen(device_name));
  buffer_ptr += iree_string_view_append_to_buffer(
      device_name_str, &out_device_info->name, (char*)buffer_ptr);

  *out_buffer_ptr = buffer_ptr;
  return iree_ok_status();
}

// Returns true if the device meets all the required capabilities.
static bool iree_hal_cuda2_is_valid_device(iree_hal_cuda2_driver_t* driver,
                                           CUdevice device) {
  return true;
}

static iree_status_t iree_hal_cuda2_driver_query_available_devices(
    iree_hal_driver_t* base_driver, iree_allocator_t host_allocator,
    iree_host_size_t* out_device_info_count,
    iree_hal_device_info_t** out_device_infos) {
  IREE_ASSERT_ARGUMENT(base_driver);
  IREE_ASSERT_ARGUMENT(out_device_info_count);
  IREE_ASSERT_ARGUMENT(out_device_infos);
  iree_hal_cuda2_driver_t* driver = iree_hal_cuda2_driver_cast(base_driver);
  IREE_TRACE_ZONE_BEGIN(z0);

  // Ensure CUDA is initialized before querying it.
  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_hal_cuda2_init(driver));

  // Query the number of available CUDA devices.
  int device_count = 0;
  IREE_CUDA_RETURN_AND_END_ZONE_IF_ERROR(z0, &driver->cuda_symbols,
                                         cuDeviceGetCount(&device_count),
                                         "cuDeviceGetCount");

  // Allocate the return infos and populate with the devices.
  iree_hal_device_info_t* device_infos = NULL;
  iree_host_size_t total_size =
      device_count * (sizeof(iree_hal_device_info_t) +
                      IREE_HAL_CUDA_MAX_DEVICE_NAME_LENGTH * sizeof(char));
  iree_status_t status =
      iree_allocator_malloc(host_allocator, total_size, (void**)&device_infos);

  int valid_device_count = 0;
  if (iree_status_is_ok(status)) {
    uint8_t* buffer_ptr =
        (uint8_t*)device_infos + device_count * sizeof(iree_hal_device_info_t);
    for (iree_host_size_t i = 0; i < device_count; ++i) {
      CUdevice device = 0;
      status = IREE_CURESULT_TO_STATUS(&driver->cuda_symbols,
                                       cuDeviceGet(&device, i), "cuDeviceGet");
      if (!iree_status_is_ok(status)) break;
      if (!iree_hal_cuda2_is_valid_device(driver, device)) continue;
      status = iree_hal_cuda2_populate_device_info(
          device, &driver->cuda_symbols, buffer_ptr, &buffer_ptr,
          &device_infos[valid_device_count]);
      if (!iree_status_is_ok(status)) break;
      valid_device_count++;
    }
  }
  if (iree_status_is_ok(status)) {
    *out_device_info_count = valid_device_count;
    *out_device_infos = device_infos;
  } else {
    iree_allocator_free(host_allocator, device_infos);
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}

static iree_status_t iree_hal_cuda2_driver_dump_device_info(
    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
    iree_string_builder_t* builder) {
  IREE_ASSERT_ARGUMENT(base_driver);
  IREE_ASSERT_ARGUMENT(builder);
  iree_hal_cuda2_driver_t* driver = iree_hal_cuda2_driver_cast(base_driver);
  CUdevice device = IREE_DEVICE_ID_TO_CUDEVICE(device_id);

#define IREE_CUDA_QUERY_ATTRIBUTE(attribute, value)                          \
  IREE_CUDA_RETURN_IF_ERROR(                                                 \
      &driver->cuda_symbols,                                                 \
      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attribute, device), \
      "cuDeviceGetAttribute");

  int compute_capability_major = 0, compute_capability_minor = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(COMPUTE_CAPABILITY_MAJOR, compute_capability_major);
  IREE_CUDA_QUERY_ATTRIBUTE(COMPUTE_CAPABILITY_MINOR, compute_capability_minor);
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- gpu-compute-capability: %d.%d", compute_capability_major,
      compute_capability_minor));

  int driver_version = 0;
  IREE_CUDA_RETURN_IF_ERROR(&driver->cuda_symbols,
                            cuDriverGetVersion(&driver_version),
                            "cuDriverGetVersion");
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- driver-max-cuda-version: %d.%d", driver_version / 1000,
      (driver_version % 1000) / 10));

  // Launch configuration limits.
  int max_block_dim_x = 0, max_block_dim_y = 0, max_block_dim_z = 0;
  int max_grid_dim_x = 0, max_grid_dim_y = 0, max_grid_dim_z = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_BLOCK_DIM_X, max_block_dim_x);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_BLOCK_DIM_Y, max_block_dim_y);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_BLOCK_DIM_Z, max_block_dim_z);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_GRID_DIM_X, max_grid_dim_x);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_GRID_DIM_Y, max_grid_dim_y);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_GRID_DIM_Z, max_grid_dim_z);

  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "\n"));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- launch-max-block-dims: (%d, %d, %d)", max_block_dim_x,
      max_block_dim_y, max_block_dim_z));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- launch-max-grid-dims: (%d, %d, %d)", max_grid_dim_x,
      max_grid_dim_y, max_grid_dim_z));

  // Per block resource limits.
  int max_threads_per_block = 0;
  int max_registers_per_block = 0;
  int max_shared_memory_per_block = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_THREADS_PER_BLOCK, max_threads_per_block);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_REGISTERS_PER_BLOCK, max_registers_per_block);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_SHARED_MEMORY_PER_BLOCK,
                            max_shared_memory_per_block);

  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "\n"));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- block-max-thread-count: %d", max_threads_per_block));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- block-max-32-bit-register-count: %d",
      max_registers_per_block));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- block-max-shared-memory: %d bytes",
      max_shared_memory_per_block));

  // Per multiprocessor resource limits.
  int max_threads_per_multiprocessor = 0;
  int max_blocks_per_multiprocessor = 0;
  int max_registers_per_multiprocessor = 0;
  int max_shared_memory_per_multiprocessor = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_THREADS_PER_MULTIPROCESSOR,
                            max_threads_per_multiprocessor);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_BLOCKS_PER_MULTIPROCESSOR,
                            max_blocks_per_multiprocessor);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_REGISTERS_PER_MULTIPROCESSOR,
                            max_registers_per_multiprocessor);
  IREE_CUDA_QUERY_ATTRIBUTE(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
                            max_shared_memory_per_multiprocessor);

  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "\n"));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- multiprocessor-max-thread-count: %d",
      max_threads_per_multiprocessor));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- multiprocessor-max-block-count: %d",
      max_blocks_per_multiprocessor));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- multiprocessor-max-32-bit-register-count: %d",
      max_registers_per_multiprocessor));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- multiprocessor-max-shared-memory: %d bytes",
      max_shared_memory_per_multiprocessor));

  // Memory characteristics.
  int is_integrated_memory = 0;
  int has_unified_address_space = 0;
  int supports_managed_memory = 0;
  int can_map_host_memory = 0;
  int supports_pageable_memory_access = 0;
  int supports_concurrent_managed_access = 0;
  int supports_memory_pools = 0;
  int l2_cache_size = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(INTEGRATED, is_integrated_memory);
  IREE_CUDA_QUERY_ATTRIBUTE(UNIFIED_ADDRESSING, has_unified_address_space);
  IREE_CUDA_QUERY_ATTRIBUTE(MANAGED_MEMORY, supports_managed_memory);
  IREE_CUDA_QUERY_ATTRIBUTE(CAN_MAP_HOST_MEMORY, can_map_host_memory);
  IREE_CUDA_QUERY_ATTRIBUTE(PAGEABLE_MEMORY_ACCESS,
                            supports_pageable_memory_access);
  IREE_CUDA_QUERY_ATTRIBUTE(CONCURRENT_MANAGED_ACCESS,
                            supports_concurrent_managed_access);
  IREE_CUDA_QUERY_ATTRIBUTE(MEMORY_POOLS_SUPPORTED, supports_memory_pools);
  IREE_CUDA_QUERY_ATTRIBUTE(L2_CACHE_SIZE, l2_cache_size);

  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "\n"));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-is-integrated-memory: %d", is_integrated_memory));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-has-unified-address-space: %d",
      has_unified_address_space));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-supports-managed-memory: %d",
      supports_managed_memory));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-can-map-host-memory-to-device: %d",
      can_map_host_memory));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-supports-pageable-memory-access-from-device: %d",
      supports_pageable_memory_access));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-supports-concurrent-managed-access: %d",
      supports_concurrent_managed_access));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-supports-memory-pools: %d", supports_memory_pools));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- memory-l2-cache-size: %d bytes", l2_cache_size));

  int supports_64bit_memops = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(CAN_USE_64_BIT_STREAM_MEM_OPS,
                            supports_64bit_memops);
  int supports_timeline_semaphore_interop = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(TIMELINE_SEMAPHORE_INTEROP_SUPPORTED,
                            supports_timeline_semaphore_interop);
  int mem_sync_domain_count = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(MEM_SYNC_DOMAIN_COUNT, mem_sync_domain_count);

  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "\n"));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- sync-supports-64-bit-stream-mem-ops: %d",
      supports_64bit_memops));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- sync-supports-timeline-semaphore-interop: %d",
      supports_timeline_semaphore_interop));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- sync-mem-domain-count: %d", mem_sync_domain_count));

  // Other GPU characteristics.
  int multiprocessor_count = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(MULTIPROCESSOR_COUNT, multiprocessor_count);
  int clock_rate = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(CLOCK_RATE, clock_rate);
  int warp_size = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(WARP_SIZE, warp_size);
  int execution_timeout = 0;
  IREE_CUDA_QUERY_ATTRIBUTE(KERNEL_EXEC_TIMEOUT, execution_timeout);

  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "\n"));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- gpu-multiprocessor-count: %d", multiprocessor_count));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- gpu-clock-rate: %d kHz", clock_rate));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- gpu-warp-size: %d", warp_size));
  IREE_RETURN_IF_ERROR(iree_string_builder_append_format(
      builder, "\n- kernel-has-execution-timeout: %d", execution_timeout));

  IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, "\n"));

#undef IREE_CUDA_QUERY_ATTRIBUTE

  return iree_ok_status();
}

static iree_status_t iree_hal_cuda2_driver_select_default_device(
    iree_hal_driver_t* base_driver, iree_hal_cuda2_dynamic_symbols_t* syms,
    int default_device_index, iree_allocator_t host_allocator,
    CUdevice* out_device) {
  iree_hal_device_info_t* device_infos = NULL;
  iree_host_size_t device_count = 0;
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_driver_query_available_devices(
      base_driver, host_allocator, &device_count, &device_infos));

  iree_status_t status = iree_ok_status();
  if (device_count == 0) {
    status = iree_make_status(IREE_STATUS_UNAVAILABLE,
                              "no compatible CUDA devices were found");
  } else if (default_device_index >= device_count) {
    status = iree_make_status(IREE_STATUS_NOT_FOUND,
                              "default device %d not found (of %" PRIhsz
                              " enumerated)",
                              default_device_index, device_count);
  } else {
    *out_device = IREE_DEVICE_ID_TO_CUDEVICE(
        device_infos[default_device_index].device_id);
  }
  iree_allocator_free(host_allocator, device_infos);

  return status;
}

static iree_status_t iree_hal_cuda2_driver_create_device_by_id(
    iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id,
    iree_host_size_t param_count, const iree_string_pair_t* params,
    iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
  IREE_ASSERT_ARGUMENT(base_driver);
  IREE_ASSERT_ARGUMENT(out_device);

  iree_hal_cuda2_driver_t* driver = iree_hal_cuda2_driver_cast(base_driver);
  IREE_TRACE_ZONE_BEGIN(z0);

  // Ensure CUDA is initialized before querying it.
  IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_hal_cuda2_init(driver));

  // Use either the specified device (enumerated earlier) or whatever default
  // one was specified when the driver was created.
  CUdevice device = 0;
  if (device_id == IREE_HAL_DEVICE_ID_DEFAULT) {
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, iree_hal_cuda2_driver_select_default_device(
                base_driver, &driver->cuda_symbols,
                driver->default_device_index, host_allocator, &device));
  } else {
    device = IREE_DEVICE_ID_TO_CUDEVICE(device_id);
  }

  iree_string_view_t device_name = iree_make_cstring_view("cuda2");

  // Attempt to create the device now.
  iree_status_t status = iree_hal_cuda2_device_create(
      base_driver, device_name, &driver->device_params, &driver->cuda_symbols,
      &driver->nccl_symbols, device, host_allocator, out_device);

  IREE_TRACE_ZONE_END(z0);
  return status;
}

static iree_status_t iree_hal_cuda2_driver_create_device_by_uuid(
    iree_hal_driver_t* base_driver, iree_string_view_t driver_name,
    const CUuuid* device_uuid, iree_host_size_t param_count,
    const iree_string_pair_t* params, iree_allocator_t host_allocator,
    iree_hal_device_t** out_device) {
  iree_hal_cuda2_driver_t* driver = iree_hal_cuda2_driver_cast(base_driver);

  // Ensure CUDA is initialized before querying it.
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_init(driver));

  // CUDA doesn't have an API to do this so we need to scan all devices to
  // find the one with the matching UUID.
  int device_count = 0;
  IREE_CUDA_RETURN_IF_ERROR(&driver->cuda_symbols,
                            cuDeviceGetCount(&device_count),
                            "cuDeviceGetCount");
  CUdevice device = 0;
  bool found_device = false;
  for (int i = 0; i < device_count; i++) {
    IREE_CUDA_RETURN_IF_ERROR(&driver->cuda_symbols, cuDeviceGet(&device, i),
                              "cuDeviceGet");
    CUuuid query_uuid;
    IREE_CUDA_RETURN_IF_ERROR(&driver->cuda_symbols,
                              cuDeviceGetUuid(&query_uuid, device),
                              "cuDeviceGetUuid");
    if (memcmp(&device_uuid->bytes[0], &query_uuid.bytes[0],
               sizeof(device_uuid)) == 0) {
      found_device = true;
      break;
    }
  }
  if (!found_device) {
    return iree_make_status(
        IREE_STATUS_NOT_FOUND,
        "CUDA device with UUID GPU-"
        "%02x%02x%02x%02x-"
        "%02x%02x-"
        "%02x%02x-"
        "%02x%02x-"
        "%02x%02x%02x%02x%02x%02x"
        " not found",
        (uint8_t)device_uuid->bytes[0], (uint8_t)device_uuid->bytes[1],
        (uint8_t)device_uuid->bytes[2], (uint8_t)device_uuid->bytes[3],
        (uint8_t)device_uuid->bytes[4], (uint8_t)device_uuid->bytes[5],
        (uint8_t)device_uuid->bytes[6], (uint8_t)device_uuid->bytes[7],
        (uint8_t)device_uuid->bytes[8], (uint8_t)device_uuid->bytes[9],
        (uint8_t)device_uuid->bytes[10], (uint8_t)device_uuid->bytes[11],
        (uint8_t)device_uuid->bytes[12], (uint8_t)device_uuid->bytes[13],
        (uint8_t)device_uuid->bytes[14], (uint8_t)device_uuid->bytes[15]);
  }

  iree_status_t status = iree_hal_cuda2_driver_create_device_by_id(
      base_driver, IREE_CUDEVICE_TO_DEVICE_ID(device), param_count, params,
      host_allocator, out_device);

  return status;
}

static iree_status_t iree_hal_cuda2_driver_create_device_by_index(
    iree_hal_driver_t* base_driver, iree_string_view_t driver_name,
    int device_index, iree_host_size_t param_count,
    const iree_string_pair_t* params, iree_allocator_t host_allocator,
    iree_hal_device_t** out_device) {
  iree_hal_cuda2_driver_t* driver = iree_hal_cuda2_driver_cast(base_driver);

  // Ensure CUDA is initialized before querying it.
  IREE_RETURN_IF_ERROR(iree_hal_cuda2_init(driver));

  // Query the number of available CUDA devices.
  int device_count = 0;
  IREE_CUDA_RETURN_IF_ERROR(&driver->cuda_symbols,
                            cuDeviceGetCount(&device_count),
                            "cuDeviceGetCount");
  if (device_index >= device_count) {
    return iree_make_status(IREE_STATUS_NOT_FOUND,
                            "device %d not found (of %d enumerated)",
                            device_index, device_count);
  }

  CUdevice device = 0;
  IREE_CUDA_RETURN_IF_ERROR(&driver->cuda_symbols,
                            cuDeviceGet(&device, device_index), "cuDeviceGet");

  iree_status_t status = iree_hal_cuda2_driver_create_device_by_id(
      base_driver, IREE_CUDEVICE_TO_DEVICE_ID(device), param_count, params,
      host_allocator, out_device);

  return status;
}

static iree_status_t iree_hal_cuda2_driver_create_device_by_path(
    iree_hal_driver_t* base_driver, iree_string_view_t driver_name,
    iree_string_view_t device_path, iree_host_size_t param_count,
    const iree_string_pair_t* params, iree_allocator_t host_allocator,
    iree_hal_device_t** out_device) {
  IREE_ASSERT_ARGUMENT(base_driver);
  IREE_ASSERT_ARGUMENT(out_device);

  if (iree_string_view_is_empty(device_path)) {
    return iree_hal_cuda2_driver_create_device_by_id(
        base_driver, IREE_HAL_DEVICE_ID_DEFAULT, param_count, params,
        host_allocator, out_device);
  }

  if (iree_string_view_consume_prefix(&device_path, IREE_SV("GPU-"))) {
    // UUID as returned by cuDeviceGetUuid.
    CUuuid device_uuid;
    if (!iree_string_view_parse_hex_bytes(device_path,
                                          IREE_ARRAYSIZE(device_uuid.bytes),
                                          (uint8_t*)device_uuid.bytes)) {
      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                              "invalid GPU UUID: '%.*s'", (int)device_path.size,
                              device_path.data);
    }
    return iree_hal_cuda2_driver_create_device_by_uuid(
        base_driver, driver_name, &device_uuid, param_count, params,
        host_allocator, out_device);
  }

  // Try to parse as a device index.
  int device_index = 0;
  if (iree_string_view_atoi_int32(device_path, &device_index)) {
    return iree_hal_cuda2_driver_create_device_by_index(
        base_driver, driver_name, device_index, param_count, params,
        host_allocator, out_device);
  }

  return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported device path");
}

static const iree_hal_driver_vtable_t iree_hal_cuda2_driver_vtable = {
    .destroy = iree_hal_cuda2_driver_destroy,
    .query_available_devices = iree_hal_cuda2_driver_query_available_devices,
    .dump_device_info = iree_hal_cuda2_driver_dump_device_info,
    .create_device_by_id = iree_hal_cuda2_driver_create_device_by_id,
    .create_device_by_path = iree_hal_cuda2_driver_create_device_by_path,
};
