| /* |
| * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. |
| * |
| * NOTICE TO LICENSEE: |
| * |
| * This source code and/or documentation ("Licensed Deliverables") are |
| * subject to NVIDIA intellectual property rights under U.S. and |
| * international Copyright laws. |
| * |
| * These Licensed Deliverables contained herein is PROPRIETARY and |
| * CONFIDENTIAL to NVIDIA and is being provided under the terms and |
| * conditions of a form of NVIDIA software license agreement by and |
| * between NVIDIA and Licensee ("License Agreement") or electronically |
| * accepted by Licensee. Notwithstanding any terms or conditions to |
| * the contrary in the License Agreement, reproduction or disclosure |
| * of the Licensed Deliverables to any third party without the express |
| * written consent of NVIDIA is prohibited. |
| * |
| * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE |
| * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE |
| * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS |
| * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. |
| * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED |
| * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, |
| * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. |
| * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE |
| * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY |
| * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY |
| * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
| * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
| * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
| * OF THESE LICENSED DELIVERABLES. |
| * |
| * U.S. Government End Users. These Licensed Deliverables are a |
| * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT |
| * 1995), consisting of "commercial computer software" and "commercial |
| * computer software documentation" as such terms are used in 48 |
| * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government |
| * only as a commercial end item. Consistent with 48 C.F.R.12.212 and |
| * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all |
| * U.S. Government End Users acquire the Licensed Deliverables with |
| * only those rights set forth herein. |
| * |
| * Any use of the Licensed Deliverables in individual and commercial |
| * software must include, in the user documentation and internal |
| * comments to the code, the above Disclaimer and U.S. Government End |
| * Users Notice. |
| */ |
| |
| #ifndef __cuda_cuda_h__ |
| #define __cuda_cuda_h__ |
| |
| #include <stdlib.h> |
| #ifdef _MSC_VER |
| typedef unsigned __int32 cuuint32_t; /**< Unsigned 32-bit integer, spelled with the compiler's __int32 keyword when _MSC_VER is defined */ |
| typedef unsigned __int64 cuuint64_t; /**< Unsigned 64-bit integer, spelled with the compiler's __int64 keyword when _MSC_VER is defined */ |
| #else |
| #include <stdint.h> |
| typedef uint32_t cuuint32_t; /**< Unsigned 32-bit integer, taken from <stdint.h> */ |
| typedef uint64_t cuuint64_t; /**< Unsigned 64-bit integer, taken from <stdint.h> */ |
| #endif |
| |
| #if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) |
| #define __CUDA_DEPRECATED /* expands to nothing: no deprecation marker when any of the three macros above is defined */ |
| #elif defined(_MSC_VER) |
| #define __CUDA_DEPRECATED __declspec(deprecated) /* deprecation marker in __declspec form, used when _MSC_VER is defined */ |
| #elif defined(__GNUC__) |
| #define __CUDA_DEPRECATED __attribute__((deprecated)) /* deprecation marker in __attribute__ form, used when __GNUC__ is defined */ |
| #else |
| #define __CUDA_DEPRECATED /* fallback: expands to nothing when neither _MSC_VER nor __GNUC__ is defined */ |
| #endif |
| |
| #if defined(CUDA_FORCE_API_VERSION) /* defining CUDA_FORCE_API_VERSION stops compilation with the message below */ |
| #error "CUDA_FORCE_API_VERSION is no longer supported." |
| #endif |
| |
| #if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) |
| #define __CUDA_API_PER_THREAD_DEFAULT_STREAM /* marker macro; tested again further down in this header */ |
| #define __CUDA_API_PTDS(api) api ## _ptds /* token-pastes the _ptds suffix onto the given API name */ |
| #define __CUDA_API_PTSZ(api) api ## _ptsz /* token-pastes the _ptsz suffix onto the given API name */ |
| #else |
| #define __CUDA_API_PTDS(api) api /* identity: the API name is left unchanged */ |
| #define __CUDA_API_PTSZ(api) api /* identity: the API name is left unchanged */ |
| #endif |
| |
| #define cuDeviceTotalMem cuDeviceTotalMem_v2 /* from here through cuGraphInstantiate: each public API name is redirected to a version-suffixed symbol */ |
| #define cuCtxCreate cuCtxCreate_v2 |
| #define cuModuleGetGlobal cuModuleGetGlobal_v2 |
| #define cuMemGetInfo cuMemGetInfo_v2 |
| #define cuMemAlloc cuMemAlloc_v2 |
| #define cuMemAllocPitch cuMemAllocPitch_v2 |
| #define cuMemFree cuMemFree_v2 |
| #define cuMemGetAddressRange cuMemGetAddressRange_v2 |
| #define cuMemAllocHost cuMemAllocHost_v2 |
| #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 |
| #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) /* names wrapped in __CUDA_API_PTDS may additionally receive the _ptds suffix */ |
| #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) |
| #define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) |
| #define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) |
| #define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) |
| #define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) |
| #define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) |
| #define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) |
| #define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) /* names wrapped in __CUDA_API_PTSZ may additionally receive the _ptsz suffix */ |
| #define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) |
| #define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) |
| #define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) |
| #define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) |
| #define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) |
| #define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) |
| #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) |
| #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) |
| #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) |
| #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) |
| #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) |
| #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) |
| #define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) |
| #define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) |
| #define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) |
| #define cuArrayCreate cuArrayCreate_v2 |
| #define cuArrayGetDescriptor cuArrayGetDescriptor_v2 |
| #define cuArray3DCreate cuArray3DCreate_v2 |
| #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 |
| #define cuTexRefSetAddress cuTexRefSetAddress_v2 |
| #define cuTexRefGetAddress cuTexRefGetAddress_v2 |
| #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 |
| #define cuCtxDestroy cuCtxDestroy_v2 |
| #define cuCtxPopCurrent cuCtxPopCurrent_v2 |
| #define cuCtxPushCurrent cuCtxPushCurrent_v2 |
| #define cuStreamDestroy cuStreamDestroy_v2 |
| #define cuEventDestroy cuEventDestroy_v2 |
| #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 /* note: redirected to _v3, unlike the _v2 names around it */ |
| #define cuLinkCreate cuLinkCreate_v2 |
| #define cuLinkAddData cuLinkAddData_v2 |
| #define cuLinkAddFile cuLinkAddFile_v2 |
| #define cuMemHostRegister cuMemHostRegister_v2 |
| #define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 |
| #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) |
| #define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 |
| #define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 |
| #define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 |
| #define cuGraphInstantiate cuGraphInstantiate_v2 |
| |
| #if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) /* only in this mode: the unversioned entry points below also receive the _ptds / _ptsz suffix */ |
| #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) |
| #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) |
| #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) |
| #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) |
| #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) |
| #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) |
| #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) |
| |
| #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) |
| #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) |
| #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) |
| #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) |
| #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) |
| #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) |
| |
| #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) |
| #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) |
| #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) |
| #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) |
| #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) |
| #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) |
| #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) |
| #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) |
| #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) |
| #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) |
| #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) |
| #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) |
| #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) |
| #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) |
| #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) |
| #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) |
| |
| #define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32) |
| #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) |
| #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) |
| #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) |
| #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) |
| |
| #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) |
| |
| #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) |
| #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) |
| |
| #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) |
| #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) |
| #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) |
| #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) |
| #endif /* __CUDA_API_PER_THREAD_DEFAULT_STREAM */ |
| |
| /** |
| * \file cuda.h |
| * \brief Header file for the CUDA Toolkit application programming interface. |
| * |
| * \file cudaGL.h |
| * \brief Header file for the OpenGL interoperability functions of the |
| * low-level CUDA driver application programming interface. |
| * |
| * \file cudaD3D9.h |
| * \brief Header file for the Direct3D 9 interoperability functions of the |
| * low-level CUDA driver application programming interface. |
| */ |
| |
| /** |
| * \defgroup CUDA_TYPES Data types used by CUDA driver |
| * @{ |
| */ |
| |
| /** |
| * CUDA API version number |
| * |
| * NOTE(review): 11000 presumably encodes version 11.0 as 1000 * major + 10 * minor; |
| * confirm against the toolkit documentation. |
| */ |
| #define CUDA_VERSION 11000 |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| |
| /** |
| * CUDA device pointer |
| * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. |
| * It is unsigned long long when _WIN64 or __LP64__ is defined, and unsigned int on every other target. |
| */ |
| #if defined(_WIN64) || defined(__LP64__) |
| typedef unsigned long long CUdeviceptr; |
| #else |
| typedef unsigned int CUdeviceptr; |
| #endif |
| |
| typedef int CUdevice; /**< CUDA device (a plain int, unlike the pointer-typed handles below) */ |
| typedef struct CUctx_st *CUcontext; /**< CUDA context (pointer to struct CUctx_st) */ |
| typedef struct CUmod_st *CUmodule; /**< CUDA module (pointer to struct CUmod_st) */ |
| typedef struct CUfunc_st *CUfunction; /**< CUDA function (pointer to struct CUfunc_st) */ |
| typedef struct CUarray_st *CUarray; /**< CUDA array (pointer to struct CUarray_st) */ |
| typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array (pointer to struct CUmipmappedArray_st) */ |
| typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference (pointer to struct CUtexref_st) */ |
| typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference (pointer to struct CUsurfref_st) */ |
| typedef struct CUevent_st *CUevent; /**< CUDA event (pointer to struct CUevent_st) */ |
| typedef struct CUstream_st *CUstream; /**< CUDA stream (pointer to struct CUstream_st) */ |
| typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource (pointer to struct CUgraphicsResource_st) */ |
| typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */ |
| typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */ |
| typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory (pointer to struct CUextMemory_st) */ |
| typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore (pointer to struct CUextSemaphore_st) */ |
| typedef struct CUgraph_st *CUgraph; /**< CUDA graph (pointer to struct CUgraph_st) */ |
| typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node (pointer to struct CUgraphNode_st) */ |
| typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph (pointer to struct CUgraphExec_st) */ |
| |
| #ifndef CU_UUID_HAS_BEEN_DEFINED /* skip the definition if CU_UUID_HAS_BEEN_DEFINED is already set (presumably by another header that declares CUuuid) */ |
| #define CU_UUID_HAS_BEEN_DEFINED |
| typedef struct CUuuid_st { /**< CUDA definition of UUID */ |
| char bytes[16]; /**< The 16 bytes of the UUID */ |
| } CUuuid; |
| #endif |
| |
| /** |
| * CUDA IPC handle size, in bytes (used as the length of the char arrays in the IPC handle structs) |
| */ |
| #define CU_IPC_HANDLE_SIZE 64 |
| |
| /** |
| * CUDA IPC event handle |
| */ |
| typedef struct CUipcEventHandle_st { |
| char reserved[CU_IPC_HANDLE_SIZE]; /**< Handle storage, ::CU_IPC_HANDLE_SIZE bytes */ |
| } CUipcEventHandle; |
| |
| /** |
| * CUDA IPC mem handle |
| */ |
| typedef struct CUipcMemHandle_st { |
| char reserved[CU_IPC_HANDLE_SIZE]; /**< Handle storage, ::CU_IPC_HANDLE_SIZE bytes */ |
| } CUipcMemHandle; |
| |
| /** |
| * CUDA IPC mem flags |
| */ |
| typedef enum CUipcMem_flags_enum { |
| CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ |
| } CUipcMem_flags; |
| |
| |
| /** |
| * CUDA Mem Attach Flags |
| * |
| * Each flag occupies its own bit (0x1, 0x2, 0x4). |
| */ |
| typedef enum CUmemAttach_flags_enum { |
| CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ |
| CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ |
| CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ |
| } CUmemAttach_flags; |
| |
| /** |
| * Context creation flags |
| * |
| * The scheduling flags are covered by ::CU_CTX_SCHED_MASK; ::CU_CTX_FLAGS_MASK |
| * covers every flag bit defined in this enum. |
| */ |
| typedef enum CUctx_flags_enum { |
| CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ |
| CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ |
| CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ |
| CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ |
| CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling |
| * \deprecated This flag was deprecated as of CUDA 4.0 |
| * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ |
| CU_CTX_SCHED_MASK = 0x07, /**< Mask of the scheduling bits: the OR of 0x01, 0x02 and 0x04 */ |
| CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ |
| CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ |
| CU_CTX_FLAGS_MASK = 0x1f /**< Mask of all flag bits above: the OR of ::CU_CTX_SCHED_MASK, 0x08 and 0x10 */ |
| } CUctx_flags; |
| |
| /** |
| * Stream creation flags |
| */ |
| typedef enum CUstream_flags_enum { |
| CU_STREAM_DEFAULT = 0x0, /**< Default stream flag (no flag bits set) */ |
| CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ |
| } CUstream_flags; |
| |
| /** |
| * Legacy stream handle |
| * |
| * Stream handle that can be passed as a CUstream to use an implicit stream |
| * with legacy synchronization behavior. |
| * |
| * The handle is the constant 0x1 cast to ::CUstream. |
| * |
| * See details of the \link_sync_behavior |
| */ |
| #define CU_STREAM_LEGACY ((CUstream)0x1) |
| |
| /** |
| * Per-thread stream handle |
| * |
| * Stream handle that can be passed as a CUstream to use an implicit stream |
| * with per-thread synchronization behavior. |
| * |
| * The handle is the constant 0x2 cast to ::CUstream. |
| * |
| * See details of the \link_sync_behavior |
| */ |
| #define CU_STREAM_PER_THREAD ((CUstream)0x2) |
| |
| /** |
| * Event creation flags |
| */ |
| typedef enum CUevent_flags_enum { |
| CU_EVENT_DEFAULT = 0x0, /**< Default event flag (no flag bits set) */ |
| CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ |
| CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ |
| CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. ::CU_EVENT_DISABLE_TIMING must be set */ |
| } CUevent_flags; |
| |
| /** |
| * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 |
| * |
| * The low values 0x0-0x3 select the comparison; ::CU_STREAM_WAIT_VALUE_FLUSH |
| * is a separate bit (1<<30). |
| */ |
| typedef enum CUstreamWaitValue_flags_enum { |
| CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit |
| values). Note this is a cyclic comparison which ignores wraparound. |
| (Default behavior.) */ |
| CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ |
| CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ |
| CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be |
| queried with ::cuDeviceGetAttribute() and |
| ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ |
| CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This |
| means that, if a remote write operation is guaranteed to have reached the |
| device before the wait can be satisfied, that write is guaranteed to be |
| visible to downstream device work. The device is permitted to reorder |
| remote writes internally. For example, this flag would be required if |
| two remote writes arrive in a defined order, the wait is satisfied by the |
| second write, and downstream work needs to observe the first write. |
| Support for this operation is restricted to selected platforms and can be |
| queried with ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.*/ |
| } CUstreamWaitValue_flags; |
| |
| /** |
| * Flags for ::cuStreamWriteValue32 |
| * |
| * NOTE(review): ::cuStreamWriteValue64 presumably accepts the same flags - confirm. |
| */ |
| typedef enum CUstreamWriteValue_flags_enum { |
| CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ |
| CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued |
| before it, as a performance optimization. Normally, |
| ::cuStreamWriteValue32 will provide a memory fence before the |
| write, which has similar semantics to |
| __threadfence_system() but is scoped to the stream |
| rather than a CUDA thread. */ |
| } CUstreamWriteValue_flags; |
| |
| /** |
| * Operations for ::cuStreamBatchMemOp |
| * |
| * The enumerators are not listed in numeric order: |
| * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES (3) appears last, after 4 and 5. |
| */ |
| typedef enum CUstreamBatchMemOpType_enum { |
| CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ |
| CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ |
| CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ |
| CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ |
| CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a |
| standalone operation. */ |
| } CUstreamBatchMemOpType; |
| |
| /** |
| * Per-operation parameters for ::cuStreamBatchMemOp |
| * |
| * Every member struct starts with its own \p operation field, which overlaps |
| * the top-level \p operation member of this union. |
| */ |
| typedef union CUstreamBatchMemOpParams_union { |
| CUstreamBatchMemOpType operation; /**< Operation type; shares storage with the first field of each struct below */ |
| struct CUstreamMemOpWaitValueParams_st { |
| CUstreamBatchMemOpType operation; |
| CUdeviceptr address; |
| union { |
| cuuint32_t value; /**< 32-bit operand; overlaps \p value64 */ |
| cuuint64_t value64; /**< 64-bit operand; overlaps \p value */ |
| }; |
| unsigned int flags; /**< Presumably a ::CUstreamWaitValue_flags value - confirm */ |
| CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ |
| } waitValue; /**< Parameters for a wait-value operation */ |
| struct CUstreamMemOpWriteValueParams_st { |
| CUstreamBatchMemOpType operation; |
| CUdeviceptr address; |
| union { |
| cuuint32_t value; /**< 32-bit operand; overlaps \p value64 */ |
| cuuint64_t value64; /**< 64-bit operand; overlaps \p value */ |
| }; |
| unsigned int flags; /**< Presumably a ::CUstreamWriteValue_flags value - confirm */ |
| CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ |
| } writeValue; /**< Parameters for a write-value operation */ |
| struct CUstreamMemOpFlushRemoteWritesParams_st { |
| CUstreamBatchMemOpType operation; |
| unsigned int flags; |
| } flushRemoteWrites; /**< Parameters for a flush-remote-writes operation */ |
| cuuint64_t pad[6]; /**< Six 64-bit words (48 bytes) of padding; gives the union a fixed minimum size */ |
| } CUstreamBatchMemOpParams; |
| |
| /** |
| * Occupancy calculator flags |
| */ |
| typedef enum CUoccupancy_flags_enum { |
| CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ |
| CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ |
| } CUoccupancy_flags; |
| |
| /** |
| * Array formats |
| * |
| * The values are not contiguous: 0x01-0x03 (unsigned), 0x08-0x0a (signed), |
| * 0x10 and 0x20 (floating point). |
| */ |
| typedef enum CUarray_format_enum { |
| CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ |
| CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ |
| CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ |
| CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ |
| CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ |
| CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ |
| CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ |
| CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ |
| } CUarray_format; |
| |
| /** |
| * Texture reference addressing modes (consecutive values 0 through 3) |
| */ |
| typedef enum CUaddress_mode_enum { |
| CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ |
| CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ |
| CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ |
| CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ |
| } CUaddress_mode; |
| |
| /** |
| * Texture reference filtering modes (values 0 and 1) |
| */ |
| typedef enum CUfilter_mode_enum { |
| CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ |
| CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ |
| } CUfilter_mode; |
| |
| /** |
| * Device properties |
| * |
| * Some values appear twice because a deprecated name is kept as an alias of |
| * its replacement (values 8, 12 and 27 through 29). |
| */ |
| typedef enum CUdevice_attribute_enum { |
| CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ |
| CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ |
| CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ |
| CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ |
| CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ |
| CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ |
| CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ |
| CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ |
| CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ |
| CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ |
| CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ |
| CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ |
| CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ |
| CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ |
| CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ |
| CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ |
| CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ |
| CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ |
| CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ |
| CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ |
| CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ |
| CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ |
| CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ |
| CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ |
| CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ |
| CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ |
| CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ |
| CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ |
| CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ |
| CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ |
| CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ |
| CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ |
| CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ |
| CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ |
| CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ |
| CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ |
| CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ |
| CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ |
| CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ |
| CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ |
| CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ |
| CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ |
| CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ |
| CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ |
| CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ |
| CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ |
| CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ |
| CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ |
| CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ |
| CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ |
| CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ |
| CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ |
| CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ |
| CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ |
| CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ |
| CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ |
| CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ |
| CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ |
| CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ |
| CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ |
| CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ |
| CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ |
| CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ |
| CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ |
| CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual address management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ |
| CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ |
| CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ |
| CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ |
| CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ |
| CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ |
| CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Device's maximum L2 persisting lines capacity setting in bytes */ |
| CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< The maximum value of CUaccessPolicyWindow::num_bytes. */ |
| CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ |
| CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ |
| CU_DEVICE_ATTRIBUTE_MAX /**< No explicit value: takes the value after the preceding enumerator (111), i.e. 112 */ |
| } CUdevice_attribute; |
| |
| /** |
| * Device properties (legacy structure) |
| */ |
| typedef struct CUdevprop_st { |
| /** Upper limit on the number of threads in one block */ |
| int maxThreadsPerBlock; |
| /** Upper limit on the size of each block dimension */ |
| int maxThreadsDim[3]; |
| /** Upper limit on the size of each grid dimension */ |
| int maxGridSize[3]; |
| /** Bytes of shared memory available to one block */ |
| int sharedMemPerBlock; |
| /** Bytes of constant memory available on the device */ |
| int totalConstantMemory; |
| /** Warp size, counted in threads */ |
| int SIMDWidth; |
| /** Largest pitch, in bytes, that memory copies allow */ |
| int memPitch; |
| /** Number of 32-bit registers available to one block */ |
| int regsPerBlock; |
| /** Clock frequency, expressed in kilohertz */ |
| int clockRate; |
| /** Alignment that textures require */ |
| int textureAlign; |
| } CUdevprop; |
| |
| /** |
| * Attributes describing a pointer |
| */ |
| typedef enum CUpointer_attribute_enum { |
| /** ::CUcontext on which the pointer was allocated or registered */ |
| CU_POINTER_ATTRIBUTE_CONTEXT = 0x01, |
| /** ::CUmemorytype describing the pointer's physical location */ |
| CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 0x02, |
| /** Address at which the pointer's memory may be accessed on the device */ |
| CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 0x03, |
| /** Address at which the pointer's memory may be accessed on the host */ |
| CU_POINTER_ATTRIBUTE_HOST_POINTER = 0x04, |
| /** Pair of tokens for use with the nv-p2p.h Linux kernel interface */ |
| CU_POINTER_ATTRIBUTE_P2P_TOKENS = 0x05, |
| /** Synchronize every synchronous memory operation initiated on this region */ |
| CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 0x06, |
| /** Process-wide unique ID for an allocated memory region */ |
| CU_POINTER_ATTRIBUTE_BUFFER_ID = 0x07, |
| /** Indicates whether the pointer points to managed memory */ |
| CU_POINTER_ATTRIBUTE_IS_MANAGED = 0x08, |
| /** Ordinal of a device on which the pointer was allocated or registered */ |
| CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 0x09, |
| /** 1 if the pointer maps to an allocation suitable for ::cudaIpcGetMemHandle, 0 otherwise */ |
| CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 0x0A, |
| /** Starting address for the requested pointer */ |
| CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 0x0B, |
| /** Size of the address range for the requested pointer */ |
| CU_POINTER_ATTRIBUTE_RANGE_SIZE = 0x0C, |
| /** 1 if the pointer lies in a valid address range that is mapped to a backing allocation, 0 otherwise */ |
| CU_POINTER_ATTRIBUTE_MAPPED = 0x0D, |
| /** Bitmask of the ::CUmemAllocationHandleType values allowed for this allocation */ |
| CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 0x0E, |
| /** 1 if the memory the pointer references can be used with the GPUDirect RDMA API */ |
| CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 0x0F |
| } CUpointer_attribute; |
| |
| /** |
| * Attributes describing a function |
| */ |
| typedef enum CUfunction_attribute_enum { |
| /** |
| * Largest number of threads per block with which the function can be |
| * launched; a launch beyond this number would fail. The number depends on |
| * the function as well as on the device on which the function is |
| * currently loaded. |
| */ |
| CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0x00, |
| |
| /** |
| * Bytes of statically-allocated shared memory the function requires. |
| * Dynamically-allocated shared memory requested by the user at runtime |
| * is not included. |
| */ |
| CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 0x01, |
| |
| /** |
| * Bytes of user-allocated constant memory the function requires. |
| */ |
| CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 0x02, |
| |
| /** |
| * Bytes of local memory used by each thread of the function. |
| */ |
| CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 0x03, |
| |
| /** |
| * Number of registers used by each thread of the function. |
| */ |
| CU_FUNC_ATTRIBUTE_NUM_REGS = 0x04, |
| |
| /** |
| * PTX virtual architecture version the function was compiled for, |
| * encoded as major PTX version * 10 + minor PTX version (a PTX |
| * version 1.3 function returns 13). The undefined value 0 may be |
| * returned for cubins compiled prior to CUDA 3.0. |
| */ |
| CU_FUNC_ATTRIBUTE_PTX_VERSION = 0x05, |
| |
| /** |
| * Binary architecture version the function was compiled for, encoded |
| * as major binary version * 10 + minor binary version (a binary |
| * version 1.3 function returns 13). Legacy cubins without a |
| * properly-encoded binary architecture version return 10. |
| */ |
| CU_FUNC_ATTRIBUTE_BINARY_VERSION = 0x06, |
| |
| /** |
| * Indicates whether the function was compiled with the user specified |
| * option "-Xptxas --dlcm=ca" set. |
| */ |
| CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 0x07, |
| |
| /** |
| * Largest size, in bytes, of dynamically-allocated shared memory the |
| * function can use. A launch whose user-specified dynamic shared memory |
| * size exceeds this value will fail. |
| * See ::cuFuncSetAttribute |
| */ |
| CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 0x08, |
| |
| /** |
| * Shared memory carveout preference, in percent of the total shared |
| * memory, for devices where the L1 cache and shared memory use the same |
| * hardware resources. |
| * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. |
| * The value is only a hint; the driver can choose a different ratio if |
| * that is required to execute the function. |
| * See ::cuFuncSetAttribute |
| */ |
| CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 0x09, |
| |
| /* Last enumerator; its value follows the attribute above */ |
| CU_FUNC_ATTRIBUTE_MAX |
| } CUfunction_attribute; |
| |
| /** |
| * Cache configurations for a function |
| */ |
| typedef enum CUfunc_cache_enum { |
| /** No preference between shared memory and L1 (default) */ |
| CU_FUNC_CACHE_PREFER_NONE = 0, |
| /** Prefer a larger shared memory and a smaller L1 cache */ |
| CU_FUNC_CACHE_PREFER_SHARED = 1, |
| /** Prefer a larger L1 cache and a smaller shared memory */ |
| CU_FUNC_CACHE_PREFER_L1 = 2, |
| /** Prefer an equal sized L1 cache and shared memory */ |
| CU_FUNC_CACHE_PREFER_EQUAL = 3 |
| } CUfunc_cache; |
| |
| /** |
| * Configurations of the shared memory bank size |
| */ |
| typedef enum CUsharedconfig_enum { |
| /** Set the default shared memory bank size */ |
| CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0, |
| /** Set the shared memory bank width to four bytes */ |
| CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 1, |
| /** Set the shared memory bank width to eight bytes */ |
| CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 2 |
| } CUsharedconfig; |
| |
| /** |
| * Configurations of the shared memory carveout; these values may be passed |
| * to ::cuFuncSetAttribute |
| */ |
| typedef enum CUshared_carveout_enum { |
| /** No preference between shared memory and L1 (default) */ |
| CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, |
| /** Prefer the maximum available shared memory and the minimum L1 cache */ |
| CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, |
| /** Prefer the maximum available L1 cache and the minimum shared memory */ |
| CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 |
| } CUshared_carveout; |
| |
| /** |
| * Types of memory |
| */ |
| typedef enum CUmemorytype_enum { |
| /** Memory on the host */ |
| CU_MEMORYTYPE_HOST = 1, |
| /** Memory on the device */ |
| CU_MEMORYTYPE_DEVICE = 2, |
| /** Array memory */ |
| CU_MEMORYTYPE_ARRAY = 3, |
| /** Unified memory (device or host) */ |
| CU_MEMORYTYPE_UNIFIED = 4 |
| } CUmemorytype; |
| |
| /** |
| * Compute modes of a device |
| */ |
| typedef enum CUcomputemode_enum { |
| /** Default compute mode: multiple contexts are allowed per device */ |
| CU_COMPUTEMODE_DEFAULT = 0x00, |
| /** Compute-prohibited mode: no contexts can be created on this device at this time */ |
| CU_COMPUTEMODE_PROHIBITED = 0x02, |
| /** Compute-exclusive-process mode: only one context, used by a single process, can be present on this device at a time */ |
| CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 0x03 |
| } CUcomputemode; |
| |
| /** |
| * Values for memory advice |
| */ |
| typedef enum CUmem_advise_enum { |
| /** The data will mostly be read and only occasionally be written to */ |
| CU_MEM_ADVISE_SET_READ_MOSTLY = 0x01, |
| /** Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ |
| CU_MEM_ADVISE_UNSET_READ_MOSTLY = 0x02, |
| /** Set the specified device as the preferred location for the data */ |
| CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 0x03, |
| /** Clear the preferred location for the data */ |
| CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 0x04, |
| /** The data will be accessed by the specified device, so prevent page faults as much as possible */ |
| CU_MEM_ADVISE_SET_ACCESSED_BY = 0x05, |
| /** Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ |
| CU_MEM_ADVISE_UNSET_ACCESSED_BY = 0x06 |
| } CUmem_advise; |
| |
| typedef enum CUmem_range_attribute_enum { |
| /** Whether the range will mostly be read and only occasionally be written to */ |
| CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 0x01, |
| /** Preferred location of the range */ |
| CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 0x02, |
| /** The memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for the specified device */ |
| CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 0x03, |
| /** Last location to which the range was prefetched */ |
| CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 0x04 |
| } CUmem_range_attribute; |
| |
| /** |
| * Online compiler and linker options |
| * |
| * Only ::CU_JIT_MAX_REGISTERS carries an explicit value (0); every later |
| * enumerator takes the next consecutive value, so the numeric values are |
| * determined by declaration order. |
| */ |
| typedef enum CUjit_option_enum |
| { |
| /** |
| * Max number of registers that a thread may use.\n |
| * Option type: unsigned int\n |
| * Applies to: compiler only |
| */ |
| CU_JIT_MAX_REGISTERS = 0, |
| |
| /** |
| * IN: Specifies minimum number of threads per block to target compilation |
| * for\n |
| * OUT: Returns the number of threads the compiler actually targeted. |
| * This restricts the resource utilization of the compiler (e.g. max |
| * registers) such that a block with the given number of threads should be |
| * able to launch based on register limitations. Note, this option does not |
| * currently take into account any other resource limitations, such as |
| * shared memory utilization.\n |
| * Cannot be combined with ::CU_JIT_TARGET.\n |
| * Option type: unsigned int\n |
| * Applies to: compiler only |
| */ |
| CU_JIT_THREADS_PER_BLOCK, |
| |
| /** |
| * Overwrites the option value with the total wall clock time, in |
| * milliseconds, spent in the compiler and linker\n |
| * Option type: float\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_WALL_TIME, |
| |
| /** |
| * Pointer to a buffer in which to print any log messages |
| * that are informational in nature (the buffer size is specified via |
| * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n |
| * Option type: char *\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_INFO_LOG_BUFFER, |
| |
| /** |
| * IN: Log buffer size in bytes. Log messages will be capped at this size |
| * (including null terminator)\n |
| * OUT: Amount of log buffer filled with messages\n |
| * Option type: unsigned int\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, |
| |
| /** |
| * Pointer to a buffer in which to print any log messages that |
| * reflect errors (the buffer size is specified via option |
| * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n |
| * Option type: char *\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_ERROR_LOG_BUFFER, |
| |
| /** |
| * IN: Log buffer size in bytes. Log messages will be capped at this size |
| * (including null terminator)\n |
| * OUT: Amount of log buffer filled with messages\n |
| * Option type: unsigned int\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, |
| |
| /** |
| * Level of optimizations to apply to generated code (0 - 4), with 4 |
| * being the default and highest level of optimizations.\n |
| * Option type: unsigned int\n |
| * Applies to: compiler only |
| */ |
| CU_JIT_OPTIMIZATION_LEVEL, |
| |
| /** |
| * No option value required. Determines the target based on the current |
| * attached context (default)\n |
| * Option type: No option value needed\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_TARGET_FROM_CUCONTEXT, |
| |
| /** |
| * Target is chosen based on supplied ::CUjit_target. Cannot be |
| * combined with ::CU_JIT_THREADS_PER_BLOCK.\n |
| * Option type: unsigned int for enumerated type ::CUjit_target\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_TARGET, |
| |
| /** |
| * Specifies choice of fallback strategy if matching cubin is not found. |
| * Choice is based on supplied ::CUjit_fallback. This option cannot be |
| * used with cuLink* APIs as the linker requires exact matches.\n |
| * Option type: unsigned int for enumerated type ::CUjit_fallback\n |
| * Applies to: compiler only |
| */ |
| CU_JIT_FALLBACK_STRATEGY, |
| |
| /** |
| * Specifies whether to create debug information in output (-g) |
| * (0: false, default)\n |
| * Option type: int\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_GENERATE_DEBUG_INFO, |
| |
| /** |
| * Generate verbose log messages (0: false, default)\n |
| * Option type: int\n |
| * Applies to: compiler and linker |
| */ |
| CU_JIT_LOG_VERBOSE, |
| |
| /** |
| * Generate line number information (-lineinfo) (0: false, default)\n |
| * Option type: int\n |
| * Applies to: compiler only |
| */ |
| CU_JIT_GENERATE_LINE_INFO, |
| |
| /** |
| * Specifies whether to enable caching explicitly (-dlcm) \n |
| * Choice is based on supplied ::CUjit_cacheMode_enum.\n |
| * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n |
| * Applies to: compiler only |
| */ |
| CU_JIT_CACHE_MODE, |
| |
| /** |
| * The JIT options below are used for internal purposes only in this version of CUDA |
| */ |
| CU_JIT_NEW_SM3X_OPT, |
| CU_JIT_FAST_COMPILE, |
| |
| /** |
| * Array of device symbol names that will be relocated to the corresponding |
| * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n |
| * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n |
| * When loading a device module, the driver will relocate all encountered |
| * unresolved symbols to the host addresses.\n |
| * It is only allowed to register symbols that correspond to unresolved |
| * global variables.\n |
| * It is illegal to register the same device symbol at multiple addresses.\n |
| * Option type: const char **\n |
| * Applies to: dynamic linker only |
| */ |
| CU_JIT_GLOBAL_SYMBOL_NAMES, |
| |
| /** |
| * Array of host addresses that will be used to relocate corresponding |
| * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n |
| * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n |
| * Option type: void **\n |
| * Applies to: dynamic linker only |
| */ |
| CU_JIT_GLOBAL_SYMBOL_ADDRESSES, |
| |
| /** |
| * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and |
| * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n |
| * Option type: unsigned int\n |
| * Applies to: dynamic linker only |
| */ |
| CU_JIT_GLOBAL_SYMBOL_COUNT, |
| |
| CU_JIT_NUM_OPTIONS |
| |
| } CUjit_option; |
| |
| /** |
| * Targets for online compilation |
| */ |
| typedef enum CUjit_target_enum { |
| /** Compute device class 2.0 */ |
| CU_TARGET_COMPUTE_20 = 20, |
| /** Compute device class 2.1 */ |
| CU_TARGET_COMPUTE_21 = 21, |
| /** Compute device class 3.0 */ |
| CU_TARGET_COMPUTE_30 = 30, |
| /** Compute device class 3.2 */ |
| CU_TARGET_COMPUTE_32 = 32, |
| /** Compute device class 3.5 */ |
| CU_TARGET_COMPUTE_35 = 35, |
| /** Compute device class 3.7 */ |
| CU_TARGET_COMPUTE_37 = 37, |
| /** Compute device class 5.0 */ |
| CU_TARGET_COMPUTE_50 = 50, |
| /** Compute device class 5.2 */ |
| CU_TARGET_COMPUTE_52 = 52, |
| /** Compute device class 5.3 */ |
| CU_TARGET_COMPUTE_53 = 53, |
| /** Compute device class 6.0 */ |
| CU_TARGET_COMPUTE_60 = 60, |
| /** Compute device class 6.1 */ |
| CU_TARGET_COMPUTE_61 = 61, |
| /** Compute device class 6.2 */ |
| CU_TARGET_COMPUTE_62 = 62, |
| /** Compute device class 7.0 */ |
| CU_TARGET_COMPUTE_70 = 70, |
| /** Compute device class 7.2 */ |
| CU_TARGET_COMPUTE_72 = 72, |
| /** Compute device class 7.5 */ |
| CU_TARGET_COMPUTE_75 = 75, |
| /** Compute device class 8.0 */ |
| CU_TARGET_COMPUTE_80 = 80 |
| } CUjit_target; |
| |
| /** |
| * Fallback strategies for cubin matching |
| */ |
| typedef enum CUjit_fallback_enum { |
| /** Prefer to compile ptx when no exact binary match is found */ |
| CU_PREFER_PTX = 0, |
| /** Prefer to fall back to compatible binary code when no exact match is found */ |
| CU_PREFER_BINARY = 1 |
| } CUjit_fallback; |
| |
| /** |
| * Caching modes selectable for dlcm |
| */ |
| typedef enum CUjit_cacheMode_enum { |
| /** Compile without specifying any -dlcm flag */ |
| CU_JIT_CACHE_OPTION_NONE = 0, |
| /** Compile with the L1 cache disabled */ |
| CU_JIT_CACHE_OPTION_CG = 1, |
| /** Compile with the L1 cache enabled */ |
| CU_JIT_CACHE_OPTION_CA = 2 |
| } CUjit_cacheMode; |
| |
| /** |
| * Formats of device code |
| */ |
| typedef enum CUjitInputType_enum { |
| /** |
| * Device code compiled for a specific device class\n |
| * Applicable options: none |
| */ |
| CU_JIT_INPUT_CUBIN = 0, |
| |
| /** |
| * Source code in PTX\n |
| * Applicable options: PTX compiler options |
| */ |
| CU_JIT_INPUT_PTX = 1, |
| |
| /** |
| * Bundle holding multiple cubins and/or PTX of some device code\n |
| * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY |
| */ |
| CU_JIT_INPUT_FATBINARY = 2, |
| |
| /** |
| * Host object that embeds device code\n |
| * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY |
| */ |
| CU_JIT_INPUT_OBJECT = 3, |
| |
| /** |
| * Archive of host objects that embed device code\n |
| * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY |
| */ |
| CU_JIT_INPUT_LIBRARY = 4, |
| |
| /* Last enumerator; its value follows ::CU_JIT_INPUT_LIBRARY */ |
| CU_JIT_NUM_INPUT_TYPES |
| } CUjitInputType; |
| |
| /* Handle type: pointer to struct CUlinkState_st, which is not defined in this part of the file */ |
| typedef struct CUlinkState_st * CUlinkState; |
| |
| /** |
| * Flags used when registering a graphics resource. |
| * Every non-zero flag occupies its own bit. |
| */ |
| typedef enum CUgraphicsRegisterFlags_enum { |
| CU_GRAPHICS_REGISTER_FLAGS_NONE = 0, |
| CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 1 << 0, |
| CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 1 << 1, |
| CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 1 << 2, |
| CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 1 << 3 |
| } CUgraphicsRegisterFlags; |
| |
| /** |
| * Flags used when mapping and unmapping interop resources |
| */ |
| typedef enum CUgraphicsMapResourceFlags_enum { |
| CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0, |
| CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 1, |
| CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 2 |
| } CUgraphicsMapResourceFlags; |
| |
| /** |
| * Indices into an array that select the faces of a cube |
| */ |
| typedef enum CUarray_cubemap_face_enum { |
| /** Cubemap face on positive X */ |
| CU_CUBEMAP_FACE_POSITIVE_X = 0, |
| /** Cubemap face on negative X */ |
| CU_CUBEMAP_FACE_NEGATIVE_X = 1, |
| /** Cubemap face on positive Y */ |
| CU_CUBEMAP_FACE_POSITIVE_Y = 2, |
| /** Cubemap face on negative Y */ |
| CU_CUBEMAP_FACE_NEGATIVE_Y = 3, |
| /** Cubemap face on positive Z */ |
| CU_CUBEMAP_FACE_POSITIVE_Z = 4, |
| /** Cubemap face on negative Z */ |
| CU_CUBEMAP_FACE_NEGATIVE_Z = 5 |
| } CUarray_cubemap_face; |
| |
| /** |
| * Identifiers of limits |
| */ |
| typedef enum CUlimit_enum { |
| /** Stack size of a GPU thread */ |
| CU_LIMIT_STACK_SIZE = 0, |
| /** Size of the GPU printf FIFO */ |
| CU_LIMIT_PRINTF_FIFO_SIZE = 1, |
| /** Size of the GPU malloc heap */ |
| CU_LIMIT_MALLOC_HEAP_SIZE = 2, |
| /** Launch synchronize depth of the GPU device runtime */ |
| CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 3, |
| /** Pending launch count of the GPU device runtime */ |
| CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 4, |
| /** Value between 0 and 128 indicating the maximum fetch granularity of L2 (in Bytes). This is a hint */ |
| CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 5, |
| /** Size in bytes of the L2 persisting lines cache */ |
| CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 6, |
| /* Last enumerator; its value follows ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE */ |
| CU_LIMIT_MAX |
| } CUlimit; |
| |
| /** |
| * Types of resources |
| */ |
| typedef enum CUresourcetype_enum { |
| /** Array resource */ |
| CU_RESOURCE_TYPE_ARRAY = 0, |
| /** Mipmapped array resource */ |
| CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 1, |
| /** Linear resource */ |
| CU_RESOURCE_TYPE_LINEAR = 2, |
| /** Pitch 2D resource */ |
| CU_RESOURCE_TYPE_PITCH2D = 3 |
| } CUresourcetype; |
| |
| /* CUDA_CB expands to __stdcall when _WIN32 is defined and to nothing otherwise */ |
| #if !defined(_WIN32) |
| #define CUDA_CB |
| #else |
| #define CUDA_CB __stdcall |
| #endif |
| |
| /** |
| * Pointer type for a CUDA host function, declared with the CUDA_CB qualifier |
| * \param userData Value handed to the function as its argument |
| */ |
| typedef void (CUDA_CB * CUhostFn)(void * userData); |
| |
| /** |
| * Performance hint used for the hitProp and missProp members of ::CUaccessPolicyWindow |
| */ |
| typedef enum CUaccessProperty_enum { |
| /** Normal cache persistence. */ |
| CU_ACCESS_PROPERTY_NORMAL = 0x00, |
| /** Streaming access is less likely to persist in cache. */ |
| CU_ACCESS_PROPERTY_STREAMING = 0x01, |
| /** Persisting access is more likely to persist in cache. */ |
| CU_ACCESS_PROPERTY_PERSISTING = 0x02 |
| } CUaccessProperty; |
| |
| /** |
| * Access policy for a window, i.e. a contiguous extent of memory that begins |
| * at base_ptr and ends at base_ptr + num_bytes. |
| * CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE limits num_bytes. |
| * The window is partitioned into many segments, and segments are assigned such that: |
| * sum of "hit segments" / window == approx. ratio. |
| * sum of "miss segments" / window == approx 1-ratio. |
| * Segments and ratio specifications are fitted to the capabilities of |
| * the architecture. |
| * An access in a hit segment applies the hitProp access policy. |
| * An access in a miss segment applies the missProp access policy. |
| */ |
| typedef struct CUaccessPolicyWindow_st { |
| /** Starting address of the access policy window. The CUDA driver may align it. */ |
| void *base_ptr; |
| /** Size in bytes of the window policy. The CUDA driver may restrict the maximum size and alignment. */ |
| size_t num_bytes; |
| /** Percentage of lines assigned hitProp; the remaining lines are assigned missProp. */ |
| float hitRatio; |
| /** ::CUaccessProperty applied on a hit. */ |
| CUaccessProperty hitProp; |
| /** ::CUaccessProperty applied on a miss. Must be either NORMAL or STREAMING */ |
| CUaccessProperty missProp; |
| } CUaccessPolicyWindow; |
| |
| /** |
| * Parameters of a GPU kernel node |
| */ |
| typedef struct CUDA_KERNEL_NODE_PARAMS_st { |
| /** Kernel that is launched */ |
| CUfunction func; |
| /** Grid width, in blocks */ |
| unsigned int gridDimX; |
| /** Grid height, in blocks */ |
| unsigned int gridDimY; |
| /** Grid depth, in blocks */ |
| unsigned int gridDimZ; |
| /** Thread block X dimension */ |
| unsigned int blockDimX; |
| /** Thread block Y dimension */ |
| unsigned int blockDimY; |
| /** Thread block Z dimension */ |
| unsigned int blockDimZ; |
| /** Bytes of dynamic shared memory per thread block */ |
| unsigned int sharedMemBytes; |
| /** Array of pointers to the kernel parameters */ |
| void **kernelParams; |
| /** Extra options */ |
| void **extra; |
| } CUDA_KERNEL_NODE_PARAMS; |
| |
| /** |
| * Parameters of a memset node |
| */ |
| typedef struct CUDA_MEMSET_NODE_PARAMS_st { |
| /** Device pointer of the destination */ |
| CUdeviceptr dst; |
| /** Pitch of the destination device pointer. Unused if height is 1 */ |
| size_t pitch; |
| /** Value that is set */ |
| unsigned int value; |
| /** Size in bytes of each element. Must be 1, 2, or 4. */ |
| unsigned int elementSize; |
| /** Width of the row, in bytes */ |
| size_t width; |
| /** Number of rows */ |
| size_t height; |
| } CUDA_MEMSET_NODE_PARAMS; |
| |
| /** |
| * Parameters of a host node |
| */ |
| typedef struct CUDA_HOST_NODE_PARAMS_st { |
| /** Function that is called when the node executes */ |
| CUhostFn fn; |
| /** Argument that is passed to the function */ |
| void *userData; |
| } CUDA_HOST_NODE_PARAMS; |
| |
| /** |
| * Types of graph nodes |
| */ |
| typedef enum CUgraphNodeType_enum { |
| /** Node for a GPU kernel */ |
| CU_GRAPH_NODE_TYPE_KERNEL = 0x00, |
| /** Node for a memcpy */ |
| CU_GRAPH_NODE_TYPE_MEMCPY = 0x01, |
| /** Node for a memset */ |
| CU_GRAPH_NODE_TYPE_MEMSET = 0x02, |
| /** Host (executable) node */ |
| CU_GRAPH_NODE_TYPE_HOST = 0x03, |
| /** Node that executes an embedded graph */ |
| CU_GRAPH_NODE_TYPE_GRAPH = 0x04, |
| /** Empty (no-op) node */ |
| CU_GRAPH_NODE_TYPE_EMPTY = 0x05 |
| } CUgraphNodeType; |
| |
| /* Synchronization policy values; numbering starts at 1 */ |
| typedef enum CUsynchronizationPolicy_enum { |
| CU_SYNC_POLICY_AUTO = 0x01, |
| CU_SYNC_POLICY_SPIN = 0x02, |
| CU_SYNC_POLICY_YIELD = 0x03, |
| CU_SYNC_POLICY_BLOCKING_SYNC = 0x04 |
| } CUsynchronizationPolicy; |
| |
| /** |
| * Attribute identifiers for graph kernel nodes |
| */ |
| typedef enum CUkernelNodeAttrID_enum { |
| /** Identifies ::CUkernelNodeAttrValue::accessPolicyWindow. */ |
| CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 0x01, |
| /** Lets a kernel node be cooperative (see ::cuLaunchCooperativeKernel). */ |
| CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 0x02 |
| } CUkernelNodeAttrID; |
| |
| /** |
| * Union of graph attribute values, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute |
| */ |
| typedef union CUkernelNodeAttrValue_union { |
| /** Attribute ::CUaccessPolicyWindow. */ |
| CUaccessPolicyWindow accessPolicyWindow; |
| /** A nonzero value indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ |
| int cooperative; |
| } CUkernelNodeAttrValue; |
| |
| /** |
| * Stream capture statuses that ::cuStreamIsCapturing may return |
| */ |
| typedef enum CUstreamCaptureStatus_enum { |
| /** The stream is not capturing */ |
| CU_STREAM_CAPTURE_STATUS_NONE = 0x00, |
| /** The stream is actively capturing */ |
| CU_STREAM_CAPTURE_STATUS_ACTIVE = 0x01, |
| /** The stream is part of a capture sequence that has been invalidated, but not terminated */ |
| CU_STREAM_CAPTURE_STATUS_INVALIDATED = 0x02 |
| } CUstreamCaptureStatus; |
| |
| /** |
| * Modes available for stream capture thread interactions. More details are |
| * given under ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode |
| */ |
| typedef enum CUstreamCaptureMode_enum { |
| CU_STREAM_CAPTURE_MODE_GLOBAL = 0x00, |
| CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 0x01, |
| CU_STREAM_CAPTURE_MODE_RELAXED = 0x02 |
| } CUstreamCaptureMode; |
| |
| /** |
| * Attribute identifiers for streams |
| */ |
| typedef enum CUstreamAttrID_enum { |
| /** Identifies ::CUstreamAttrValue::accessPolicyWindow. */ |
| CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 0x01, |
| /** ::CUsynchronizationPolicy applied to work queued up in this stream */ |
| CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 0x03 |
| } CUstreamAttrID; |
| |
| /** |
| * Union of stream attribute values, used with ::cuStreamSetAttribute/::cuStreamGetAttribute |
| */ |
| typedef union CUstreamAttrValue_union { |
| /** Attribute ::CUaccessPolicyWindow. */ |
| CUaccessPolicyWindow accessPolicyWindow; |
| /** Value of ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ |
| CUsynchronizationPolicy syncPolicy; |
| } CUstreamAttrValue; |
| |
| /** |
| * Error codes |
| */ |
| typedef enum cudaError_enum { |
| /** |
| * The API call returned with no errors. In the case of query calls, this |
| * also means that the operation being queried is complete (see |
| * ::cuEventQuery() and ::cuStreamQuery()). |
| */ |
| CUDA_SUCCESS = 0, |
| |
| /** |
| * This indicates that one or more of the parameters passed to the API call |
| * is not within an acceptable range of values. |
| */ |
| CUDA_ERROR_INVALID_VALUE = 1, |
| |
| /** |
| * The API call failed because it was unable to allocate enough memory to |
| * perform the requested operation. |
| */ |
| CUDA_ERROR_OUT_OF_MEMORY = 2, |
| |
| /** |
| * This indicates that the CUDA driver has not been initialized with |
| * ::cuInit() or that initialization has failed. |
| */ |
| CUDA_ERROR_NOT_INITIALIZED = 3, |
| |
| /** |
| * This indicates that the CUDA driver is in the process of shutting down. |
| */ |
| CUDA_ERROR_DEINITIALIZED = 4, |
| |
| /** |
| * This indicates profiler is not initialized for this run. This can |
| * happen when the application is running with external profiling tools |
| * like visual profiler. |
| */ |
| CUDA_ERROR_PROFILER_DISABLED = 5, |
| |
| /** |
| * \deprecated |
| * This error return is deprecated as of CUDA 5.0. It is no longer an error |
| * to attempt to enable/disable the profiling via ::cuProfilerStart or |
| * ::cuProfilerStop without initialization. |
| */ |
| CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, |
| |
| /** |
| * \deprecated |
| * This error return is deprecated as of CUDA 5.0. It is no longer an error |
| * to call cuProfilerStart() when profiling is already enabled. |
| */ |
| CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, |
| |
| /** |
| * \deprecated |
| * This error return is deprecated as of CUDA 5.0. It is no longer an error |
| * to call cuProfilerStop() when profiling is already disabled. |
| */ |
| CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, |
| |
| /** |
| * This indicates that no CUDA-capable devices were detected by the installed |
| * CUDA driver. |
| */ |
| CUDA_ERROR_NO_DEVICE = 100, |
| |
| /** |
| * This indicates that the device ordinal supplied by the user does not |
| * correspond to a valid CUDA device. |
| */ |
| CUDA_ERROR_INVALID_DEVICE = 101, |
| |
| |
| /** |
| * This indicates that the device kernel image is invalid. This can also |
| * indicate an invalid CUDA module. |
| */ |
| CUDA_ERROR_INVALID_IMAGE = 200, |
| |
| /** |
| * This most frequently indicates that there is no context bound to the |
| * current thread. This can also be returned if the context passed to an |
| * API call is not a valid handle (such as a context that has had |
| * ::cuCtxDestroy() invoked on it). This can also be returned if a user |
| * mixes different API versions (i.e. 3010 context with 3020 API calls). |
| * See ::cuCtxGetApiVersion() for more details. |
| */ |
| CUDA_ERROR_INVALID_CONTEXT = 201, |
| |
| /** |
| * This indicated that the context being supplied as a parameter to the |
| * API call was already the active context. |
| * \deprecated |
| * This error return is deprecated as of CUDA 3.2. It is no longer an |
| * error to attempt to push the active context via ::cuCtxPushCurrent(). |
| */ |
| CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, |
| |
| /** |
| * This indicates that a map or register operation has failed. |
| */ |
| CUDA_ERROR_MAP_FAILED = 205, |
| |
| /** |
| * This indicates that an unmap or unregister operation has failed. |
| */ |
| CUDA_ERROR_UNMAP_FAILED = 206, |
| |
| /** |
| * This indicates that the specified array is currently mapped and thus |
| * cannot be destroyed. |
| */ |
| CUDA_ERROR_ARRAY_IS_MAPPED = 207, |
| |
| /** |
| * This indicates that the resource is already mapped. |
| */ |
| CUDA_ERROR_ALREADY_MAPPED = 208, |
| |
| /** |
| * This indicates that there is no kernel image available that is suitable |
| * for the device. This can occur when a user specifies code generation |
| * options for a particular CUDA source file that do not include the |
| * corresponding device configuration. |
| */ |
| CUDA_ERROR_NO_BINARY_FOR_GPU = 209, |
| |
| /** |
| * This indicates that a resource has already been acquired. |
| */ |
| CUDA_ERROR_ALREADY_ACQUIRED = 210, |
| |
| /** |
| * This indicates that a resource is not mapped. |
| */ |
| CUDA_ERROR_NOT_MAPPED = 211, |
| |
| /** |
| * This indicates that a mapped resource is not available for access as an |
| * array. |
| */ |
| CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, |
| |
| /** |
| * This indicates that a mapped resource is not available for access as a |
| * pointer. |
| */ |
| CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, |
| |
| /** |
| * This indicates that an uncorrectable ECC error was detected during |
| * execution. |
| */ |
| CUDA_ERROR_ECC_UNCORRECTABLE = 214, |
| |
| /** |
| * This indicates that the ::CUlimit passed to the API call is not |
| * supported by the active device. |
| */ |
| CUDA_ERROR_UNSUPPORTED_LIMIT = 215, |
| |
| /** |
| * This indicates that the ::CUcontext passed to the API call can |
| * only be bound to a single CPU thread at a time but is already |
| * bound to a CPU thread. |
| */ |
| CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, |
| |
| /** |
| * This indicates that peer access is not supported across the given |
| * devices. |
| */ |
| CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, |
| |
| /** |
| * This indicates that a PTX JIT compilation failed. |
| */ |
| CUDA_ERROR_INVALID_PTX = 218, |
| |
| /** |
| * This indicates an error with OpenGL or DirectX context. |
| */ |
| CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, |
| |
| /** |
| * This indicates that an uncorrectable NVLink error was detected during the |
| * execution. |
| */ |
| CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, |
| |
| /** |
| * This indicates that the PTX JIT compiler library was not found. |
| */ |
| CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, |
| |
| /** |
| * This indicates that the device kernel source is invalid. |
| */ |
| CUDA_ERROR_INVALID_SOURCE = 300, |
| |
| /** |
| * This indicates that the file specified was not found. |
| */ |
| CUDA_ERROR_FILE_NOT_FOUND = 301, |
| |
| /** |
| * This indicates that a link to a shared object failed to resolve. |
| */ |
| CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, |
| |
| /** |
| * This indicates that initialization of a shared object failed. |
| */ |
| CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, |
| |
| /** |
| * This indicates that an OS call failed. |
| */ |
| CUDA_ERROR_OPERATING_SYSTEM = 304, |
| |
| /** |
| * This indicates that a resource handle passed to the API call was not |
| * valid. Resource handles are opaque types like ::CUstream and ::CUevent. |
| */ |
| CUDA_ERROR_INVALID_HANDLE = 400, |
| |
| /** |
| * This indicates that a resource required by the API call is not in a |
| * valid state to perform the requested operation. |
| */ |
| CUDA_ERROR_ILLEGAL_STATE = 401, |
| |
| /** |
| * This indicates that a named symbol was not found. Examples of symbols |
| * are global/constant variable names, texture names, and surface names. |
| */ |
| CUDA_ERROR_NOT_FOUND = 500, |
| |
| /** |
| * This indicates that asynchronous operations issued previously have not |
| * completed yet. This result is not actually an error, but must be indicated |
| * differently than ::CUDA_SUCCESS (which indicates completion). Calls that |
| * may return this value include ::cuEventQuery() and ::cuStreamQuery(). |
| */ |
| CUDA_ERROR_NOT_READY = 600, |
| |
| /** |
| * While executing a kernel, the device encountered a |
| * load or store instruction on an invalid memory address. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_ILLEGAL_ADDRESS = 700, |
| |
| /** |
| * This indicates that a launch did not occur because it did not have |
| * appropriate resources. This error usually indicates that the user has |
| * attempted to pass too many arguments to the device kernel, or the |
| * kernel launch specifies too many threads for the kernel's register |
| * count. Passing arguments of the wrong size (e.g. a 64-bit pointer |
| * when a 32-bit int is expected) is equivalent to passing too many |
| * arguments and can also result in this error. |
| */ |
| CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, |
| |
| /** |
| * This indicates that the device kernel took too long to execute. This can |
| * only occur if timeouts are enabled - see the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_LAUNCH_TIMEOUT = 702, |
| |
| /** |
| * This error indicates a kernel launch that uses an incompatible texturing |
| * mode. |
| */ |
| CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, |
| |
| /** |
| * This error indicates that a call to ::cuCtxEnablePeerAccess() is |
| * trying to re-enable peer access to a context which has already |
| * had peer access to it enabled. |
| */ |
| CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, |
| |
| /** |
| * This error indicates that ::cuCtxDisablePeerAccess() is |
| * trying to disable peer access which has not been enabled yet |
| * via ::cuCtxEnablePeerAccess(). |
| */ |
| CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, |
| |
| /** |
| * This error indicates that the primary context for the specified device |
| * has already been initialized. |
| */ |
| CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, |
| |
| /** |
| * This error indicates that the context current to the calling thread |
| * has been destroyed using ::cuCtxDestroy, or is a primary context which |
| * has not yet been initialized. |
| */ |
| CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, |
| |
| /** |
| * A device-side assert triggered during kernel execution. The context |
| * cannot be used anymore, and must be destroyed. All existing device |
| * memory allocations from this context are invalid and must be |
| * reconstructed if the program is to continue using CUDA. |
| */ |
| CUDA_ERROR_ASSERT = 710, |
| |
| /** |
| * This error indicates that the hardware resources required to enable |
| * peer access have been exhausted for one or more of the devices |
| * passed to ::cuCtxEnablePeerAccess(). |
| */ |
| CUDA_ERROR_TOO_MANY_PEERS = 711, |
| |
| /** |
| * This error indicates that the memory range passed to ::cuMemHostRegister() |
| * has already been registered. |
| */ |
| CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, |
| |
| /** |
| * This error indicates that the pointer passed to ::cuMemHostUnregister() |
| * does not correspond to any currently registered memory region. |
| */ |
| CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, |
| |
| /** |
| * While executing a kernel, the device encountered a stack error. |
| * This can be due to stack corruption or exceeding the stack size limit. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_HARDWARE_STACK_ERROR = 714, |
| |
| /** |
| * While executing a kernel, the device encountered an illegal instruction. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, |
| |
| /** |
| * While executing a kernel, the device encountered a load or store instruction |
| * on a memory address which is not aligned. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_MISALIGNED_ADDRESS = 716, |
| |
| /** |
| * While executing a kernel, the device encountered an instruction |
| * which can only operate on memory locations in certain address spaces |
| * (global, shared, or local), but was supplied a memory address not |
| * belonging to an allowed address space. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, |
| |
| /** |
| * While executing a kernel, the device program counter wrapped its address space. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_INVALID_PC = 718, |
| |
| /** |
| * An exception occurred on the device while executing a kernel. Common |
| * causes include dereferencing an invalid device pointer and accessing |
| * out of bounds shared memory. Less common cases can be system specific - more |
| * information about these cases can be found in the system specific user guide. |
| * This leaves the process in an inconsistent state and any further CUDA work |
| * will return the same error. To continue using CUDA, the process must be terminated |
| * and relaunched. |
| */ |
| CUDA_ERROR_LAUNCH_FAILED = 719, |
| |
| /** |
| * This error indicates that the number of blocks launched per grid for a kernel that was |
| * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice |
| * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor |
| * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors |
| * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. |
| */ |
| CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, |
| |
| /** |
| * This error indicates that the attempted operation is not permitted. |
| */ |
| CUDA_ERROR_NOT_PERMITTED = 800, |
| |
| /** |
| * This error indicates that the attempted operation is not supported |
| * on the current system or device. |
| */ |
| CUDA_ERROR_NOT_SUPPORTED = 801, |
| |
| /** |
| * This error indicates that the system is not yet ready to start any CUDA |
| * work. To continue using CUDA, verify the system configuration is in a |
| * valid state and all required driver daemons are actively running. |
| * More information about this error can be found in the system specific |
| * user guide. |
| */ |
| CUDA_ERROR_SYSTEM_NOT_READY = 802, |
| |
| /** |
| * This error indicates that there is a mismatch between the versions of |
| * the display driver and the CUDA driver. Refer to the compatibility documentation |
| * for supported versions. |
| */ |
| CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, |
| |
| /** |
| * This error indicates that the system was upgraded to run with forward compatibility |
| * but the visible hardware detected by CUDA does not support this configuration. |
| * Refer to the compatibility documentation for the supported hardware matrix or ensure |
| * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES |
| * environment variable. |
| */ |
| CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, |
| |
| /** |
| * This error indicates that the operation is not permitted when |
| * the stream is capturing. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, |
| |
| /** |
| * This error indicates that the current capture sequence on the stream |
| * has been invalidated due to a previous error. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, |
| |
| /** |
| * This error indicates that the operation would have resulted in a merge |
| * of two independent capture sequences. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, |
| |
| /** |
| * This error indicates that the capture was not initiated in this stream. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, |
| |
| /** |
| * This error indicates that the capture sequence contains a fork that was |
| * not joined to the primary stream. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, |
| |
| /** |
| * This error indicates that a dependency would have been created which |
| * crosses the capture sequence boundary. Only implicit in-stream ordering |
| * dependencies are allowed to cross the boundary. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, |
| |
| /** |
| * This error indicates a disallowed implicit dependency on a current capture |
| * sequence from cudaStreamLegacy. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, |
| |
| /** |
| * This error indicates that the operation is not permitted on an event which |
| * was last recorded in a capturing stream. |
| */ |
| CUDA_ERROR_CAPTURED_EVENT = 907, |
| |
| /** |
| * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED |
| * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a |
| * different thread. |
| */ |
| CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, |
| |
| /** |
| * This error indicates that the timeout specified for the wait operation has lapsed. |
| */ |
| CUDA_ERROR_TIMEOUT = 909, |
| |
| /** |
| * This error indicates that the graph update was not performed because it included |
| * changes which violated constraints specific to instantiated graph update. |
| */ |
| CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, |
| |
| /** |
| * This indicates that an unknown internal error has occurred. |
| */ |
| CUDA_ERROR_UNKNOWN = 999 |
| } CUresult; |
| |
| /** |
| * P2P Attributes |
| * |
| * Attributes describing the peer-to-peer link between two devices. |
| * Note that the deprecated ACCESS_ACCESS_SUPPORTED name and its |
| * replacement CUDA_ARRAY_ACCESS_SUPPORTED share the same value (0x04). |
| */ |
| typedef enum CUdevice_P2PAttribute_enum { |
| CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ |
| CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P access is enabled */ |
| CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operations over the link are supported */ |
| CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ |
| CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link is supported */ |
| } CUdevice_P2PAttribute; |
| |
| /** |
| * CUDA stream callback function type |
| * |
| * The callback returns no value. |
| * |
| * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. |
| * \param status ::CUDA_SUCCESS or any persistent error on the stream. |
| * \param userData User parameter provided at registration. |
| */ |
| typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); |
| |
| /** |
| * Block size to per-block dynamic shared memory mapping for a certain |
| * kernel. |
| * |
| * \param blockSize Block size of the kernel. |
| * |
| * \return The dynamic shared memory needed by a block. |
| */ |
| typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); |
| |
| /** |
| * If set, host memory is portable between CUDA contexts. |
| * Flag for ::cuMemHostAlloc() |
| */ |
| #define CU_MEMHOSTALLOC_PORTABLE 0x01 |
| |
| /** |
| * If set, host memory is mapped into the CUDA address space and |
| * ::cuMemHostGetDevicePointer() may be called on the host pointer. |
| * Flag for ::cuMemHostAlloc() |
| */ |
| #define CU_MEMHOSTALLOC_DEVICEMAP 0x02 |
| |
| /** |
| * If set, host memory is allocated as write-combined: fast to write, |
| * faster to DMA, but slow to read except via the SSE4 streaming load |
| * instruction (MOVNTDQA). |
| * Flag for ::cuMemHostAlloc() |
| */ |
| #define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 |
| |
| /** |
| * If set, host memory is portable between CUDA contexts. |
| * Flag for ::cuMemHostRegister() |
| */ |
| #define CU_MEMHOSTREGISTER_PORTABLE 0x01 |
| |
| /** |
| * If set, host memory is mapped into the CUDA address space and |
| * ::cuMemHostGetDevicePointer() may be called on the host pointer. |
| * Flag for ::cuMemHostRegister() |
| */ |
| #define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 |
| |
| /** |
| * If set, the passed memory pointer is treated as pointing to some |
| * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. |
| * On Windows the flag is a no-op. |
| * On Linux that memory is marked as non cache-coherent for the GPU and |
| * is expected to be physically contiguous. It may return |
| * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, or |
| * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. |
| * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED |
| * is returned. |
| * Flag for ::cuMemHostRegister() |
| */ |
| #define CU_MEMHOSTREGISTER_IOMEMORY 0x04 |
| |
| /** |
| * 2D memory copy parameters |
| * |
| * Describes the source and the destination of a 2D copy, followed by the |
| * extent of the copied region (WidthInBytes by Height). For each side, |
| * the memory type member indicates which of the host pointer, device |
| * pointer, or array member describes the memory; see ::cuMemcpy2D(). |
| */ |
| typedef struct CUDA_MEMCPY2D_st { |
| size_t srcXInBytes; /**< Source X in bytes */ |
| size_t srcY; /**< Source Y */ |
| |
| CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
| const void *srcHost; /**< Source host pointer */ |
| CUdeviceptr srcDevice; /**< Source device pointer */ |
| CUarray srcArray; /**< Source array reference */ |
| size_t srcPitch; /**< Source pitch (ignored when src is array) */ |
| |
| size_t dstXInBytes; /**< Destination X in bytes */ |
| size_t dstY; /**< Destination Y */ |
| |
| CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
| void *dstHost; /**< Destination host pointer */ |
| CUdeviceptr dstDevice; /**< Destination device pointer */ |
| CUarray dstArray; /**< Destination array reference */ |
| size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ |
| |
| size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ |
| size_t Height; /**< Height of 2D memory copy */ |
| } CUDA_MEMCPY2D; |
| |
| /** |
| * 3D memory copy parameters |
| * |
| * Describes the source and the destination of a 3D copy, followed by the |
| * extent of the copied region (WidthInBytes by Height by Depth). |
| * The reserved0 and reserved1 members must be NULL. |
| */ |
| typedef struct CUDA_MEMCPY3D_st { |
| size_t srcXInBytes; /**< Source X in bytes */ |
| size_t srcY; /**< Source Y */ |
| size_t srcZ; /**< Source Z */ |
| size_t srcLOD; /**< Source LOD */ |
| CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
| const void *srcHost; /**< Source host pointer */ |
| CUdeviceptr srcDevice; /**< Source device pointer */ |
| CUarray srcArray; /**< Source array reference */ |
| void *reserved0; /**< Must be NULL */ |
| size_t srcPitch; /**< Source pitch (ignored when src is array) */ |
| size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ |
| |
| size_t dstXInBytes; /**< Destination X in bytes */ |
| size_t dstY; /**< Destination Y */ |
| size_t dstZ; /**< Destination Z */ |
| size_t dstLOD; /**< Destination LOD */ |
| CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
| void *dstHost; /**< Destination host pointer */ |
| CUdeviceptr dstDevice; /**< Destination device pointer */ |
| CUarray dstArray; /**< Destination array reference */ |
| void *reserved1; /**< Must be NULL */ |
| size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ |
| size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ |
| |
| size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ |
| size_t Height; /**< Height of 3D memory copy */ |
| size_t Depth; /**< Depth of 3D memory copy */ |
| } CUDA_MEMCPY3D; |
| |
| /** |
| * 3D memory cross-context copy parameters |
| * |
| * Same members as ::CUDA_MEMCPY3D, except that the two reserved pointer |
| * members are replaced by the source and destination contexts. |
| */ |
| typedef struct CUDA_MEMCPY3D_PEER_st { |
| size_t srcXInBytes; /**< Source X in bytes */ |
| size_t srcY; /**< Source Y */ |
| size_t srcZ; /**< Source Z */ |
| size_t srcLOD; /**< Source LOD */ |
| CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
| const void *srcHost; /**< Source host pointer */ |
| CUdeviceptr srcDevice; /**< Source device pointer */ |
| CUarray srcArray; /**< Source array reference */ |
| CUcontext srcContext; /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ |
| size_t srcPitch; /**< Source pitch (ignored when src is array) */ |
| size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ |
| |
| size_t dstXInBytes; /**< Destination X in bytes */ |
| size_t dstY; /**< Destination Y */ |
| size_t dstZ; /**< Destination Z */ |
| size_t dstLOD; /**< Destination LOD */ |
| CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
| void *dstHost; /**< Destination host pointer */ |
| CUdeviceptr dstDevice; /**< Destination device pointer */ |
| CUarray dstArray; /**< Destination array reference */ |
| CUcontext dstContext; /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ |
| size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ |
| size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ |
| |
| size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ |
| size_t Height; /**< Height of 3D memory copy */ |
| size_t Depth; /**< Depth of 3D memory copy */ |
| } CUDA_MEMCPY3D_PEER; |
| |
| /** |
| * Array descriptor |
| * |
| * Describes an array by its width, height, element format, and number |
| * of channels per element. |
| */ |
| typedef struct CUDA_ARRAY_DESCRIPTOR_st |
| { |
| size_t Width; /**< Width of array */ |
| size_t Height; /**< Height of array */ |
| |
| CUarray_format Format; /**< Array format */ |
| unsigned int NumChannels; /**< Channels per array element */ |
| } CUDA_ARRAY_DESCRIPTOR; |
| |
| /** |
| * 3D array descriptor |
| * |
| * Describes a 3D array by its width, height, depth, element format, |
| * number of channels per element, and flags. |
| */ |
| typedef struct CUDA_ARRAY3D_DESCRIPTOR_st |
| { |
| size_t Width; /**< Width of 3D array */ |
| size_t Height; /**< Height of 3D array */ |
| size_t Depth; /**< Depth of 3D array */ |
| |
| CUarray_format Format; /**< Array format */ |
| unsigned int NumChannels; /**< Channels per array element */ |
| unsigned int Flags; /**< Flags */ |
| } CUDA_ARRAY3D_DESCRIPTOR; |
| |
| /** |
| * CUDA Resource descriptor |
| * |
| * Holds a resource type followed by a union with one member per kind of |
| * resource (array, mipmapped array, linear memory, pitched 2D memory). |
| * The union member to fill in is the one matching resType; see |
| * ::cuTexObjectCreate(). The flags member must be zero. |
| */ |
| typedef struct CUDA_RESOURCE_DESC_st |
| { |
| CUresourcetype resType; /**< Resource type */ |
| |
| union { |
| struct { |
| CUarray hArray; /**< CUDA array */ |
| } array; |
| struct { |
| CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ |
| } mipmap; |
| struct { |
| CUdeviceptr devPtr; /**< Device pointer */ |
| CUarray_format format; /**< Array format */ |
| unsigned int numChannels; /**< Channels per array element */ |
| size_t sizeInBytes; /**< Size in bytes */ |
| } linear; |
| struct { |
| CUdeviceptr devPtr; /**< Device pointer */ |
| CUarray_format format; /**< Array format */ |
| unsigned int numChannels; /**< Channels per array element */ |
| size_t width; /**< Width of the array in elements */ |
| size_t height; /**< Height of the array in elements */ |
| size_t pitchInBytes; /**< Pitch between two rows in bytes */ |
| } pitch2D; |
| struct { |
| int reserved[32]; /**< Reserved */ |
| } reserved; |
| } res; |
| |
| unsigned int flags; /**< Flags (must be zero) */ |
| } CUDA_RESOURCE_DESC; |
| |
| /** |
| * Texture descriptor |
| * |
| * Collects the addressing, filtering, anisotropy, mipmap, and border |
| * color settings of a texture. |
| */ |
| typedef struct CUDA_TEXTURE_DESC_st { |
| CUaddress_mode addressMode[3]; /**< Address modes (three entries) */ |
| CUfilter_mode filterMode; /**< Filter mode */ |
| unsigned int flags; /**< Flags */ |
| unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ |
| CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ |
| float mipmapLevelBias; /**< Mipmap level bias */ |
| float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ |
| float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ |
| float borderColor[4]; /**< Border color (four components) */ |
| int reserved[12]; /**< Reserved */ |
| } CUDA_TEXTURE_DESC; |
| |
| /** |
| * Resource view format |
| * |
| * The integer and floating point enumerators are named after the number |
| * of channels and the bit width of each channel (for example, UINT_2X16 |
| * is 2 channels of unsigned 16-bit integers). The BC enumerators select |
| * block compressed formats. ::CU_RES_VIEW_FORMAT_NONE means the format of |
| * the underlying resource is used. |
| */ |
| typedef enum CUresourceViewFormat_enum |
| { |
| CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ |
| CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ |
| CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ |
| CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ |
| CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ |
| CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ |
| CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ |
| CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ |
| CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ |
| CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ |
| CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ |
| CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ |
| CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ |
| CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ |
| CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ |
| CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ |
| CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ |
| CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ |
| CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ |
| CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ |
| } CUresourceViewFormat; |
| |
| /** |
| * Resource view descriptor |
| * |
| * Describes a view by its format, its extent (width, height, depth), |
| * and the ranges of mipmap levels and layers it covers. |
| */ |
| typedef struct CUDA_RESOURCE_VIEW_DESC_st |
| { |
| CUresourceViewFormat format; /**< Resource view format */ |
| size_t width; /**< Width of the resource view */ |
| size_t height; /**< Height of the resource view */ |
| size_t depth; /**< Depth of the resource view */ |
| unsigned int firstMipmapLevel; /**< First defined mipmap level */ |
| unsigned int lastMipmapLevel; /**< Last defined mipmap level */ |
| unsigned int firstLayer; /**< First layer index */ |
| unsigned int lastLayer; /**< Last layer index */ |
| unsigned int reserved[16]; /**< Reserved */ |
| } CUDA_RESOURCE_VIEW_DESC; |
| |
| /** |
| * GPU Direct v3 tokens |
| * |
| * Pairs a 64-bit P2P token with a 32-bit VA space token. |
| * NOTE(review): how these tokens are obtained and consumed is not |
| * visible in this part of the header; confirm against the pointer |
| * attribute documentation. |
| */ |
| typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { |
| unsigned long long p2pToken; /**< P2P token */ |
| unsigned int vaSpaceToken; /**< VA space token */ |
| } CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; |
| |
| /** |
| * Kernel launch parameters |
| * |
| * Bundles everything describing one kernel launch: the kernel, the grid |
| * and thread block dimensions, the dynamic shared memory size, the |
| * stream, and the kernel arguments. |
| */ |
| typedef struct CUDA_LAUNCH_PARAMS_st { |
| CUfunction function; /**< Kernel to launch */ |
| unsigned int gridDimX; /**< Width of grid in blocks */ |
| unsigned int gridDimY; /**< Height of grid in blocks */ |
| unsigned int gridDimZ; /**< Depth of grid in blocks */ |
| unsigned int blockDimX; /**< X dimension of each thread block */ |
| unsigned int blockDimY; /**< Y dimension of each thread block */ |
| unsigned int blockDimZ; /**< Z dimension of each thread block */ |
| unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ |
| CUstream hStream; /**< Stream identifier */ |
| void **kernelParams; /**< Array of pointers to kernel parameters */ |
| } CUDA_LAUNCH_PARAMS; |
| |
| /** |
| * External memory handle types |
| * |
| * Used as the type member of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC to tell |
| * which kind of handle the descriptor carries. |
| */ |
| typedef enum CUexternalMemoryHandleType_enum { |
| /** |
| * Handle is an opaque file descriptor |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, |
| /** |
| * Handle is an opaque shared NT handle |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, |
| /** |
| * Handle is an opaque, globally shared handle |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, |
| /** |
| * Handle is a D3D12 heap object |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, |
| /** |
| * Handle is a D3D12 committed resource |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, |
| /** |
| * Handle is a shared NT handle to a D3D11 resource |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, |
| /** |
| * Handle is a globally shared handle to a D3D11 resource |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, |
| /** |
| * Handle is an NvSciBuf object |
| */ |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 |
| } CUexternalMemoryHandleType; |
| |
| /** |
| * Indicates that the external memory object is a dedicated resource |
| */ |
| #define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 |
| |
| /** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS |
| * contains this flag, it indicates that signaling an external semaphore object |
| * should skip performing appropriate memory synchronization operations over all |
| * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, |
| * which otherwise are performed by default to ensure data coherency with other |
| * importers of the same NvSciBuf memory objects. |
| */ |
| #define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 |
| |
| /** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS |
| * contains this flag, it indicates that waiting on an external semaphore object |
| * should skip performing appropriate memory synchronization operations over all |
| * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, |
| * which otherwise are performed by default to ensure data coherency with other |
| * importers of the same NvSciBuf memory objects. |
| */ |
| #define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 |
| |
| /** |
| * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, |
| * it indicates that the application needs signaler-specific NvSciSyncAttr |
| * to be filled by ::cuDeviceGetNvSciSyncAttributes. |
| */ |
| #define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 |
| |
| /** |
| * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, |
| * it indicates that the application needs waiter-specific NvSciSyncAttr |
| * to be filled by ::cuDeviceGetNvSciSyncAttributes. |
| */ |
| #define CUDA_NVSCISYNC_ATTR_WAIT 0x2 |
| /** |
| * External memory handle descriptor |
| * |
| * The type member determines which member of the handle union is valid. |
| */ |
| typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { |
| /** |
| * Type of the handle |
| */ |
| CUexternalMemoryHandleType type; |
| union { |
| /** |
| * File descriptor referencing the memory object. Valid |
| * when type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD |
| */ |
| int fd; |
| /** |
| * Win32 handle referencing the memory object. Valid when |
| * type is one of the following: |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT |
| * Exactly one of 'handle' and 'name' must be non-NULL. If |
| * type is one of the following: |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT |
| * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT |
| * then 'name' must be NULL. |
| */ |
| struct { |
| /** |
| * Valid NT handle. Must be NULL if 'name' is non-NULL |
| */ |
| void *handle; |
| /** |
| * Name of a valid memory object. |
| * Must be NULL if 'handle' is non-NULL. |
| */ |
| const void *name; |
| } win32; |
| /** |
| * A handle representing an NvSciBuf Object. Valid when type |
| * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF |
| */ |
| const void *nvSciBufObject; |
| } handle; |
| /** |
| * Size of the memory allocation |
| */ |
| unsigned long long size; |
| /** |
| * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED |
| */ |
| unsigned int flags; |
| unsigned int reserved[16]; /**< Reserved */ |
| } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; |
| |
| /** |
| * External memory buffer descriptor. |
| * Describes a buffer by its offset into a memory object and its size. |
| */ |
| typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { |
| /** |
| * Offset into the memory object where the buffer's base is |
| */ |
| unsigned long long offset; |
| /** |
| * Size of the buffer |
| */ |
| unsigned long long size; |
| /** |
| * Flags reserved for future use. Must be zero. |
| */ |
| unsigned int flags; |
| unsigned int reserved[16]; |
| } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; |
| |
| /** |
| * External memory mipmapped array descriptor. |
| * Describes a mipmap chain by its offset into a memory object, the |
| * layout of its base level and its number of levels. |
| */ |
| typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { |
| /** |
| * Offset into the memory object where the base level of the |
| * mipmap chain is. |
| */ |
| unsigned long long offset; |
| /** |
| * Format, dimension and type of the base level of the mipmap chain |
| * (see ::CUDA_ARRAY3D_DESCRIPTOR) |
| */ |
| CUDA_ARRAY3D_DESCRIPTOR arrayDesc; |
| /** |
| * Total number of levels in the mipmap chain |
| */ |
| unsigned int numLevels; |
| unsigned int reserved[16]; |
| } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; |
| |
| /** |
| * External semaphore handle types. |
| * Used as the \p type member of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC. |
| */ |
| typedef enum CUexternalSemaphoreHandleType_enum { |
| /** |
| * Handle is an opaque file descriptor |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, |
| /** |
| * Handle is an opaque shared NT handle |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, |
| /** |
| * Handle is an opaque, globally shared handle |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, |
| /** |
| * Handle is a shared NT handle referencing a D3D12 fence object |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, |
| /** |
| * Handle is a shared NT handle referencing a D3D11 fence object |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, |
| /** |
| * Handle is an opaque handle to an NvSciSync object |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, |
| /** |
| * Handle is a shared NT handle referencing a D3D11 keyed mutex object |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, |
| /** |
| * Handle is a globally shared handle referencing a D3D11 keyed mutex object |
| */ |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8 |
| } CUexternalSemaphoreHandleType; |
| |
| /** |
| * External semaphore handle descriptor. |
| * Holds the handle type, the type-specific handle payload and the flags. |
| */ |
| typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { |
| /** |
| * Type of the handle |
| */ |
| CUexternalSemaphoreHandleType type; |
| union { |
| /** |
| * File descriptor referencing the semaphore object. Valid |
| * when type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD |
| */ |
| int fd; |
| /** |
| * Win32 handle referencing the semaphore object. Valid when |
| * type is one of the following: |
| * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 |
| * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
| * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE |
| * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE |
| * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX |
| * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT |
| * Exactly one of 'handle' and 'name' must be non-NULL. If |
| * type is one of the following: |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT |
| * then 'name' must be NULL. |
| */ |
| struct { |
| /** |
| * Valid NT handle. Must be NULL if 'name' is non-NULL. |
| */ |
| void *handle; |
| /** |
| * Name of a valid synchronization primitive. |
| * Must be NULL if 'handle' is non-NULL. |
| */ |
| const void *name; |
| } win32; |
| /** |
| * Valid NvSciSyncObj. Must be non-NULL. |
| */ |
| const void* nvSciSyncObj; |
| } handle; |
| /** |
| * Flags reserved for the future. Must be zero. |
| */ |
| unsigned int flags; |
| unsigned int reserved[16]; |
| } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; |
| |
| /** |
| * External semaphore signal parameters. |
| * The \p params member groups the per-type parameters (fence value, |
| * NvSciSync fence pointer, keyed mutex key); \p flags is described below. |
| */ |
| typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { |
| struct { |
| /** |
| * Parameters for fence objects |
| */ |
| struct { |
| /** |
| * Value of fence to be signaled |
| */ |
| unsigned long long value; |
| } fence; |
| union { |
| /** |
| * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType |
| * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. |
| */ |
| void *fence; |
| unsigned long long reserved; |
| } nvSciSync; |
| /** |
| * Parameters for keyed mutex objects |
| */ |
| struct { |
| /** |
| * Value of key to release the mutex with |
| */ |
| unsigned long long key; |
| } keyedMutex; |
| unsigned int reserved[12]; |
| } params; |
| /** |
| * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to |
| * signal a ::CUexternalSemaphore of type |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is |
| * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which indicates |
| * that while signaling the ::CUexternalSemaphore, no memory synchronization |
| * operations should be performed for any external memory object imported |
| * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. |
| * For all other types of ::CUexternalSemaphore, flags must be zero. |
| */ |
| unsigned int flags; |
| unsigned int reserved[16]; |
| } CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; |
| |
| /** |
| * External semaphore wait parameters. |
| * The \p params member groups the per-type parameters (fence value, |
| * NvSciSync fence pointer, keyed mutex key and timeout); \p flags is |
| * described below. |
| */ |
| typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { |
| struct { |
| /** |
| * Parameters for fence objects |
| */ |
| struct { |
| /** |
| * Value of fence to be waited on |
| */ |
| unsigned long long value; |
| } fence; |
| /** |
| * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType |
| * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. |
| */ |
| union { |
| void *fence; |
| unsigned long long reserved; |
| } nvSciSync; |
| /** |
| * Parameters for keyed mutex objects |
| */ |
| struct { |
| /** |
| * Value of key to acquire the mutex with |
| */ |
| unsigned long long key; |
| /** |
| * Timeout in milliseconds to wait to acquire the mutex |
| */ |
| unsigned int timeoutMs; |
| } keyedMutex; |
| unsigned int reserved[10]; |
| } params; |
| /** |
| * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on |
| * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, |
| * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, |
| * which indicates that while waiting for the ::CUexternalSemaphore, no memory |
| * synchronization operations should be performed for any external memory |
| * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. |
| * For all other types of ::CUexternalSemaphore, flags must be zero. |
| */ |
| unsigned int flags; |
| unsigned int reserved[16]; |
| } CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; |
| |
| |
| typedef unsigned long long CUmemGenericAllocationHandle; /**< Generic memory allocation handle, represented as an unsigned long long integer */ |
| |
| /** |
| * Flags for specifying particular handle types. |
| * The type noted in parentheses after each entry is the handle's C type. |
| */ |
| typedef enum CUmemAllocationHandleType_enum { |
| CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ |
| CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ |
| CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ |
| CU_MEM_HANDLE_TYPE_MAX = 0xFFFFFFFF /**< Maximum enumerator value (0xFFFFFFFF) */ |
| } CUmemAllocationHandleType; |
| |
| /** |
| * Specifies the memory protection flags for mapping. |
| */ |
| typedef enum CUmemAccess_flags_enum { |
| CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ |
| CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ |
| CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ |
| CU_MEM_ACCESS_FLAGS_PROT_MAX = 0xFFFFFFFF /**< Maximum enumerator value (0xFFFFFFFF) */ |
| } CUmemAccess_flags; |
| |
| /** |
| * Specifies the type of location. |
| * Used as the \p type member of ::CUmemLocation. |
| */ |
| typedef enum CUmemLocationType_enum { |
| CU_MEM_LOCATION_TYPE_INVALID = 0x0, /**< Zero value, designated as the invalid location type */ |
| CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ |
| CU_MEM_LOCATION_TYPE_MAX = 0xFFFFFFFF /**< Maximum enumerator value (0xFFFFFFFF) */ |
| } CUmemLocationType; |
| |
| /** |
| * Defines the allocation types available. |
| * Used as the \p type member of ::CUmemAllocationProp. |
| */ |
| typedef enum CUmemAllocationType_enum { |
| CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, /**< Zero value, designated as the invalid allocation type */ |
| |
| /** This allocation type is 'pinned', i.e. it cannot migrate from its current |
| * location while the application is actively using it |
| */ |
| CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, |
| CU_MEM_ALLOCATION_TYPE_MAX = 0xFFFFFFFF /**< Maximum enumerator value (0xFFFFFFFF) */ |
| } CUmemAllocationType; |
| |
| /** |
| * Flags for selecting which allocation granularity is requested: the |
| * minimum required one or the one recommended for best performance. |
| */ |
| typedef enum CUmemAllocationGranularity_flags_enum { |
| CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ |
| CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ |
| } CUmemAllocationGranularity_flags; |
| |
| /** |
| * Specifies a location for an allocation. |
| */ |
| typedef struct CUmemLocation_st { |
| CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ |
| int id; /**< Identifier whose meaning depends on \p type; a device ordinal when \p type is ::CU_MEM_LOCATION_TYPE_DEVICE. */ |
| } CUmemLocation; |
| |
| /** |
| * Specifies the compression attribute for an allocation. |
| */ |
| typedef enum CUmemAllocationCompType_enum { |
| CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ |
| CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ |
| } CUmemAllocationCompType; |
| |
| /** |
| * Specifies the allocation properties for an allocation. |
| */ |
| typedef struct CUmemAllocationProp_st { |
| /** Allocation type */ |
| CUmemAllocationType type; |
| /** Requested ::CUmemAllocationHandleType */ |
| CUmemAllocationHandleType requestedHandleTypes; |
| /** Location of allocation */ |
| CUmemLocation location; |
| /** |
| * Windows-specific LPSECURITYATTRIBUTES required when |
| * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines |
| * the scope of which exported allocations may be transferred to other |
| * processes. In all other cases, this field is required to be zero. |
| */ |
| void *win32HandleMetaData; |
| struct { |
| /** |
| * Additional allocation hint for requesting compressible memory. |
| * Compressed memory allows higher bandwidth, but may cause |
| * compression resource thrashing, and compressed memory may not be |
| * mappable on all devices. |
| */ |
| unsigned char compressionType; |
| unsigned char gpuDirectRDMACapable; |
| unsigned char reserved[6]; |
| } allocFlags; |
| } CUmemAllocationProp; |
| |
| /** |
| * Memory access descriptor: pairs a location with the accessibility |
| * flags requested for it. |
| */ |
| typedef struct CUmemAccessDesc_st { |
| CUmemLocation location; /**< Location on which the request is to change its accessibility */ |
| CUmemAccess_flags flags; /**< ::CUmemAccess_flags accessibility flags to set on the request */ |
| } CUmemAccessDesc; |
| |
| typedef enum CUgraphExecUpdateResult_enum { /* Result codes describing whether a graph exec update succeeded or why it failed */ |
| CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ |
| CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ |
| CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ |
| CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ |
| CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed */ |
| CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ |
| CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6 /**< The update failed because something about the node is not supported */ |
| } CUgraphExecUpdateResult; |
| |
| /** |
| * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only |
| * waits for prior work in the stream corresponding to that GPU to complete before the |
| * kernel begins execution. |
| */ |
| #define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 |
| |
| /** |
| * If set, any subsequent work enqueued in a stream that participated in a call to |
| * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on |
| * the GPU corresponding to that stream to complete before it begins execution. |
| */ |
| #define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 |
| |
| /** |
| * If set, the CUDA array is a collection of layers, where each layer is either a 1D |
| * or a 2D array and the Depth member of ::CUDA_ARRAY3D_DESCRIPTOR specifies the number |
| * of layers, not the depth of a 3D array. |
| */ |
| #define CUDA_ARRAY3D_LAYERED 0x01 |
| |
| /** |
| * Deprecated alias with the same value (0x01); use ::CUDA_ARRAY3D_LAYERED |
| */ |
| #define CUDA_ARRAY3D_2DARRAY 0x01 |
| |
| /** |
| * This flag must be set in order to bind a surface reference |
| * to the CUDA array |
| */ |
| #define CUDA_ARRAY3D_SURFACE_LDST 0x02 |
| |
| /** |
| * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The |
| * width of such a CUDA array must be equal to its height, and Depth must be six. |
| * If the ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps |
| * and Depth must be a multiple of six. |
| */ |
| #define CUDA_ARRAY3D_CUBEMAP 0x04 |
| |
| /** |
| * This flag must be set in order to perform texture gather operations |
| * on a CUDA array. |
| */ |
| #define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 |
| |
| /** |
| * If set, this flag indicates that the CUDA |
| * array is a DEPTH_TEXTURE. |
| */ |
| #define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 |
| |
| /** |
| * This flag indicates that the CUDA array may be bound as a color target |
| * in an external graphics API |
| */ |
| #define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 |
| |
| /** |
| * Override the texref format with a format inferred from the array. |
| * Flag for ::cuTexRefSetArray() |
| */ |
| #define CU_TRSA_OVERRIDE_FORMAT 0x01 |
| |
| /** |
| * Read the texture as integers rather than promoting the values to floats |
| * in the range [0,1]. |
| * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
| */ |
| #define CU_TRSF_READ_AS_INTEGER 0x01 |
| |
| /** |
| * Use normalized texture coordinates in the range [0,1) instead of [0,dim). |
| * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
| */ |
| #define CU_TRSF_NORMALIZED_COORDINATES 0x02 |
| |
| /** |
| * Perform sRGB-to-linear conversion during texture read. |
| * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
| */ |
| #define CU_TRSF_SRGB 0x10 |
| |
| /** |
| * Disable any trilinear filtering optimizations. |
| * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() |
| */ |
| #define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 |
| |
| /** |
| * End-of-array terminator for the \p extra parameter to |
| * ::cuLaunchKernel |
| */ |
| #define CU_LAUNCH_PARAM_END ((void*)0x00) |
| |
| /** |
| * Indicator that the next value in the \p extra parameter to |
| * ::cuLaunchKernel will be a pointer to a buffer containing all kernel |
| * parameters used for launching kernel \p f. This buffer needs to |
| * honor all alignment/padding requirements of the individual parameters. |
| * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the |
| * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no |
| * effect. |
| */ |
| #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) |
| |
| /** |
| * Indicator that the next value in the \p extra parameter to |
| * ::cuLaunchKernel will be a pointer to a size_t which contains the |
| * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. |
| * ::CU_LAUNCH_PARAM_BUFFER_POINTER must also be specified |
| * in the \p extra array if the value associated with |
| * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. |
| */ |
| #define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) |
| |
| /** |
| * For texture references loaded into the module, use the default texunit from |
| * the texture reference. Expands to -1. |
| */ |
| #define CU_PARAM_TR_DEFAULT -1 |
| |
| /** |
| * Device handle value, (CUdevice)-1, that represents the CPU |
| */ |
| #define CU_DEVICE_CPU ((CUdevice)-1) |
| |
| /** |
| * Device handle value, (CUdevice)-2, that represents an invalid device |
| */ |
| #define CU_DEVICE_INVALID ((CUdevice)-2) |
| |
| /** @} */ /* END CUDA_TYPES */ |
| |
| #if defined(__GNUC__) |
| #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) |
| #pragma GCC visibility push(default) |
| #endif /* defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) */ |
| #endif /* defined(__GNUC__) */ |
| |
| #ifdef _WIN32 |
| #define CUDAAPI __stdcall /* driver API declarations below are marked __stdcall on Win32 */ |
| #else |
| #define CUDAAPI /* expands to nothing when _WIN32 is not defined */ |
| #endif /* _WIN32 */ |
| |
| /** |
| * \defgroup CUDA_ERROR Error Handling |
| * |
| * ___MANBRIEF___ error handling functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the error handling functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Gets the string description of an error code |
| * |
| * Sets \p *pStr to the address of a NULL-terminated string description |
| * of the error code \p error. |
| * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE |
| * will be returned and \p *pStr will be set to the NULL address. |
| * |
| * \param error - Error code to convert to string |
| * \param pStr - Address of the string pointer; \p *pStr receives the |
| * description, or NULL when \p error is not recognized. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::CUresult, |
| * ::cudaGetErrorString |
| */ |
| CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); |
| |
| /** |
| * \brief Gets the string representation of an error code enum name |
| * |
| * Sets \p *pStr to the address of a NULL-terminated string representation |
| * of the name of the enum error code \p error. |
| * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE |
| * will be returned and \p *pStr will be set to the NULL address. |
| * |
| * \param error - Error code whose enum name is requested |
| * \param pStr - Address of the string pointer; \p *pStr receives the |
| * enum name, or NULL when \p error is not recognized. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::CUresult, |
| * ::cudaGetErrorName |
| */ |
| CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); |
| |
| /** @} */ /* END CUDA_ERROR */ |
| |
| /** |
| * \defgroup CUDA_INITIALIZE Initialization |
| * |
| * ___MANBRIEF___ initialization functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the initialization functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Initialize the CUDA driver API |
| * |
| * Initializes the driver API and must be called before any other function from |
| * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() |
| * has not been called, any function from the driver API will return |
| * ::CUDA_ERROR_NOT_INITIALIZED. |
| * |
| * \param Flags - Initialization flag for CUDA. Currently must be 0. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, |
| * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE |
| * \notefnerr |
| */ |
| CUresult CUDAAPI cuInit(unsigned int Flags); |
| |
| /** @} */ /* END CUDA_INITIALIZE */ |
| |
| /** |
| * \defgroup CUDA_VERSION Version Management |
| * |
| * ___MANBRIEF___ version management functions of the low-level CUDA driver |
| * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the version management functions of the low-level |
| * CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Returns the latest CUDA version supported by driver |
| * |
| * Returns in \p *driverVersion the version of CUDA supported by |
| * the driver. The version is returned as |
| * (1000 * major + 10 * minor). For example, CUDA 9.2 |
| * would be represented by 9020. |
| * |
| * This function returns ::CUDA_ERROR_INVALID_VALUE if |
| * \p driverVersion is NULL. |
| * |
| * \param driverVersion - Returns the CUDA driver version; must not be NULL |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cudaDriverGetVersion, |
| * ::cudaRuntimeGetVersion |
| */ |
| CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); |
| |
| /** @} */ /* END CUDA_VERSION */ |
| |
| /** |
| * \defgroup CUDA_DEVICE Device Management |
| * |
| * ___MANBRIEF___ device management functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the device management functions of the low-level |
| * CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Returns a handle to a compute device |
| * |
| * Returns in \p *device a device handle given an ordinal in the range <b>[0, |
| * ::cuDeviceGetCount()-1]</b>. |
| * |
| * \param device - Returned device handle |
| * \param ordinal - Device number to get handle for, in the range |
| * [0, ::cuDeviceGetCount()-1] |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGetUuid, |
| * ::cuDeviceGetLuid, |
| * ::cuDeviceTotalMem |
| */ |
| CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); |
| |
| /** |
| * \brief Returns the number of compute-capable devices |
| * |
| * Returns in \p *count the number of devices with compute capability greater |
| * than or equal to 2.0 that are available for execution. If there is no such |
| * device, ::cuDeviceGetCount() returns 0. |
| * |
| * \param count - Returned number of compute-capable devices (0 if there |
| * is no such device) |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGetUuid, |
| * ::cuDeviceGetLuid, |
| * ::cuDeviceGet, |
| * ::cuDeviceTotalMem, |
| * ::cudaGetDeviceCount |
| */ |
| CUresult CUDAAPI cuDeviceGetCount(int *count); |
| |
| /** |
| * \brief Returns an identifier string for the device |
| * |
| * Returns an ASCII string identifying the device \p dev in the NULL-terminated |
| * string pointed to by \p name. \p len specifies the maximum length of the |
| * string that may be returned. |
| * |
| * \param name - Returned identifier string for the device |
| * \param len - Maximum length of string to store in \p name |
| * \param dev - Device to get identifier string for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetUuid, |
| * ::cuDeviceGetLuid, |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGet, |
| * ::cuDeviceTotalMem, |
| * ::cudaGetDeviceProperties |
| */ |
| CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); |
| |
| /** |
| * \brief Returns a UUID for the device |
| * |
| * Returns 16 octets identifying the device \p dev in the structure |
| * pointed to by \p uuid. |
| * |
| * \param uuid - Returned UUID |
| * \param dev - Device to get the UUID for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGetLuid, |
| * ::cuDeviceGet, |
| * ::cuDeviceTotalMem, |
| * ::cudaGetDeviceProperties |
| */ |
| CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); |
| |
| #if defined(_WIN32) |
| /** |
| * \brief Returns an LUID and device node mask for the device |
| * |
| * Returns identifying information (\p luid and \p deviceNodeMask) to allow |
| * matching the device with graphics APIs. |
| * This function is only declared when _WIN32 is defined. |
| * |
| * \param luid - Returned LUID |
| * \param deviceNodeMask - Returned device node mask |
| * \param dev - Device to get the LUID and device node mask for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGet, |
| * ::cuDeviceTotalMem, |
| * ::cudaGetDeviceProperties |
| */ |
| CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); |
| #endif /* defined(_WIN32) */ |
| |
| /** |
| * \brief Returns the total amount of memory on the device |
| * |
| * Returns in \p *bytes the total amount of memory available on the device |
| * \p dev in bytes. |
| * |
| * \param bytes - Returned total amount of memory on the device, in bytes |
| * \param dev - Handle of the device to query |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGetUuid, |
| * ::cuDeviceGet, |
| * ::cudaMemGetInfo |
| */ |
| CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); |
| |
| /** |
| * \brief Returns information about the device |
| * |
| * Returns in \p *pi the integer value of the attribute \p attrib on device |
| * \p dev. The supported attributes are: |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per |
| * block; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of |
| * shared memory available to a thread block in bytes; |
| * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for |
| * __constant__ variables in a CUDA C kernel in bytes; |
| * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the |
| * memory copy functions that involve memory regions allocated through |
| * ::cuMemAllocPitch(); |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D |
| * texture width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width |
| * for a 1D texture bound to linear memory; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum |
| * mipmapped 1D texture width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D |
| * texture width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D |
| * texture height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width |
| * for a 2D texture bound to linear memory; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height |
| * for a 2D texture bound to linear memory; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch |
| * in bytes for a 2D texture bound to linear memory; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum |
| * mipmapped 2D texture width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum |
| * mipmapped 2D texture height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D |
| * texture width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D |
| * texture height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D |
| * texture depth; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: |
| * Alternate maximum 3D texture width, 0 if no alternate |
| * maximum 3D texture size is supported; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: |
| * Alternate maximum 3D texture height, 0 if no alternate |
| * maximum 3D texture size is supported; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: |
| * Alternate maximum 3D texture depth, 0 if no alternate |
| * maximum 3D texture size is supported; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: |
| * Maximum cubemap texture width or height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: |
| * Maximum 1D layered texture width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: |
| * Maximum layers in a 1D layered texture; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: |
| * Maximum 2D layered texture width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: |
| * Maximum 2D layered texture height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: |
| * Maximum layers in a 2D layered texture; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: |
| * Maximum cubemap layered texture width or height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: |
| * Maximum layers in a cubemap layered texture; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: |
| * Maximum 1D surface width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: |
| * Maximum 2D surface width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: |
| * Maximum 2D surface height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: |
| * Maximum 3D surface width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: |
| * Maximum 3D surface height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: |
| * Maximum 3D surface depth; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: |
| * Maximum 1D layered surface width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: |
| * Maximum layers in a 1D layered surface; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: |
| * Maximum 2D layered surface width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: |
| * Maximum 2D layered surface height; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: |
| * Maximum layers in a 2D layered surface; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: |
| * Maximum cubemap surface width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: |
| * Maximum cubemap layered surface width; |
| * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: |
| * Maximum layers in a cubemap layered surface; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit |
| * registers available to a thread block; |
| * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; |
| * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture |
| * base addresses aligned to ::textureAlign bytes do not need an offset |
| * applied to texture fetches; |
| * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement |
| * for 2D texture references bound to pitched memory; |
| * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy |
| * memory between host and device while executing a kernel, or 0 if not; |
| * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on |
| * the device; |
| * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit |
| * for kernels executed on the device, or 0 if not; |
| * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the |
| * memory subsystem, or 0 if not; |
| * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host |
| * memory into the CUDA address space, or 0 if not; |
| * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently |
| * in. Available modes are as follows: |
| * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and |
| * can have multiple CUDA contexts present at a single time. |
| * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is |
| * prohibited from creating new CUDA contexts. |
| * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device |
| * can have only one context used by a single process at a time. |
| * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports |
| * executing multiple kernels within the same context simultaneously, or 0 if |
| * not. It is not guaranteed that multiple kernels will be resident |
| * on the device concurrently so this feature should not be relied upon for |
| * correctness; |
| * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the |
| * device, 0 if error correction is disabled or not supported by the device; |
| * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device; |
| * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier |
| * of the device; |
| * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device; |
| * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC |
| * is only available on Tesla hardware running Windows Vista or later; |
| * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz; |
| * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits; |
| * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor; |
| * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with |
| * the host, or 0 if not; |
| * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number; |
| * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number; |
| * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals |
| * in L1 cache, 0 if caching globals in L1 cache is not supported by the device; |
| * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals |
| * in L1 cache, 0 if caching locals in L1 cache is not supported by the device; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of |
| * shared memory available to a multiprocessor in bytes; this amount is shared |
| * by all thread blocks simultaneously resident on a multiprocessor; |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit |
| * registers available to a multiprocessor; this number is shared by all thread |
| * blocks simultaneously resident on a multiprocessor; |
| * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory |
| * on this system, 0 if allocating managed memory is not supported by the device on this system. |
| * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. |
| * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices |
| * associated with the same board. Devices on the same multi-GPU board will share the same identifier. |
| * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host |
| * supports native atomic operations. |
| * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance |
| * (in floating-point operations per second) to double precision performance. |
| * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing |
| * pageable memory without calling cudaHostRegister on it. |
| * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory |
| * concurrently with the CPU. |
| * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. |
| * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered |
| * memory at the same virtual address as the CPU. |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size |
| * supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. |
| * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES |
| * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's |
| * page tables. |
| * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. |
| * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED: Device supports virtual address management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs |
| * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate |
| * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate |
| * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 cache size in bytes for persisting lines |
| * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor. |
| * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate |
| * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes. |
| * |
| * \param pi - Returned device attribute value |
| * \param attrib - Device attribute to query |
| * \param dev - Device handle |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGetUuid, |
| * ::cuDeviceGet, |
| * ::cuDeviceTotalMem, |
| * ::cudaDeviceGetAttribute, |
| * ::cudaGetDeviceProperties |
| */ |
| CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); |
| |
| /** |
| * \brief Return NvSciSync attributes that this device can support. |
| * |
| * Returns in \p nvSciSyncAttrList the properties of NvSciSync that |
| * this CUDA device, \p dev, can support. The returned \p nvSciSyncAttrList |
| * can be used to create an NvSciSync object that matches this device's capabilities. |
| * |
| * If the NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is |
| * already set, this API will return ::CUDA_ERROR_INVALID_VALUE. |
| * |
| * The application should set \p nvSciSyncAttrList to a valid |
| * NvSciSyncAttrList, failing which this API will return |
| * ::CUDA_ERROR_INVALID_HANDLE. |
| * |
| * The \p flags parameter controls how the application intends to use |
| * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: |
| * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the application intends to |
| * signal an NvSciSync on this CUDA device. |
| * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the application intends to |
| * wait on an NvSciSync on this CUDA device. |
| * |
| * At least one of these flags must be set, failing which the API |
| * returns ::CUDA_ERROR_INVALID_VALUE. The two flags are orthogonal |
| * to one another: a developer may set both of them, which allows both |
| * wait-specific and signal-specific attributes to be set in the same |
| * \p nvSciSyncAttrList. |
| * |
| * \param nvSciSyncAttrList - Returned NvSciSync attributes supported. |
| * \param dev - Valid CUDA device to get NvSciSync attributes for. |
| * \param flags - Flags describing NvSciSync usage. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_NOT_SUPPORTED, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * |
| * \sa |
| * ::cuImportExternalSemaphore, |
| * ::cuDestroyExternalSemaphore, |
| * ::cuSignalExternalSemaphoresAsync, |
| * ::cuWaitExternalSemaphoresAsync |
| */ |
| CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); |
| |
| /** @} */ /* END CUDA_DEVICE */ |
| |
| /** |
| * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] |
| * |
| * ___MANBRIEF___ deprecated device management functions of the low-level CUDA |
| * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the device management functions of the low-level |
| * CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Returns properties for a selected device |
| * |
| * \deprecated |
| * |
| * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). |
| * |
| * Returns in \p *prop the properties of device \p dev. The ::CUdevprop |
| * structure is defined as: |
| * |
| * \code |
| typedef struct CUdevprop_st { |
| int maxThreadsPerBlock; |
| int maxThreadsDim[3]; |
| int maxGridSize[3]; |
| int sharedMemPerBlock; |
| int totalConstantMemory; |
| int SIMDWidth; |
| int memPitch; |
| int regsPerBlock; |
| int clockRate; |
| int textureAlign; |
| } CUdevprop; |
| * \endcode |
| * where: |
| * |
| * - ::maxThreadsPerBlock is the maximum number of threads per block; |
| * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; |
| * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; |
| * - ::sharedMemPerBlock is the total amount of shared memory available per |
| * block in bytes; |
| * - ::totalConstantMemory is the total amount of constant memory available on |
| * the device in bytes; |
| * - ::SIMDWidth is the warp size; |
| * - ::memPitch is the maximum pitch allowed by the memory copy functions that |
| * involve memory regions allocated through ::cuMemAllocPitch(); |
| * - ::regsPerBlock is the total number of registers available per block; |
| * - ::clockRate is the clock frequency in kilohertz; |
| * - ::textureAlign is the alignment requirement; texture base addresses that |
| * are aligned to ::textureAlign bytes do not need an offset applied to |
| * texture fetches. |
| * |
| * \param prop - Returned properties of device |
| * \param dev - Device to get properties for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGetUuid, |
| * ::cuDeviceGet, |
| * ::cuDeviceTotalMem |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); |
| |
| /** |
| * \brief Returns the compute capability of the device |
| * |
| * \deprecated |
| * |
| * This function was deprecated as of CUDA 5.0 and its functionality superseded |
| * by ::cuDeviceGetAttribute(). |
| * |
| * Returns in \p *major and \p *minor the major and minor revision numbers that |
| * define the compute capability of the device \p dev. |
| * |
| * \param major - Returned major revision number |
| * \param minor - Returned minor revision number |
| * \param dev - Device handle |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetCount, |
| * ::cuDeviceGetName, |
| * ::cuDeviceGetUuid, |
| * ::cuDeviceGet, |
| * ::cuDeviceTotalMem |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); |
| |
| /** @} */ /* END CUDA_DEVICE_DEPRECATED */ |
| |
| /** |
| * \defgroup CUDA_PRIMARY_CTX Primary Context Management |
| * |
| * ___MANBRIEF___ primary context management functions of the low-level CUDA driver |
| * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the primary context management functions of the low-level |
| * CUDA driver application programming interface. |
| * |
| * The primary context is unique per device and shared with the CUDA runtime API. |
| * These functions allow integration with other libraries using CUDA. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Retain the primary context on the GPU |
| * |
| * Retains the primary context on the device, creating it if necessary, |
| * increasing its usage count. The caller must call |
| * ::cuDevicePrimaryCtxRelease() when done using the context. |
| * Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack. |
| * |
| * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of |
| * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() |
| * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode |
| * of the device. |
| * The <i>nvidia-smi</i> tool can be used to set the compute mode for |
| * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a |
| * -h option to it. |
| * |
| * Please note that the primary context always supports pinned allocations. Other |
| * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). |
| * |
| * \param pctx - Returned context handle of the primary context |
| * \param dev - Device for which primary context is requested |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa ::cuDevicePrimaryCtxRelease, |
| * ::cuDevicePrimaryCtxSetFlags, |
| * ::cuCtxCreate, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); |
| |
| /** |
| * \brief Release the primary context on the GPU |
| * |
| * Releases the primary context interop on the device by decreasing the usage |
| * count by 1. If the usage count drops to 0 the primary context of device \p dev |
| * will be destroyed regardless of how many threads it is current to. |
| * |
| * Please note that unlike ::cuCtxDestroy() this method does not pop the context |
| * from the stack under any circumstances. |
| * |
| * \param dev - Device whose primary context is released |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa ::cuDevicePrimaryCtxRetain, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); |
| |
| /** |
| * \brief Set flags for the primary context |
| * |
| * Sets the flags for the primary context on the device, overwriting previously |
| * set ones. |
| * |
| * The three LSBs of the \p flags parameter can be used to control how the OS |
| * thread, which owns the CUDA context at the time of an API call, interacts |
| * with the OS scheduler when waiting for results from the GPU. Only one of |
| * the scheduling flags can be set when creating a context. |
| * |
| * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for |
| * results from the GPU. This can decrease latency when waiting for the GPU, |
| * but may lower the performance of CPU threads if they are performing work in |
| * parallel with the CUDA thread. |
| * |
| * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for |
| * results from the GPU. This can increase latency when waiting for the GPU, |
| * but can increase the performance of CPU threads performing work in parallel |
| * with the GPU. |
| * |
| * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
| * synchronization primitive when waiting for the GPU to finish work. |
| * |
| * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
| * synchronization primitive when waiting for the GPU to finish work. <br> |
| * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was |
| * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. |
| * |
| * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, |
| * uses a heuristic based on the number of active CUDA contexts in the |
| * process \e C and the number of logical processors in the system \e P. If |
| * \e C > \e P, then CUDA will yield to other OS threads when waiting for |
| * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while |
| * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). |
| * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on |
| * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC |
| * for low-powered devices. |
| * |
| * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory |
| * after resizing local memory for a kernel. This can prevent thrashing by |
| * local memory allocations when launching many kernels with high local |
| * memory usage at the cost of potentially increased memory usage. <br> |
| * <b>Deprecated:</b> This flag is deprecated and the behavior enabled |
| * by this flag is now the default and cannot be disabled. |
| * |
| * \param dev - Device for which the primary context flags are set |
| * \param flags - New flags for the device |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuDevicePrimaryCtxRetain, |
| * ::cuDevicePrimaryCtxGetState, |
| * ::cuCtxCreate, |
| * ::cuCtxGetFlags, |
| * ::cudaSetDeviceFlags |
| */ |
| CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); |
| |
| /** |
| * \brief Get the state of the primary context |
| * |
| * Returns in \p *flags the flags for the primary context of \p dev, and in |
| * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag |
| * values. |
| * |
| * \param dev - Device to get primary context flags for |
| * \param flags - Pointer to store flags |
| * \param active - Pointer to store context state; 0 = inactive, 1 = active |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDevicePrimaryCtxSetFlags, |
| * ::cuCtxGetFlags, |
| * ::cudaGetDeviceFlags |
| */ |
| CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); |
| |
| /** |
| * \brief Destroy all allocations and reset all state on the primary context |
| * |
| * Explicitly destroys and cleans up all resources associated with the current |
| * device in the current process. |
| * |
| * Note that it is the responsibility of the calling function to ensure that no |
| * other module in the process is using the device any more. For that reason |
| * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. |
| * However, it is safe for other modules to call ::cuDevicePrimaryCtxRelease() |
| * even after resetting the device. |
| * |
| * \param dev - Device for which primary context is destroyed |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE |
| * \notefnerr |
| * |
| * \sa ::cuDevicePrimaryCtxRetain, |
| * ::cuDevicePrimaryCtxRelease, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cudaDeviceReset |
| */ |
| CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); |
| |
| /** @} */ /* END CUDA_PRIMARY_CTX */ |
| |
| |
| /** |
| * \defgroup CUDA_CTX Context Management |
| * |
| * ___MANBRIEF___ context management functions of the low-level CUDA driver |
| * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the context management functions of the low-level |
| * CUDA driver application programming interface. |
| * |
| * Please note that some functions are described in |
| * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Create a CUDA context |
| * |
| * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. |
| * |
| * Creates a new CUDA context and associates it with the calling thread. The |
| * \p flags parameter is described below. The context is created with a usage |
| * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() |
| * when done using the context. If a context is already current to the thread, |
| * it is supplanted by the newly created context and may be restored by a subsequent |
| * call to ::cuCtxPopCurrent(). |
| * |
| * The three LSBs of the \p flags parameter can be used to control how the OS |
| * thread, which owns the CUDA context at the time of an API call, interacts |
| * with the OS scheduler when waiting for results from the GPU. Only one of |
| * the scheduling flags can be set when creating a context. |
| * |
| * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for |
| * results from the GPU. This can decrease latency when waiting for the GPU, |
| * but may lower the performance of CPU threads if they are performing work in |
| * parallel with the CUDA thread. |
| * |
| * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for |
| * results from the GPU. This can increase latency when waiting for the GPU, |
| * but can increase the performance of CPU threads performing work in parallel |
| * with the GPU. |
| * |
| * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
| * synchronization primitive when waiting for the GPU to finish work. |
| * |
| * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a |
| * synchronization primitive when waiting for the GPU to finish work. <br> |
| * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was |
| * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. |
| * |
| * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, |
| * uses a heuristic based on the number of active CUDA contexts in the |
| * process \e C and the number of logical processors in the system \e P. If |
| * \e C > \e P, then CUDA will yield to other OS threads when waiting for |
| * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while |
| * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). |
| * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on |
| * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC |
| * for low-powered devices. |
| * |
| * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. |
| * This flag must be set in order to allocate pinned host memory that is |
| * accessible to the GPU. |
| * |
| * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory |
| * after resizing local memory for a kernel. This can prevent thrashing by |
| * local memory allocations when launching many kernels with high local |
| * memory usage at the cost of potentially increased memory usage. <br> |
| * <b>Deprecated:</b> This flag is deprecated and the behavior enabled |
| * by this flag is now the default and cannot be disabled. |
| * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). |
| * |
| * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of |
| * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() |
| * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the |
| * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set |
| * the compute mode for devices. |
| * Documentation for <i>nvidia-smi</i> can be obtained by passing a |
| * -h option to it. |
| * |
| * \param pctx - Returned context handle of the new context |
| * \param flags - Context creation flags |
| * \param dev - Device to create context on |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); |
| |
| /** |
| * \brief Destroy a CUDA context |
| * |
| * Destroys the CUDA context specified by \p ctx. The context \p ctx will be |
| * destroyed regardless of how many threads it is current to. |
| * It is the responsibility of the calling function to ensure that no API |
| * call is issued using \p ctx while ::cuCtxDestroy() is executing. |
| * |
| * If \p ctx is current to the calling thread then \p ctx will also be |
| * popped from the current thread's context stack (as though ::cuCtxPopCurrent() |
| * were called). If \p ctx is current to other threads, then \p ctx will |
| * remain current to those threads, and attempting to access \p ctx from |
| * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. |
| * |
| * \param ctx - Context to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); |
| |
| /** |
| * \brief Pushes a context onto the current CPU thread's context stack |
| * |
| * Pushes the given context \p ctx onto the CPU thread's stack of current |
| * contexts. The specified context becomes the CPU thread's current context, so |
| * all CUDA functions that operate on the current context are affected. |
| * |
| * The previous current context may be made current again by calling |
| * ::cuCtxDestroy() or ::cuCtxPopCurrent(). |
| * |
| * \param ctx - Context to push |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); |
| |
| /** |
| * \brief Pops the current CUDA context from the current CPU thread. |
| * |
| * Pops the current CUDA context from the CPU thread and passes back the |
| * old context handle in \p *pctx. That context may then be made current |
| * to a different CPU thread by calling ::cuCtxPushCurrent(). |
| * |
| * If a context was current to the CPU thread before ::cuCtxCreate() or |
| * ::cuCtxPushCurrent() was called, this function makes that context current to |
| * the CPU thread again. |
| * |
| * \param pctx - Returned popped context handle |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); |
| |
| /** |
| * \brief Binds the specified CUDA context to the calling CPU thread |
| * |
| * Binds the specified CUDA context to the calling CPU thread. |
| * If \p ctx is NULL then the CUDA context previously bound to the |
| * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. |
| * |
| * If there exists a CUDA context stack on the calling CPU thread, this |
| * will replace the top of that stack with \p ctx. |
| * If \p ctx is NULL then this will be equivalent to popping the top |
| * of the calling CPU thread's CUDA context stack (or a no-op if the |
| * calling CPU thread's CUDA context stack is empty). |
| * |
| * \param ctx - Context to bind to the calling CPU thread, or NULL to unbind |
| * the context currently bound to it |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa |
| * ::cuCtxGetCurrent, |
| * ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cudaSetDevice |
| */ |
| CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); |
| |
| /** |
| * \brief Returns the CUDA context bound to the calling CPU thread. |
| * |
| * Returns in \p *pctx the CUDA context bound to the calling CPU thread. |
| * If no context is bound to the calling CPU thread then \p *pctx is |
| * set to NULL and ::CUDA_SUCCESS is returned. |
| * |
| * \param pctx - Returned context handle |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED |
| * \notefnerr |
| * |
| * \sa |
| * ::cuCtxSetCurrent, |
| * ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cudaGetDevice |
| */ |
| CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); |
| |
| /** |
| * \brief Returns the device ID for the current context |
| * |
| * Returns in \p *device the ordinal of the current context's device. |
| * |
| * \param device - Returned device ID for the current context |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cudaGetDevice |
| */ |
| CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); |
| |
| /** |
| * \brief Returns the flags for the current context |
| * |
| * Returns in \p *flags the flags of the current context. See ::cuCtxCreate |
| * for flag values. |
| * |
| * \param flags - Pointer to store flags of current context |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetCurrent, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetLimit, |
| * ::cuCtxGetSharedMemConfig, |
| * ::cuCtxGetStreamPriorityRange, |
| * ::cudaGetDeviceFlags |
| */ |
| CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); |
| |
| /** |
| * \brief Block for a context's tasks to complete |
| * |
| * Blocks until the device has completed all preceding requested tasks. |
| * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. |
| * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the |
| * CPU thread will block until the GPU context has finished its work. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cudaDeviceSynchronize |
| */ |
| CUresult CUDAAPI cuCtxSynchronize(void); |
| |
| /** |
| * \brief Set resource limits |
| * |
| * Setting \p limit to \p value is a request by the application to update |
| * the current limit maintained by the context. The driver is free to |
| * modify the requested value to meet h/w requirements (this could be |
| * clamping to minimum or maximum values, rounding up to nearest element |
| * size, etc). The application can use ::cuCtxGetLimit() to find out exactly |
| * what the limit has been set to. |
| * |
| * Setting each ::CUlimit has its own specific restrictions, so each is |
| * discussed here. |
| * |
| * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. |
| * The driver automatically increases the per-thread stack size |
| * for each kernel launch as needed. This size isn't reset back to the |
| * original value after each launch. Setting this value will take effect |
| * immediately, and if necessary, the device will block until all preceding |
| * requested tasks are complete. |
| * |
| * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used |
| * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE |
| * must be performed before launching any kernel that uses the ::printf() |
| * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. |
| * |
| * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used |
| * by the ::malloc() and ::free() device system calls. Setting |
| * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel |
| * that uses the ::malloc() or ::free() device system calls, otherwise |
| * ::CUDA_ERROR_INVALID_VALUE will be returned. |
| * |
| * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of |
| * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting |
| * this limit must be performed before any launch of a kernel that uses the |
| * device runtime and calls ::cudaDeviceSynchronize() above the default sync |
| * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail |
| * with error code ::cudaErrorSyncDepthExceeded if the limitation is |
| * violated. This limit can be set smaller than the default or up to the maximum |
| * launch depth of 24. When setting this limit, keep in mind that additional |
| * levels of sync depth require the driver to reserve large amounts of device |
| * memory which can no longer be used for user allocations. If these |
| * reservations of device memory fail, ::cuCtxSetLimit() will return |
| * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. |
| * This limit is only applicable to devices of compute capability 3.5 and |
| * higher. Attempting to set this limit on devices of compute capability less |
| * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being |
| * returned. |
| * |
| * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of |
| * outstanding device runtime launches that can be made from the current |
| * context. A grid is outstanding from the point of launch up until the grid |
| * is known to have been completed. Device runtime launches which violate |
| * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when |
| * ::cudaGetLastError() is called after launch. If more pending launches than |
| * the default (2048 launches) are needed for a module using the device |
| * runtime, this limit can be increased. Keep in mind that being able to |
| * sustain additional pending launches will require the driver to reserve |
| * larger amounts of device memory upfront which can no longer be used for |
| * allocations. If these reservations fail, ::cuCtxSetLimit() will return |
| * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. |
| * This limit is only applicable to devices of compute capability 3.5 and |
| * higher. Attempting to set this limit on devices of compute capability less |
| * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being |
| * returned. |
| * |
| * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. |
| * Values can range from 0B to 128B. This is purely a performance hint and |
| * it can be ignored or clamped depending on the platform. |
| * |
| * \param limit - Limit to set |
| * \param value - Size of limit |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNSUPPORTED_LIMIT, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSynchronize, |
| * ::cudaDeviceSetLimit |
| */ |
| CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); |
| |
| /** |
| * \brief Returns resource limits |
| * |
| * Returns in \p *pvalue the current size of \p limit. The supported |
| * ::CUlimit values are: |
| * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. |
| * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the |
| * ::printf() device system call. |
| * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the |
| * ::malloc() and ::free() device system calls. |
| * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread |
| * can issue the device runtime call ::cudaDeviceSynchronize() to wait on |
| * child grid launches to complete. |
| * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding |
| * device runtime launches that can be made from this context. |
| * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. |
| * |
| * \param limit - Limit to query |
| * \param pvalue - Returned size of limit |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNSUPPORTED_LIMIT |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cudaDeviceGetLimit |
| */ |
| CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); |
| |
| /** |
| * \brief Returns the preferred cache configuration for the current context. |
| * |
| * On devices where the L1 cache and shared memory use the same hardware |
| * resources, this function returns through \p pconfig the preferred cache configuration |
| * for the current context. This is only a preference. The driver will use |
| * the requested configuration if possible, but it is free to choose a different |
| * configuration if required to execute functions. |
| * |
| * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices |
| * where the size of the L1 cache and shared memory are fixed. |
| * |
| * The supported cache configurations are: |
| * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) |
| * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache |
| * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory |
| * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory |
| * |
| * \param pconfig - Returned cache configuration |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cuFuncSetCacheConfig, |
| * ::cudaDeviceGetCacheConfig |
| */ |
| CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); |
| |
| /** |
| * \brief Sets the preferred cache configuration for the current context. |
| * |
| * On devices where the L1 cache and shared memory use the same hardware |
| * resources, this sets through \p config the preferred cache configuration for |
| * the current context. This is only a preference. The driver will use |
| * the requested configuration if possible, but it is free to choose a different |
| * configuration if required to execute the function. Any function preference |
| * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide |
| * setting. Setting the context-wide cache configuration to |
| * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer |
| * to not change the cache configuration unless required to launch the kernel. |
| * |
| * This setting does nothing on devices where the size of the L1 cache and |
| * shared memory are fixed. |
| * |
| * Launching a kernel with a different preference than the most recent |
| * preference setting may insert a device-side synchronization point. |
| * |
| * The supported cache configurations are: |
| * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) |
| * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache |
| * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory |
| * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory |
| * |
| * \param config - Requested cache configuration |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cuFuncSetCacheConfig, |
| * ::cudaDeviceSetCacheConfig |
| */ |
| CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); |
| |
| /** |
| * \brief Returns the current shared memory configuration for the current context. |
| * |
| * This function will return in \p pConfig the current size of shared memory banks |
| * in the current context. On devices with configurable shared memory banks, |
| * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all |
| * subsequent kernel launches will by default use the new bank size. When |
| * ::cuCtxGetSharedMemConfig is called on devices without configurable shared |
| * memory, it will return the fixed bank size of the hardware. |
| * |
| * The returned bank configurations can be either: |
| * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is |
| * four bytes. |
| * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is |
| * eight bytes. |
| * |
| * \param pConfig - returned shared memory configuration |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cuCtxSetSharedMemConfig, |
| * ::cuFuncSetCacheConfig, |
| * ::cudaDeviceGetSharedMemConfig |
| */ |
| CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); |
| |
| /** |
| * \brief Sets the shared memory configuration for the current context. |
| * |
| * On devices with configurable shared memory banks, this function will set |
| * the context's shared memory bank size which is used for subsequent kernel |
| * launches. |
| * |
| * Changing the shared memory configuration between launches may insert a device |
| * side synchronization point between those launches. |
| * |
| * Changing the shared memory bank size will not increase shared memory usage |
| * or affect occupancy of kernels, but may have major effects on performance. |
| * Larger bank sizes will allow for greater potential bandwidth to shared memory, |
| * but will change what kinds of accesses to shared memory will result in bank |
| * conflicts. |
| * |
| * This function will do nothing on devices with fixed shared memory bank size. |
| * |
| * The supported bank configurations are: |
| * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial |
| * setting (currently, four bytes). |
| * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to |
| * be natively four bytes. |
| * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to |
| * be natively eight bytes. |
| * |
| * \param config - requested shared memory configuration |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cuCtxGetSharedMemConfig, |
| * ::cuFuncSetCacheConfig, |
| * ::cudaDeviceSetSharedMemConfig |
| */ |
| CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); |
| |
| /** |
| * \brief Gets the context's API version. |
| * |
| * Returns a version number in \p version corresponding to the capabilities of |
| * the context (e.g. 3010 or 3020), which library developers can use to direct |
| * callers to a specific API version. If \p ctx is NULL, returns the API version |
| * used to create the currently bound context. |
| * |
| * Note that new API versions are only introduced when context capabilities are |
| * changed that break binary compatibility, so the API version and driver version |
| * may be different. For example, it is valid for the API version to be 3020 while |
| * the driver version is 4020. |
| * |
| * \param ctx - Context to check |
| * \param version - Pointer to version |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); |
| |
| /** |
| * \brief Returns numerical values that correspond to the least and |
| * greatest stream priorities. |
| * |
| * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond |
| * to the least and greatest stream priorities respectively. Stream priorities |
| * follow a convention where lower numbers imply greater priorities. The range of |
| * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. |
| * If the user attempts to create a stream with a priority value that is |
| * outside the meaningful range as specified by this API, the priority is |
| * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority |
| * respectively. See ::cuStreamCreateWithPriority for details on creating a |
| * priority stream. |
| * A NULL may be passed in for \p leastPriority or \p greatestPriority if the value |
| * is not desired. |
| * |
| * This function will return '0' in both \p *leastPriority and \p *greatestPriority if |
| * the current context's device does not support stream priorities |
| * (see ::cuDeviceGetAttribute). |
| * |
| * \param leastPriority - Pointer to an int in which the numerical value for least |
| * stream priority is returned |
| * \param greatestPriority - Pointer to an int in which the numerical value for greatest |
| * stream priority is returned |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreateWithPriority, |
| * ::cuStreamGetPriority, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize, |
| * ::cudaDeviceGetStreamPriorityRange |
| */ |
| CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); |
| |
| /** |
| * \brief Resets all persisting lines in cache to normal status. |
| * |
| * ::cuCtxResetPersistingL2Cache resets all persisting lines in cache to normal |
| * status. Takes effect on function return. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa |
| * ::CUaccessPolicyWindow |
| */ |
| CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); |
| |
| /** @} */ /* END CUDA_CTX */ |
| |
| /** |
| * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] |
| * |
| * ___MANBRIEF___ deprecated context management functions of the low-level CUDA |
| * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the deprecated context management functions of the low-level |
| * CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Increment a context's usage-count |
| * |
| * \deprecated |
| * |
| * Note that this function is deprecated and should not be used. |
| * |
| * Increments the usage count of the context and passes back a context handle |
| * in \p *pctx that must be passed to ::cuCtxDetach() when the application is |
| * done with the context. ::cuCtxAttach() fails if there is no context current |
| * to the thread. |
| * |
| * Currently, the \p flags parameter must be 0. |
| * |
| * \param pctx - Returned context handle of the current context |
| * \param flags - Context attach flags (must be 0) |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxDetach, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); |
| |
| /** |
| * \brief Decrement a context's usage-count |
| * |
| * \deprecated |
| * |
| * Note that this function is deprecated and should not be used. |
| * |
| * Decrements the usage count of the context \p ctx, and destroys the context |
| * if the usage count goes to 0. The context must be a handle that was passed |
| * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the |
| * calling thread. |
| * |
| * \param ctx - Context to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa ::cuCtxCreate, |
| * ::cuCtxDestroy, |
| * ::cuCtxGetApiVersion, |
| * ::cuCtxGetCacheConfig, |
| * ::cuCtxGetDevice, |
| * ::cuCtxGetFlags, |
| * ::cuCtxGetLimit, |
| * ::cuCtxPopCurrent, |
| * ::cuCtxPushCurrent, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxSetLimit, |
| * ::cuCtxSynchronize |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); |
| |
| /** @} */ /* END CUDA_CTX_DEPRECATED */ |
| |
| |
| /** |
| * \defgroup CUDA_MODULE Module Management |
| * |
| * ___MANBRIEF___ module management functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the module management functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Loads a compute module |
| * |
| * Takes a filename \p fname and loads the corresponding module \p module into |
| * the current context. The CUDA driver API does not attempt to lazily |
| * allocate the resources needed by a module; if the memory for functions and |
| * data (constant and global) needed by the module cannot be allocated, |
| * ::cuModuleLoad() fails. The file should be a \e cubin file as output by |
| * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or |
| * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. |
| * |
| * \param module - Returned module |
| * \param fname - Filename of module to load |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_PTX, |
| * ::CUDA_ERROR_NOT_FOUND, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_FILE_NOT_FOUND, |
| * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
| * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
| * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetGlobal, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleLoadFatBinary, |
| * ::cuModuleUnload |
| */ |
| CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); |
| |
| /** |
| * \brief Load a module's data |
| * |
| * Takes a pointer \p image and loads the corresponding module \p module into |
| * the current context. The pointer may be obtained by mapping a \e cubin or |
| * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file |
| * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin |
| * object into the executable resources and using operating system calls such |
| * as Windows \c FindResource() to obtain the pointer. |
| * |
| * \param module - Returned module |
| * \param image - Module data to load |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_PTX, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
| * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
| * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetGlobal, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleLoadFatBinary, |
| * ::cuModuleUnload |
| */ |
| CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); |
| |
| /** |
| * \brief Load a module's data with options |
| * |
| * Takes a pointer \p image and loads the corresponding module \p module into |
| * the current context. The pointer may be obtained by mapping a \e cubin or |
| * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file |
| * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin |
| * object into the executable resources and using operating system calls such |
| * as Windows \c FindResource() to obtain the pointer. Options are passed as |
| * an array via \p options and any corresponding parameters are passed in |
| * \p optionValues. The number of total options is supplied via \p numOptions. |
| * Any outputs will be returned via \p optionValues. |
| * |
| * \param module - Returned module |
| * \param image - Module data to load |
| * \param numOptions - Number of options |
| * \param options - Options for JIT |
| * \param optionValues - Option values for JIT |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_PTX, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
| * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
| * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetGlobal, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadFatBinary, |
| * ::cuModuleUnload |
| */ |
| CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); |
| |
| /** |
| * \brief Load a module's data |
| * |
| * Takes a pointer \p fatCubin and loads the corresponding module \p module |
| * into the current context. The pointer represents a <i>fat binary</i> object, |
| * which is a collection of different \e cubin and/or \e PTX files, all |
| * representing the same device code, but compiled and optimized for different |
| * architectures. |
| * |
| * Prior to CUDA 4.0, there was no documented API for constructing and using |
| * fat binary objects by programmers. Starting with CUDA 4.0, fat binary |
| * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc. |
| * More information can be found in the \b nvcc document. |
| * |
| * \param module - Returned module |
| * \param fatCubin - Fat binary to load |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_PTX, |
| * ::CUDA_ERROR_NOT_FOUND, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_NO_BINARY_FOR_GPU, |
| * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, |
| * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetGlobal, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleUnload |
| */ |
| CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); |
| |
| /** |
| * \brief Unloads a module |
| * |
| * Unloads a module \p hmod from the current context. |
| * |
| * \param hmod - Module to unload |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetGlobal, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleLoadFatBinary |
| */ |
| CUresult CUDAAPI cuModuleUnload(CUmodule hmod); |
| |
| /** |
| * \brief Returns a function handle |
| * |
| * Returns in \p *hfunc the handle of the function of name \p name located in |
| * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() |
| * returns ::CUDA_ERROR_NOT_FOUND. |
| * |
| * \param hfunc - Returned function handle |
| * \param hmod - Module to retrieve function from |
| * \param name - Name of function to retrieve |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetGlobal, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleLoadFatBinary, |
| * ::cuModuleUnload |
| */ |
| CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); |
| |
| /** |
| * \brief Returns a global pointer from a module |
| * |
| * Returns in \p *dptr and \p *bytes the base pointer and size of the |
| * global of name \p name located in module \p hmod. If no variable of that name |
| * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both |
| * parameters \p dptr and \p bytes are optional. If one of them is |
| * NULL, it is ignored. |
| * |
| * \param dptr - Returned global device pointer |
| * \param bytes - Returned global size in bytes |
| * \param hmod - Module to retrieve global from |
| * \param name - Name of global to retrieve |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleLoadFatBinary, |
| * ::cuModuleUnload, |
| * ::cudaGetSymbolAddress, |
| * ::cudaGetSymbolSize |
| */ |
| CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); |
| |
| /** |
| * \brief Returns a handle to a texture reference |
| * |
| * Returns in \p *pTexRef the handle of the texture reference of name \p name |
| * in the module \p hmod. If no texture reference of that name exists, |
| * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference |
| * handle should not be destroyed, since it will be destroyed when the module |
| * is unloaded. |
| * |
| * \param pTexRef - Returned texture reference |
| * \param hmod - Module to retrieve texture reference from |
| * \param name - Name of texture reference to retrieve |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetGlobal, |
| * ::cuModuleGetSurfRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleLoadFatBinary, |
| * ::cuModuleUnload, |
| * ::cudaGetTextureReference |
| */ |
| CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); |
| |
| /** |
| * \brief Returns a handle to a surface reference |
| * |
| * Returns in \p *pSurfRef the handle of the surface reference of name \p name |
| * in the module \p hmod. If no surface reference of that name exists, |
| * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. |
| * |
| * \param pSurfRef - Returned surface reference |
| * \param hmod - Module to retrieve surface reference from |
| * \param name - Name of surface reference to retrieve |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuModuleGetFunction, |
| * ::cuModuleGetGlobal, |
| * ::cuModuleGetTexRef, |
| * ::cuModuleLoad, |
| * ::cuModuleLoadData, |
| * ::cuModuleLoadDataEx, |
| * ::cuModuleLoadFatBinary, |
| * ::cuModuleUnload, |
| * ::cudaGetSurfaceReference |
| */ |
| CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); |
| |
| /** |
| * \brief Creates a pending JIT linker invocation. |
| * |
| * If the call is successful, the caller owns the returned CUlinkState, which |
| * should eventually be destroyed with ::cuLinkDestroy. The |
| * device code machine size (32 or 64 bit) will match the calling application. |
| * |
| * Both linker and compiler options may be specified. Compiler options will |
| * be applied to inputs to this linker action which must be compiled from PTX. |
| * The options ::CU_JIT_WALL_TIME, |
| * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES |
| * will accumulate data until the CUlinkState is destroyed. |
| * |
| * \p optionValues must remain valid for the life of the CUlinkState if output |
| * options are used. No other references to inputs are maintained after this |
| * call returns. |
| * |
| * \param numOptions Size of options arrays |
| * \param options Array of linker and compiler options |
| * \param optionValues Array of option values, each cast to void * |
| * \param stateOut On success, this will contain a CUlinkState to specify |
| * and complete this action |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND |
| * \notefnerr |
| * |
| * \sa ::cuLinkAddData, |
| * ::cuLinkAddFile, |
| * ::cuLinkComplete, |
| * ::cuLinkDestroy |
| */ |
| CUresult CUDAAPI |
| cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); |
| |
| /** |
| * \brief Add an input to a pending linker invocation |
| * |
| * Ownership of \p data is retained by the caller. No reference is retained to any |
| * inputs after this call returns. |
| * |
| * This method accepts only compiler options, which are used if the data must |
| * be compiled from PTX, and does not accept any of |
| * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, |
| * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. |
| * |
| * \param state A pending linker action. |
| * \param type The type of the input data. |
| * \param data The input data. PTX must be NULL-terminated. |
| * \param size The length of the input data. |
| * \param name An optional name for this input in log messages. |
| * \param numOptions Size of options. |
| * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). |
| * \param optionValues Array of option values, each cast to void *. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_IMAGE, |
| * ::CUDA_ERROR_INVALID_PTX, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_NO_BINARY_FOR_GPU |
| * |
| * \sa ::cuLinkCreate, |
| * ::cuLinkAddFile, |
| * ::cuLinkComplete, |
| * ::cuLinkDestroy |
| */ |
| CUresult CUDAAPI |
| cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, |
| unsigned int numOptions, CUjit_option *options, void **optionValues); |
| |
| /** |
| * \brief Add a file input to a pending linker invocation |
| * |
| * No reference is retained to any inputs after this call returns. |
| * |
| * This method accepts only compiler options, which are used if the input |
| * must be compiled from PTX, and does not accept any of |
| * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, |
| * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. |
| * |
| * This method is equivalent to invoking ::cuLinkAddData on the contents |
| * of the file. |
| * |
| * \param state A pending linker action |
| * \param type The type of the input data |
| * \param path Path to the input file |
| * \param numOptions Size of options |
| * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) |
| * \param optionValues Array of option values, each cast to void * |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_FILE_NOT_FOUND, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_IMAGE, |
| * ::CUDA_ERROR_INVALID_PTX, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_NO_BINARY_FOR_GPU |
| * |
| * \sa ::cuLinkCreate, |
| * ::cuLinkAddData, |
| * ::cuLinkComplete, |
| * ::cuLinkDestroy |
| */ |
| CUresult CUDAAPI |
| cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, |
| unsigned int numOptions, CUjit_option *options, void **optionValues); |
| |
| /** |
| * \brief Complete a pending linker invocation |
| * |
| * Completes the pending linker action and returns the cubin image for the linked |
| * device code, which can be used with ::cuModuleLoadData. The cubin is owned by |
| * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. |
| * This call does not destroy \p state. |
| * |
| * \param state A pending linker invocation |
| * \param cubinOut On success, this will point to the output image |
| * \param sizeOut Optional parameter to receive the size of the generated image |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * |
| * \sa ::cuLinkCreate, |
| * ::cuLinkAddData, |
| * ::cuLinkAddFile, |
| * ::cuLinkDestroy, |
| * ::cuModuleLoadData |
| */ |
| CUresult CUDAAPI |
| cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); |
| |
| /** |
| * \brief Destroys state for a JIT linker invocation. |
| * |
| * \param state State object for the linker invocation |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * |
| * \sa ::cuLinkCreate |
| */ |
| CUresult CUDAAPI |
| cuLinkDestroy(CUlinkState state); |
| |
| /** @} */ /* END CUDA_MODULE */ |
| |
| |
| /** |
| * \defgroup CUDA_MEM Memory Management |
| * |
| * ___MANBRIEF___ memory management functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the memory management functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Gets free and total memory |
| * |
| * Returns in \p *free and \p *total respectively, the free and total amount of |
| * memory available for allocation by the CUDA context, in bytes. |
| * |
| * \param free - Returned free memory in bytes |
| * \param total - Returned total memory in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemGetInfo |
| */ |
| CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); |
| |
| /** |
| * \brief Allocates device memory |
| * |
| * Allocates \p bytesize bytes of linear memory on the device and returns in |
| * \p *dptr a pointer to the allocated memory. The allocated memory is suitably |
| * aligned for any kind of variable. The memory is not cleared. If \p bytesize |
| * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. |
| * |
| * \param dptr - Returned device pointer |
| * \param bytesize - Requested allocation size in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMalloc |
| */ |
| CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); |
| |
| /** |
| * \brief Allocates pitched device memory |
| * |
| * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on |
| * the device and returns in \p *dptr a pointer to the allocated memory. The |
| * function may pad the allocation to ensure that corresponding pointers in |
| * any given row will continue to meet the alignment requirements for |
| * coalescing as the address is updated from row to row. \p ElementSizeBytes |
| * specifies the size of the largest reads and writes that will be performed |
| * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced |
| * memory transactions are not possible on other data sizes). If |
| * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, |
| * the kernel will run correctly, but possibly at reduced speed. The pitch |
| * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the |
| * allocation. The intended usage of pitch is as a separate parameter of the |
| * allocation, used to compute addresses within the 2D array. Given the row |
| * and column of an array element of type \b T, the address is computed as: |
| * \code |
| T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; |
| * \endcode |
| * |
| * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with |
| * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is |
| * recommended that programmers consider performing pitch allocations using |
| * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is |
| * especially true if the application will be performing 2D memory copies |
| * between different regions of device memory (whether linear memory or CUDA |
| * arrays). |
| * |
| * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed |
| * to match or exceed the alignment requirement for texture binding with |
| * ::cuTexRefSetAddress2D(). |
| * |
| * \param dptr - Returned device pointer |
| * \param pPitch - Returned pitch of allocation in bytes |
| * \param WidthInBytes - Requested allocation width in bytes |
| * \param Height - Requested allocation height in rows |
| * \param ElementSizeBytes - Size of largest reads/writes for range |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMallocPitch |
| */ |
| CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); |
| |
| /** |
| * \brief Frees device memory |
| * |
| * Frees the memory space pointed to by \p dptr, which must have been returned |
| * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). |
| * |
| * \param dptr - Pointer to memory to free |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaFree |
| */ |
| CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); |
| |
| /** |
| * \brief Get information on memory allocations |
| * |
| * Returns the base address in \p *pbase and size in \p *psize of the |
| * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input |
| * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one |
| * of them is NULL, it is ignored. |
| * |
| * \param pbase - Returned base address |
| * \param psize - Returned size of device memory allocation |
| * \param dptr - Device pointer to query |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_NOT_FOUND, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 |
| */ |
| CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); |
| |
| /** |
| * \brief Allocates page-locked host memory |
| * |
| * Allocates \p bytesize bytes of host memory that is page-locked and |
| * accessible to the device. The driver tracks the virtual memory ranges |
| * allocated with this function and automatically accelerates calls to |
| * functions such as ::cuMemcpy(). Since the memory can be accessed directly by |
| * the device, it can be read or written with much higher bandwidth than |
| * pageable memory obtained with functions such as ::malloc(). Allocating |
| * excessive amounts of memory with ::cuMemAllocHost() may degrade system |
| * performance, since it reduces the amount of memory available to the system |
| * for paging. As a result, this function is best used sparingly to allocate |
| * staging areas for data exchange between host and device. |
| * |
| * Note all host memory allocated using ::cuMemAllocHost() will automatically |
| * be immediately accessible to all contexts on all devices which support unified |
| * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). |
| * The device pointer that may be used to access this host memory from those |
| * contexts is always equal to the returned host pointer \p *pp. |
| * See \ref CUDA_UNIFIED for additional details. |
| * |
| * \param pp - Returned host pointer to page-locked memory |
| * \param bytesize - Requested allocation size in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMallocHost |
| */ |
| CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); |
| |
| /** |
| * \brief Frees page-locked host memory |
| * |
| * Frees the memory space pointed to by \p p, which must have been returned by |
| * a previous call to ::cuMemAllocHost(). |
| * |
| * \param p - Pointer to memory to free |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaFreeHost |
| */ |
| CUresult CUDAAPI cuMemFreeHost(void *p); |
| |
| /** |
| * \brief Allocates page-locked host memory |
| * |
| * Allocates \p bytesize bytes of host memory that is page-locked and accessible |
| * to the device. The driver tracks the virtual memory ranges allocated with |
| * this function and automatically accelerates calls to functions such as |
| * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, |
| * it can be read or written with much higher bandwidth than pageable memory |
| * obtained with functions such as ::malloc(). Allocating excessive amounts of |
| * pinned memory may degrade system performance, since it reduces the amount |
| * of memory available to the system for paging. As a result, this function is |
| * best used sparingly to allocate staging areas for data exchange between |
| * host and device. |
| * |
| * The \p Flags parameter enables different options to be specified that |
| * affect the allocation, as follows. |
| * |
| * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be |
| * considered as pinned memory by all CUDA contexts, not just the one that |
| * performed the allocation. |
| * |
| * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address |
| * space. The device pointer to the memory may be obtained by calling |
| * ::cuMemHostGetDevicePointer(). |
| * |
| * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined |
| * (WC). WC memory can be transferred across the PCI Express bus more |
| * quickly on some system configurations, but cannot be read efficiently by |
| * most CPUs. WC memory is a good option for buffers that will be written by |
| * the CPU and read by the GPU via mapped pinned memory or host->device |
| * transfers. |
| * |
| * All of these flags are orthogonal to one another: a developer may allocate |
| * memory that is portable, mapped and/or write-combined with no restrictions. |
| * |
| * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in |
| * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect. |
| * |
| * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for |
| * devices that do not support mapped pinned memory. The failure is deferred |
| * to ::cuMemHostGetDevicePointer() because the memory may be mapped into |
| * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. |
| * |
| * The memory allocated by this function must be freed with ::cuMemFreeHost(). |
| * |
| * Note all host memory allocated using ::cuMemHostAlloc() will automatically |
| * be immediately accessible to all contexts on all devices which support unified |
| * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). |
| * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer |
| * that may be used to access this host memory from those contexts is always equal |
| * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED |
| * is specified, then the function ::cuMemHostGetDevicePointer() must be used |
| * to query the device pointer, even if the context supports unified addressing. |
| * See \ref CUDA_UNIFIED for additional details. |
| * |
| * \param pp - Returned host pointer to page-locked memory |
| * \param bytesize - Requested allocation size in bytes |
| * \param Flags - Flags for allocation request |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaHostAlloc |
| */ |
| CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); |
| |
| /** |
| * \brief Passes back device pointer of mapped pinned memory |
| * |
| * Passes back the device pointer \p pdptr corresponding to the mapped, pinned |
| * host buffer \p p allocated by ::cuMemHostAlloc. |
| * |
| * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP |
| * flag was not specified at the time the memory was allocated, or if the |
| * function is called on a GPU that does not support mapped pinned memory. |
| * |
| * For devices that have a non-zero value for the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory |
| * can also be accessed from the device using the host pointer \p p. |
| * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not |
| * match the original host pointer \p p and depends on the devices visible to the |
| * application. If all devices visible to the application have a non-zero value for the |
| * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() |
| * will match the original pointer \p p. If any device visible to the application |
| * has a zero value for the device attribute, the device pointer returned by |
| * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, |
| * but it will be suitable for use on all devices provided Unified Virtual Addressing |
| * is enabled. In such systems, it is valid to access the memory using either pointer |
| * on devices that have a non-zero value for the device attribute. Note however that |
| * such devices should access the memory using only one of the two pointers and not both. |
| * |
| * \p Flags is reserved for future releases. For now, it must be set to 0. |
| * |
| * \param pdptr - Returned device pointer |
| * \param p - Host pointer |
| * \param Flags - Options (must be 0) |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaHostGetDevicePointer |
| */ |
| CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); |
| |
| /** |
| * \brief Passes back flags that were used for a pinned allocation |
| * |
| * Passes back the flags \p pFlags that were specified when allocating |
| * the pinned host buffer \p p allocated by ::cuMemHostAlloc. |
| * |
| * ::cuMemHostGetFlags() will fail if the pointer does not reside in |
| * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). |
| * |
| * \param pFlags - Returned flags word |
| * \param p - Host pointer |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemAllocHost, |
| * ::cuMemHostAlloc, |
| * ::cudaHostGetFlags |
| */ |
| CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); |
| |
| /** |
| * \brief Allocates memory that will be automatically managed by the Unified Memory system |
| * |
| * Allocates \p bytesize bytes of managed memory on the device and returns in |
| * \p *dptr a pointer to the allocated memory. If the device doesn't support |
| * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support |
| * for managed memory can be queried using the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably |
| * aligned for any kind of variable. The memory is not cleared. If \p bytesize |
| * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer |
| * is valid on the CPU and on all GPUs in the system that support managed memory. |
| * All accesses to this pointer must obey the Unified Memory programming model. |
| * |
| * \p flags specifies the default stream association for this allocation. |
| * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If |
| * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from |
| * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the |
| * allocation should not be accessed from devices that have a zero value for the |
| * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to |
| * ::cuStreamAttachMemAsync will be required to enable access on such devices. |
| * |
| * If the association is later changed via ::cuStreamAttachMemAsync to |
| * a single stream, the default association as specified during ::cuMemAllocManaged |
| * is restored when that stream is destroyed. For __managed__ variables, the |
| * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a |
| * stream is an asynchronous operation, and as a result, the change to default |
| * association won't happen until all work in the stream has completed. |
| * |
| * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. |
| * |
| * Device memory oversubscription is possible for GPUs that have a non-zero value for the |
| * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on |
| * such GPUs may be evicted from device memory to host memory at any time by the Unified |
| * Memory driver in order to make room for other allocations. |
| * |
| * In a multi-GPU system where all GPUs have a non-zero value for the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this |
| * API returns and instead may be populated on access. In such systems, managed memory can |
| * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to |
| * maintain data locality and prevent excessive page faults to the extent possible. The application |
| * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application |
| * can also explicitly migrate memory to a desired processor's memory via |
| * ::cuMemPrefetchAsync. |
| * |
| * In a multi-GPU system where all of the GPUs have a zero value for the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support |
| * with each other, the physical storage for managed memory is created on the GPU which is active |
| * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced |
| * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate |
| * memory among such GPUs. |
| * |
| * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and |
| * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS |
| * is zero for at least one of those GPUs, the location chosen for physical storage of managed |
| * memory is system-dependent. |
| * - On Linux, the location chosen will be device memory as long as the current set of active |
| * contexts are on devices that either have peer-to-peer support with each other or have a |
| * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
| * If there is an active context on a GPU that does not have a non-zero value for that device |
| * attribute and it does not have peer-to-peer support with the other devices that have active |
| * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. |
| * Note that this means that managed memory that is located in device memory is migrated to |
| * host memory if a new context is created on a GPU that doesn't have a non-zero value for |
| * the device attribute and does not support peer-to-peer with at least one of the other devices |
| * that has an active context. This in turn implies that context creation may fail if there is |
| * insufficient host memory to migrate all managed allocations. |
| * - On Windows, the physical storage is always created in 'zero-copy' or host memory. |
| * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these |
| * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to |
| * restrict CUDA to only use those GPUs that have peer-to-peer support. |
| * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a |
| * non-zero value to force the driver to always use device memory for physical storage. |
| * When this environment variable is set to a non-zero value, all contexts created in |
| * that process on devices that support managed memory have to be peer-to-peer compatible |
| * with each other. Context creation will fail if a context is created on a device that |
| * supports managed memory and is not peer-to-peer compatible with any of the other |
| * managed memory supporting devices on which contexts were previously created, even if |
| * those contexts have been destroyed. These environment variables are described |
| * in the CUDA programming guide under the "CUDA environment variables" section. |
| * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. |
| * |
| * \param dptr - Returned device pointer |
| * \param bytesize - Requested allocation size in bytes |
| * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_NOT_SUPPORTED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, |
| * ::cudaMallocManaged |
| */ |
| CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); |
| |
| /** |
| * \brief Returns a handle to a compute device |
| * |
| * Returns in \p *dev a device handle given a PCI bus ID string. |
| * |
| * \param dev - Returned device handle |
| * |
| * \param pciBusId - String in one of the following forms: |
| * [domain]:[bus]:[device].[function] |
| * [domain]:[bus]:[device] |
| * [bus]:[device].[function] |
| * where \p domain, \p bus, \p device, and \p function are all hexadecimal values |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGet, |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetPCIBusId, |
| * ::cudaDeviceGetByPCIBusId |
| */ |
| CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); |
| |
| /** |
| * \brief Returns a PCI Bus Id string for the device |
| * |
| * Returns an ASCII string identifying the device \p dev in the NULL-terminated |
| * string pointed to by \p pciBusId. \p len specifies the maximum length of the |
| * string that may be returned. |
| * |
| * \param pciBusId - Returned identifier string for the device in the following format |
| * [domain]:[bus]:[device].[function] |
| * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. |
| * pciBusId should be large enough to store 13 characters including the NULL-terminator. |
| * |
| * \param len - Maximum length of string to store in \p pciBusId |
| * |
| * \param dev - Device to get identifier string for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceGet, |
| * ::cuDeviceGetAttribute, |
| * ::cuDeviceGetByPCIBusId, |
| * ::cudaDeviceGetPCIBusId |
| */ |
| CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); |
| |
| /** |
| * \brief Gets an interprocess handle for a previously allocated event |
| * |
| * Takes as input a previously allocated event. This event must have been |
| * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING |
| * flags set. This opaque handle may be copied into other processes and |
| * opened with ::cuIpcOpenEventHandle to allow efficient hardware |
| * synchronization between GPU work in different processes. |
| * |
| * After the event has been opened in the importing process, |
| * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and |
| * ::cuEventQuery may be used in either process. Performing operations |
| * on the imported event after the exported event has been freed |
| * with ::cuEventDestroy will result in undefined behavior. |
| * |
| * IPC functionality is restricted to devices with support for unified |
| * addressing on Linux and Windows operating systems. |
| * IPC functionality on Windows is restricted to GPUs in TCC mode. |
| * |
| * \param pHandle - Pointer to a user allocated CUipcEventHandle |
| * in which to return the opaque event handle |
| * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and |
| * ::CU_EVENT_DISABLE_TIMING flags. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_MAP_FAILED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuEventCreate, |
| * ::cuEventDestroy, |
| * ::cuEventSynchronize, |
| * ::cuEventQuery, |
| * ::cuStreamWaitEvent, |
| * ::cuIpcOpenEventHandle, |
| * ::cuIpcGetMemHandle, |
| * ::cuIpcOpenMemHandle, |
| * ::cuIpcCloseMemHandle, |
| * ::cudaIpcGetEventHandle |
| */ |
| CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); |
| |
| /** |
| * \brief Opens an interprocess event handle for use in the current process |
| * |
| * Opens an interprocess event handle exported from another process with |
| * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like |
| * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. |
| * This event must be freed with ::cuEventDestroy. |
| * |
| * Performing operations on the imported event after the exported event has |
| * been freed with ::cuEventDestroy will result in undefined behavior. |
| * |
| * IPC functionality is restricted to devices with support for unified |
| * addressing on Linux and Windows operating systems. |
| * IPC functionality on Windows is restricted to GPUs in TCC mode. |
| * |
| * \param phEvent - Returns the imported event |
| * \param handle - Interprocess handle to open |
| * |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_MAP_FAILED, |
| * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuEventCreate, |
| * ::cuEventDestroy, |
| * ::cuEventSynchronize, |
| * ::cuEventQuery, |
| * ::cuStreamWaitEvent, |
| * ::cuIpcGetEventHandle, |
| * ::cuIpcGetMemHandle, |
| * ::cuIpcOpenMemHandle, |
| * ::cuIpcCloseMemHandle, |
| * ::cudaIpcOpenEventHandle |
| */ |
| CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); |
| |
| /** |
| * \brief Gets an interprocess memory handle for an existing device memory |
| * allocation |
| * |
| * Takes a pointer to the base of an existing device memory allocation created |
| * with ::cuMemAlloc and exports it for use in another process. This is a |
| * lightweight operation and may be called multiple times on an allocation |
| * without adverse effects. |
| * |
| * If a region of memory is freed with ::cuMemFree and a subsequent call |
| * to ::cuMemAlloc returns memory with the same device address, |
| * ::cuIpcGetMemHandle will return a unique handle for the |
| * new memory. |
| * |
| * IPC functionality is restricted to devices with support for unified |
| * addressing on Linux and Windows operating systems. |
| * IPC functionality on Windows is restricted to GPUs in TCC mode. |
| * |
| * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return |
| * the handle in. |
| * \param dptr - Base pointer to previously allocated device memory |
| * |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_MAP_FAILED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuMemAlloc, |
| * ::cuMemFree, |
| * ::cuIpcGetEventHandle, |
| * ::cuIpcOpenEventHandle, |
| * ::cuIpcOpenMemHandle, |
| * ::cuIpcCloseMemHandle, |
| * ::cudaIpcGetMemHandle |
| */ |
| CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); |
| |
| /** |
| * \brief Opens an interprocess memory handle exported from another process |
| * and returns a device pointer usable in the local process. |
| * |
| * Maps memory exported from another process with ::cuIpcGetMemHandle into |
| * the current device address space. For contexts on different devices |
| * ::cuIpcOpenMemHandle can attempt to enable peer access between the |
| * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is |
| * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. |
| * ::cuDeviceCanAccessPeer can determine if a mapping is possible. |
| * |
| * ::cuIpcOpenMemHandle can open handles to devices that may not be visible |
| * in the process calling the API. |
| * |
| * Contexts that may open ::CUipcMemHandles are restricted in the following way. |
| * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened |
| * by one ::CUcontext per ::CUdevice per other process. |
| * |
| * Memory returned from ::cuIpcOpenMemHandle must be freed with |
| * ::cuIpcCloseMemHandle. |
| * |
| * Calling ::cuMemFree on an exported memory region before calling |
| * ::cuIpcCloseMemHandle in the importing context will result in undefined |
| * behavior. |
| * |
| * IPC functionality is restricted to devices with support for unified |
| * addressing on Linux and Windows operating systems. |
| * IPC functionality on Windows is restricted to GPUs in TCC mode. |
| * |
| * \param pdptr - Returned device pointer |
| * \param handle - ::CUipcMemHandle to open |
| * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS |
| * |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_MAP_FAILED, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_TOO_MANY_PEERS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \note No guarantees are made about the address returned in \p *pdptr. |
| * In particular, multiple processes may not receive the same address for the same \p handle. |
| * |
| * \sa |
| * ::cuMemAlloc, |
| * ::cuMemFree, |
| * ::cuIpcGetEventHandle, |
| * ::cuIpcOpenEventHandle, |
| * ::cuIpcGetMemHandle, |
| * ::cuIpcCloseMemHandle, |
| * ::cuCtxEnablePeerAccess, |
| * ::cuDeviceCanAccessPeer, |
| * ::cudaIpcOpenMemHandle |
| */ |
| CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); |
| |
| /** |
| * \brief Close memory mapped with ::cuIpcOpenMemHandle |
| * |
| * Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation |
| * in the exporting process as well as imported mappings in other processes |
| * will be unaffected. |
| * |
| * Any resources used to enable peer access will be freed if this is the |
| * last mapping using them. |
| * |
| * IPC functionality is restricted to devices with support for unified |
| * addressing on Linux and Windows operating systems. |
| * IPC functionality on Windows is restricted to GPUs in TCC mode. |
| * |
| * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle |
| * |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_MAP_FAILED, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \sa |
| * ::cuMemAlloc, |
| * ::cuMemFree, |
| * ::cuIpcGetEventHandle, |
| * ::cuIpcOpenEventHandle, |
| * ::cuIpcGetMemHandle, |
| * ::cuIpcOpenMemHandle, |
| * ::cudaIpcCloseMemHandle |
| */ |
| CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); |
| |
| /** |
| * \brief Registers an existing host memory range for use by CUDA |
| * |
| * Page-locks the memory range specified by \p p and \p bytesize and maps it |
| * for the device(s) as specified by \p Flags. This memory range also is added |
| * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate |
| * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed |
| * directly by the device, it can be read or written with much higher bandwidth |
| * than pageable memory that has not been registered. Page-locking excessive |
| * amounts of memory may degrade system performance, since it reduces the amount |
| * of memory available to the system for paging. As a result, this function is |
| * best used sparingly to register staging areas for data exchange between |
| * host and device. |
| * |
| * This function has limited support on Mac OS X. OS 10.7 or higher is required. |
| * |
| * The \p Flags parameter enables different options to be specified that |
| * affect the allocation, as follows. |
| * |
| * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be |
| * considered as pinned memory by all CUDA contexts, not just the one that |
| * performed the allocation. |
| * |
| * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address |
| * space. The device pointer to the memory may be obtained by calling |
| * ::cuMemHostGetDevicePointer(). |
| * |
| * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some |
| * I/O memory space, e.g. the PCI Express resource of a 3rd party device. |
| * |
| * All of these flags are orthogonal to one another: a developer may page-lock |
| * memory that is portable or mapped with no restrictions. |
| * |
| * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in |
| * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect. |
| * |
| * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for |
| * devices that do not support mapped pinned memory. The failure is deferred |
| * to ::cuMemHostGetDevicePointer() because the memory may be mapped into |
| * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. |
| * |
| * For devices that have a non-zero value for the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory |
| * can also be accessed from the device using the host pointer \p p. |
| * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not |
| * match the original host pointer \p p and depends on the devices visible to the |
| * application. If all devices visible to the application have a non-zero value for the |
| * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() |
| * will match the original pointer \p p. If any device visible to the application |
| * has a zero value for the device attribute, the device pointer returned by |
| * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, |
| * but it will be suitable for use on all devices provided Unified Virtual Addressing |
| * is enabled. In such systems, it is valid to access the memory using either pointer |
| * on devices that have a non-zero value for the device attribute. Note however that |
| * such devices should access the memory using only one of the two pointers and not both. |
| * |
| * The memory page-locked by this function must be unregistered with |
| * ::cuMemHostUnregister(). |
| * |
| * \param p - Host pointer to memory to page-lock |
| * \param bytesize - Size in bytes of the address range to page-lock |
| * \param Flags - Flags for allocation request |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemHostUnregister, |
| * ::cuMemHostGetFlags, |
| * ::cuMemHostGetDevicePointer, |
| * ::cudaHostRegister |
| */ |
| CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); |
| |
| /** |
| * \brief Unregisters a memory range that was registered with cuMemHostRegister. |
| * |
| * Unmaps the memory range whose base address is specified by \p p, and makes |
| * it pageable again. |
| * |
| * The base address must be the same one specified to ::cuMemHostRegister(). |
| * |
| * \param p - Host pointer to memory to unregister |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemHostRegister, |
| * ::cudaHostUnregister |
| */ |
| CUresult CUDAAPI cuMemHostUnregister(void *p); |
| |
| /** |
| * \brief Copies memory |
| * |
| * Copies data between two pointers. |
| * \p dst and \p src are base pointers of the destination and source, respectively. |
| * \p ByteCount specifies the number of bytes to copy. |
| * Note that this function infers the type of the transfer (host to host, host to |
| * device, device to device, or device to host) from the pointer values. This |
| * function is only allowed in contexts which support unified addressing. |
| * |
| * \param dst - Destination unified virtual address space pointer |
| * \param src - Source unified virtual address space pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpy, |
| * ::cudaMemcpyToSymbol, |
| * ::cudaMemcpyFromSymbol |
| */ |
| CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); |
| |
| /** |
| * \brief Copies device memory between two contexts |
| * |
| * Copies from device memory in one context to device memory in another |
| * context. \p dstDevice is the base device pointer of the destination memory |
| * and \p dstContext is the destination context. \p srcDevice is the base |
| * device pointer of the source memory and \p srcContext is the source context. |
| * \p ByteCount specifies the number of bytes to copy. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param dstContext - Destination context |
| * \param srcDevice - Source device pointer |
| * \param srcContext - Source context |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, |
| * ::cuMemcpy3DPeerAsync, |
| * ::cudaMemcpyPeer |
| */ |
| CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Host to Device |
| * |
| * Copies from host memory to device memory. \p dstDevice and \p srcHost are |
| * the base addresses of the destination and source, respectively. \p ByteCount |
| * specifies the number of bytes to copy. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param srcHost - Source host pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpy, |
| * ::cudaMemcpyToSymbol |
| */ |
| CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Device to Host |
| * |
| * Copies from device to host memory. \p dstHost and \p srcDevice specify the |
| * base pointers of the destination and source, respectively. \p ByteCount |
| * specifies the number of bytes to copy. |
| * |
| * \param dstHost - Destination host pointer |
| * \param srcDevice - Source device pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpy, |
| * ::cudaMemcpyFromSymbol |
| */ |
| CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Device to Device |
| * |
| * Copies from device memory to device memory. \p dstDevice and \p srcDevice |
| * are the base pointers of the destination and source, respectively. |
| * \p ByteCount specifies the number of bytes to copy. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param srcDevice - Source device pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpy, |
| * ::cudaMemcpyToSymbol, |
| * ::cudaMemcpyFromSymbol |
| */ |
| CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Device to Array |
| * |
| * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset |
| * specify the CUDA array handle and starting offset in bytes of the destination data. |
| * \p srcDevice specifies the base pointer of the source. \p ByteCount |
| * specifies the number of bytes to copy. |
| * |
| * \param dstArray - Destination array |
| * \param dstOffset - Offset in bytes of destination array |
| * \param srcDevice - Source device pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpyToArray |
| */ |
| CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Array to Device |
| * |
| * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the |
| * base pointer of the destination and must be naturally aligned with the CUDA |
| * array elements. \p srcArray and \p srcOffset specify the CUDA array handle |
| * and the offset in bytes into the array where the copy is to begin. |
| * \p ByteCount specifies the number of bytes to copy and must be evenly |
| * divisible by the array element size. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param srcArray - Source array |
| * \param srcOffset - Offset in bytes of source array |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpyFromArray |
| */ |
| CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Host to Array |
| * |
| * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset |
| * specify the CUDA array handle and starting offset in bytes of the destination |
| * data. \p srcHost specifies the base address of the source. \p ByteCount specifies |
| * the number of bytes to copy. |
| * |
| * \param dstArray - Destination array |
| * \param dstOffset - Offset in bytes of destination array |
| * \param srcHost - Source host pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpyToArray |
| */ |
| CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Array to Host |
| * |
| * Copies from a 1D CUDA array to host memory. \p dstHost specifies the base |
| * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA |
| * array handle and starting offset in bytes of the source data. |
| * \p ByteCount specifies the number of bytes to copy. |
| * |
| * \param dstHost - Destination host pointer |
| * \param srcArray - Source array |
| * \param srcOffset - Offset in bytes of source array |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpyFromArray |
| */ |
| CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory from Array to Array |
| * |
| * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray |
| * specify the handles of the destination and source CUDA arrays for the copy, |
| * respectively. \p dstOffset and \p srcOffset specify the destination and |
| * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of |
| * bytes to be copied. The elements in the CUDA arrays need not be the same |
| * format, but the elements must be the same size; and \p ByteCount must be |
| * evenly divisible by that size. |
| * |
| * \param dstArray - Destination array |
| * \param dstOffset - Offset in bytes of destination array |
| * \param srcArray - Source array |
| * \param srcOffset - Offset in bytes of source array |
| * \param ByteCount - Size of memory copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpyArrayToArray |
| */ |
| CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
| |
| /** |
| * \brief Copies memory for 2D arrays |
| * |
| * Performs a 2D memory copy according to the parameters specified in \p pCopy. |
| * The ::CUDA_MEMCPY2D structure is defined as: |
| * |
| * \code |
| typedef struct CUDA_MEMCPY2D_st { |
| unsigned int srcXInBytes, srcY; |
| CUmemorytype srcMemoryType; |
| const void *srcHost; |
| CUdeviceptr srcDevice; |
| CUarray srcArray; |
| unsigned int srcPitch; |
| |
| unsigned int dstXInBytes, dstY; |
| CUmemorytype dstMemoryType; |
| void *dstHost; |
| CUdeviceptr dstDevice; |
| CUarray dstArray; |
| unsigned int dstPitch; |
| |
| unsigned int WidthInBytes; |
| unsigned int Height; |
| } CUDA_MEMCPY2D; |
| * \endcode |
| * where: |
| * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
| * source and destination, respectively; ::CUmemorytype_enum is defined as: |
| * |
| * \code |
| typedef enum CUmemorytype_enum { |
| CU_MEMORYTYPE_HOST = 0x01, |
| CU_MEMORYTYPE_DEVICE = 0x02, |
| CU_MEMORYTYPE_ARRAY = 0x03, |
| CU_MEMORYTYPE_UNIFIED = 0x04 |
| } CUmemorytype; |
| * \endcode |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
| * specify the (unified virtual address space) base address of the source data |
| * and the bytes per row to apply. ::srcArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch |
| * specify the (host) base address of the source data and the bytes per row to |
| * apply. ::srcArray is ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch |
| * specify the (device) base address of the source data and the bytes per row |
| * to apply. ::srcArray is ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
| * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are |
| * ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch |
| * specify the (host) base address of the destination data and the bytes per |
| * row to apply. ::dstArray is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
| * specify the (unified virtual address space) base address of the destination |
| * data and the bytes per row to apply. ::dstArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch |
| * specify the (device) base address of the destination data and the bytes per |
| * row to apply. ::dstArray is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
| * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are |
| * ignored. |
| * |
| * - ::srcXInBytes and ::srcY specify the base address of the source data for |
| * the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::dstXInBytes and ::dstY specify the base address of the destination data |
| * for the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::WidthInBytes and ::Height specify the width (in bytes) and height of |
| * the 2D copy being performed. |
| * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
| * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
| * ::WidthInBytes + ::dstXInBytes. |
| * |
| * \par |
| * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum |
| * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back |
| * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies |
| * (device to device, CUDA array to device, CUDA array to CUDA array), |
| * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). |
| * ::cuMemcpy2DUnaligned() does not have this restriction, but may run |
| * significantly slower in the cases where ::cuMemcpy2D() would have returned |
| * an error code. |
| * |
| * \param pCopy - Parameters for the memory copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpy2D, |
| * ::cudaMemcpy2DToArray, |
| * ::cudaMemcpy2DFromArray |
| */ |
| CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); |
| |
| /** |
| * \brief Copies memory for 2D arrays |
| * |
| * Performs a 2D memory copy according to the parameters specified in \p pCopy. |
| * The ::CUDA_MEMCPY2D structure is defined as: |
| * |
| * \code |
| typedef struct CUDA_MEMCPY2D_st { |
| unsigned int srcXInBytes, srcY; |
| CUmemorytype srcMemoryType; |
| const void *srcHost; |
| CUdeviceptr srcDevice; |
| CUarray srcArray; |
| unsigned int srcPitch; |
| unsigned int dstXInBytes, dstY; |
| CUmemorytype dstMemoryType; |
| void *dstHost; |
| CUdeviceptr dstDevice; |
| CUarray dstArray; |
| unsigned int dstPitch; |
| unsigned int WidthInBytes; |
| unsigned int Height; |
| } CUDA_MEMCPY2D; |
| * \endcode |
| * where: |
| * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
| * source and destination, respectively; ::CUmemorytype_enum is defined as: |
| * |
| * \code |
| typedef enum CUmemorytype_enum { |
| CU_MEMORYTYPE_HOST = 0x01, |
| CU_MEMORYTYPE_DEVICE = 0x02, |
| CU_MEMORYTYPE_ARRAY = 0x03, |
| CU_MEMORYTYPE_UNIFIED = 0x04 |
| } CUmemorytype; |
| * \endcode |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
| * specify the (unified virtual address space) base address of the source data |
| * and the bytes per row to apply. ::srcArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch |
| * specify the (host) base address of the source data and the bytes per row to |
| * apply. ::srcArray is ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch |
| * specify the (device) base address of the source data and the bytes per row |
| * to apply. ::srcArray is ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
| * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are |
| * ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
| * specify the (unified virtual address space) base address of the destination |
| * data and the bytes per row to apply. ::dstArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch |
| * specify the (host) base address of the destination data and the bytes per |
| * row to apply. ::dstArray is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch |
| * specify the (device) base address of the destination data and the bytes per |
| * row to apply. ::dstArray is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
| * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are |
| * ignored. |
| * |
| * - ::srcXInBytes and ::srcY specify the base address of the source data for |
| * the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::dstXInBytes and ::dstY specify the base address of the destination data |
| * for the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::WidthInBytes and ::Height specify the width (in bytes) and height of |
| * the 2D copy being performed. |
| * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
| * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
| * ::WidthInBytes + ::dstXInBytes. |
| * |
| * \par |
| * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum |
| * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back |
| * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies |
| * (device to device, CUDA array to device, CUDA array to CUDA array), |
| * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). |
| * ::cuMemcpy2DUnaligned() does not have this restriction, but may run |
| * significantly slower in the cases where ::cuMemcpy2D() would have returned |
| * an error code. |
| * |
| * \param pCopy - Parameters for the memory copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpy2D, |
| * ::cudaMemcpy2DToArray, |
| * ::cudaMemcpy2DFromArray |
| */ |
| CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); |
| |
| /** |
| * \brief Copies memory for 3D arrays |
| * |
| * Performs a 3D memory copy according to the parameters specified in |
| * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: |
| * |
| * \code |
| typedef struct CUDA_MEMCPY3D_st { |
| |
| unsigned int srcXInBytes, srcY, srcZ; |
| unsigned int srcLOD; |
| CUmemorytype srcMemoryType; |
| const void *srcHost; |
| CUdeviceptr srcDevice; |
| CUarray srcArray; |
| unsigned int srcPitch; // ignored when src is array |
| unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 |
| |
| unsigned int dstXInBytes, dstY, dstZ; |
| unsigned int dstLOD; |
| CUmemorytype dstMemoryType; |
| void *dstHost; |
| CUdeviceptr dstDevice; |
| CUarray dstArray; |
| unsigned int dstPitch; // ignored when dst is array |
| unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 |
| |
| unsigned int WidthInBytes; |
| unsigned int Height; |
| unsigned int Depth; |
| } CUDA_MEMCPY3D; |
| * \endcode |
| * where: |
| * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
| * source and destination, respectively; ::CUmemorytype_enum is defined as: |
| * |
| * \code |
| typedef enum CUmemorytype_enum { |
| CU_MEMORYTYPE_HOST = 0x01, |
| CU_MEMORYTYPE_DEVICE = 0x02, |
| CU_MEMORYTYPE_ARRAY = 0x03, |
| CU_MEMORYTYPE_UNIFIED = 0x04 |
| } CUmemorytype; |
| * \endcode |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
| * specify the (unified virtual address space) base address of the source data |
| * and the bytes per row to apply. ::srcArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and |
| * ::srcHeight specify the (host) base address of the source data, the bytes |
| * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
| * ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and |
| * ::srcHeight specify the (device) base address of the source data, the bytes |
| * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
| * ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
| * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and |
| * ::srcHeight are ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
| * specify the (unified virtual address space) base address of the destination |
| * data and the bytes per row to apply. ::dstArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and |
| * ::dstHeight specify the (host) base address of the destination data, the |
| * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray |
| * is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and |
| * ::dstHeight specify the (device) base address of the destination data, the |
| * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray |
| * is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
| * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and |
| * ::dstHeight are ignored. |
| * |
| * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source |
| * data for the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the |
| * destination data for the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height |
| * and depth of the 3D copy being performed. |
| * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
| * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
| * ::WidthInBytes + ::dstXInBytes. |
| * - If specified, ::srcHeight must be greater than or equal to ::Height + |
| * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. |
| * |
| * \par |
| * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum |
| * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). |
| * |
| * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be |
| * set to 0. |
| * |
| * \param pCopy - Parameters for the memory copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMemcpy3D |
| */ |
| CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); |
| |
| /** |
| * \brief Copies memory between contexts |
| * |
| * Performs a 3D memory copy according to the parameters specified in |
| * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure |
| * for documentation of its parameters. |
| * |
| * \param pCopy - Parameters for the memory copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, |
| * ::cuMemcpy3DPeerAsync, |
| * ::cudaMemcpy3DPeer |
| */ |
| CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); |
| |
| /** |
| * \brief Copies memory asynchronously |
| * |
| * Copies data between two pointers. |
| * \p dst and \p src are base pointers of the destination and source, respectively. |
| * \p ByteCount specifies the number of bytes to copy. |
| * Note that this function infers the type of the transfer (host to host, host to |
| * device, device to device, or device to host) from the pointer values. This |
| * function is only allowed in contexts which support unified addressing. |
| * |
| * \param dst - Destination unified virtual address space pointer |
| * \param src - Source unified virtual address space pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpyAsync, |
| * ::cudaMemcpyToSymbolAsync, |
| * ::cudaMemcpyFromSymbolAsync |
| */ |
| CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); |
| |
| /** |
| * \brief Copies device memory between two contexts asynchronously. |
| * |
| * Copies from device memory in one context to device memory in another |
| * context. \p dstDevice is the base device pointer of the destination memory |
| * and \p dstContext is the destination context. \p srcDevice is the base |
| * device pointer of the source memory and \p srcContext is the source context. |
| * \p ByteCount specifies the number of bytes to copy. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param dstContext - Destination context |
| * \param srcDevice - Source device pointer |
| * \param srcContext - Source context |
| * \param ByteCount - Size of memory copy in bytes |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpy3DPeerAsync, |
| * ::cudaMemcpyPeerAsync |
| */ |
| CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); |
| |
| /** |
| * \brief Copies memory from Host to Device |
| * |
| * Copies from host memory to device memory. \p dstDevice and \p srcHost are |
| * the base addresses of the destination and source, respectively. \p ByteCount |
| * specifies the number of bytes to copy. \p hStream identifies the stream the |
| * copy is issued to. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param srcHost - Source host pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpyAsync, |
| * ::cudaMemcpyToSymbolAsync |
| */ |
| CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); |
| |
| /** |
| * \brief Copies memory from Device to Host |
| * |
| * Copies from device to host memory. \p dstHost and \p srcDevice specify the |
| * base pointers of the destination and source, respectively. \p ByteCount |
| * specifies the number of bytes to copy. \p hStream identifies the stream the |
| * copy is issued to. |
| * |
| * \param dstHost - Destination host pointer |
| * \param srcDevice - Source device pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpyAsync, |
| * ::cudaMemcpyFromSymbolAsync |
| */ |
| CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
| |
| /** |
| * \brief Copies memory from Device to Device |
| * |
| * Copies from device memory to device memory. \p dstDevice and \p srcDevice |
| * are the base pointers of the destination and source, respectively. |
| * \p ByteCount specifies the number of bytes to copy. \p hStream identifies |
| * the stream the copy is issued to. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param srcDevice - Source device pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpyAsync, |
| * ::cudaMemcpyToSymbolAsync, |
| * ::cudaMemcpyFromSymbolAsync |
| */ |
| CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
| |
| /** |
| * \brief Copies memory from Host to Array |
| * |
| * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset |
| * specify the CUDA array handle and starting offset in bytes of the |
| * destination data. \p srcHost specifies the base address of the source. |
| * \p ByteCount specifies the number of bytes to copy. \p hStream specifies |
| * the stream identifier for the copy. |
| * |
| * \param dstArray - Destination array |
| * \param dstOffset - Offset in bytes of destination array |
| * \param srcHost - Source host pointer |
| * \param ByteCount - Size of memory copy in bytes |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpyToArrayAsync |
| */ |
| CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); |
| |
| /** |
| * \brief Copies memory from Array to Host |
| * |
| * Copies from a 1D CUDA array to host memory. \p dstHost specifies the base |
| * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA |
| * array handle and starting offset in bytes of the source data. |
| * \p ByteCount specifies the number of bytes to copy. \p hStream specifies |
| * the stream identifier for the copy. |
| * |
| * \param dstHost - Destination host pointer |
| * \param srcArray - Source array |
| * \param srcOffset - Offset in bytes of source array |
| * \param ByteCount - Size of memory copy in bytes |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * \note_memcpy |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpyFromArrayAsync |
| */ |
| CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); |
| |
| /** |
| * \brief Copies memory for 2D arrays |
| * |
| * Perform a 2D memory copy according to the parameters specified in \p pCopy. |
| * The ::CUDA_MEMCPY2D structure is defined as: |
| * |
| * \code |
| typedef struct CUDA_MEMCPY2D_st { |
| unsigned int srcXInBytes, srcY; |
| CUmemorytype srcMemoryType; |
| const void *srcHost; |
| CUdeviceptr srcDevice; |
| CUarray srcArray; |
| unsigned int srcPitch; |
| unsigned int dstXInBytes, dstY; |
| CUmemorytype dstMemoryType; |
| void *dstHost; |
| CUdeviceptr dstDevice; |
| CUarray dstArray; |
| unsigned int dstPitch; |
| unsigned int WidthInBytes; |
| unsigned int Height; |
| } CUDA_MEMCPY2D; |
| * \endcode |
| * where: |
| * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
| * source and destination, respectively; ::CUmemorytype_enum is defined as: |
| * |
| * \code |
| typedef enum CUmemorytype_enum { |
| CU_MEMORYTYPE_HOST = 0x01, |
| CU_MEMORYTYPE_DEVICE = 0x02, |
| CU_MEMORYTYPE_ARRAY = 0x03, |
| CU_MEMORYTYPE_UNIFIED = 0x04 |
| } CUmemorytype; |
| * \endcode |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch |
| * specify the (host) base address of the source data and the bytes per row to |
| * apply. ::srcArray is ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
| * specify the (unified virtual address space) base address of the source data |
| * and the bytes per row to apply. ::srcArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch |
| * specify the (device) base address of the source data and the bytes per row |
| * to apply. ::srcArray is ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
| * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are |
| * ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
| * specify the (unified virtual address space) base address of the destination |
| * data and the bytes per row to apply. ::dstArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch |
| * specify the (host) base address of the destination data and the bytes per |
| * row to apply. ::dstArray is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch |
| * specify the (device) base address of the destination data and the bytes per |
| * row to apply. ::dstArray is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
| * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are |
| * ignored. |
| * |
| * - ::srcXInBytes and ::srcY specify the base address of the source data for |
| * the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::dstXInBytes and ::dstY specify the base address of the destination data |
| * for the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::WidthInBytes and ::Height specify the width (in bytes) and height of |
| * the 2D copy being performed. |
| * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
| * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
| * ::WidthInBytes + ::dstXInBytes. |
| * |
| * \par |
| * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum |
| * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back |
| * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies |
| * (device to device, CUDA array to device, CUDA array to CUDA array), |
| * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch(). |
| * |
| * \param pCopy - Parameters for the memory copy |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpy2DAsync, |
| * ::cudaMemcpy2DToArrayAsync, |
| * ::cudaMemcpy2DFromArrayAsync |
| */ |
| CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); |
| |
| /** |
| * \brief Copies memory for 3D arrays |
| * |
| * Perform a 3D memory copy according to the parameters specified in |
| * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: |
| * |
| * \code |
| typedef struct CUDA_MEMCPY3D_st { |
| |
| unsigned int srcXInBytes, srcY, srcZ; |
| unsigned int srcLOD; |
| CUmemorytype srcMemoryType; |
| const void *srcHost; |
| CUdeviceptr srcDevice; |
| CUarray srcArray; |
| unsigned int srcPitch; // ignored when src is array |
| unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 |
| |
| unsigned int dstXInBytes, dstY, dstZ; |
| unsigned int dstLOD; |
| CUmemorytype dstMemoryType; |
| void *dstHost; |
| CUdeviceptr dstDevice; |
| CUarray dstArray; |
| unsigned int dstPitch; // ignored when dst is array |
| unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 |
| |
| unsigned int WidthInBytes; |
| unsigned int Height; |
| unsigned int Depth; |
| } CUDA_MEMCPY3D; |
| * \endcode |
| * where: |
| * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the |
| * source and destination, respectively; ::CUmemorytype_enum is defined as: |
| * |
| * \code |
| typedef enum CUmemorytype_enum { |
| CU_MEMORYTYPE_HOST = 0x01, |
| CU_MEMORYTYPE_DEVICE = 0x02, |
| CU_MEMORYTYPE_ARRAY = 0x03, |
| CU_MEMORYTYPE_UNIFIED = 0x04 |
| } CUmemorytype; |
| * \endcode |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch |
| * specify the (unified virtual address space) base address of the source data |
| * and the bytes per row to apply. ::srcArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and |
| * ::srcHeight specify the (host) base address of the source data, the bytes |
| * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
| * ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and |
| * ::srcHeight specify the (device) base address of the source data, the bytes |
| * per row, and the height of each 2D slice of the 3D array. ::srcArray is |
| * ignored. |
| * |
| * \par |
| * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the |
| * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and |
| * ::srcHeight are ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch |
| * specify the (unified virtual address space) base address of the destination |
| * data and the bytes per row to apply. ::dstArray is ignored. |
| * This value may be used only if unified addressing is supported in the calling |
| * context. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and |
| * ::dstHeight specify the (host) base address of the destination data, the |
| * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray |
| * is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and |
| * ::dstHeight specify the (device) base address of the destination data, the |
| * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray |
| * is ignored. |
| * |
| * \par |
| * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the |
| * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and |
| * ::dstHeight are ignored. |
| * |
| * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source |
| * data for the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the |
| * destination data for the copy. |
| * |
| * \par |
| * For host pointers, the starting address is |
| * \code |
| void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); |
| * \endcode |
| * |
| * \par |
| * For device pointers, the starting address is |
| * \code |
| CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; |
| * \endcode |
| * |
| * \par |
| * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array |
| * element size. |
| * |
| * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height |
| * and depth of the 3D copy being performed. |
| * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + |
| * ::srcXInBytes, and ::dstPitch must be greater than or equal to |
| * ::WidthInBytes + ::dstXInBytes. |
| * - If specified, ::srcHeight must be greater than or equal to ::Height + |
| * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. |
| * |
| * \par |
| * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum |
| * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). |
| * |
| * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be |
| * set to 0. |
| * |
| * \param pCopy - Parameters for the memory copy |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemcpy3DAsync |
| */ |
| CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); |
| |
| /** |
| * \brief Copies memory between contexts asynchronously. |
| * |
| * Perform a 3D memory copy according to the parameters specified in |
| * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure |
| * for documentation of its parameters. |
| * |
| * \param pCopy - Parameters for the memory copy |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, |
| * ::cuMemcpy3DPeer, |
| * ::cudaMemcpy3DPeerAsync |
| */ |
| CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); |
| |
| /** |
| * \brief Initializes device memory |
| * |
| * Sets the memory range of \p N 8-bit values starting at \p dstDevice to the |
| * specified value \p uc. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param uc - Value to set |
| * \param N - Number of 8-bit elements to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset |
| */ |
| CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); |
| |
| /** |
| * \brief Initializes device memory |
| * |
| * Sets the memory range of \p N 16-bit values starting at \p dstDevice to the |
| * specified value \p us. The \p dstDevice pointer must be two-byte aligned. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param us - Value to set |
| * \param N - Number of 16-bit elements to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset |
| */ |
| CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); |
| |
| /** |
| * \brief Initializes device memory |
| * |
| * Sets the memory range of \p N 32-bit values starting at \p dstDevice to the |
| * specified value \p ui. The \p dstDevice pointer must be four-byte aligned. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param ui - Value to set |
| * \param N - Number of 32-bit elements to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32Async, |
| * ::cudaMemset |
| */ |
| CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); |
| |
| /** |
| * \brief Initializes device memory |
| * |
| * Sets the 2D memory range of \p Width 8-bit values to the specified value |
| * \p uc. \p Height specifies the number of rows to set, and \p dstPitch |
| * specifies the number of bytes between each row. This function performs |
| * fastest when the pitch is one that has been passed back by |
| * ::cuMemAllocPitch(). |
| * |
| * \param dstDevice - Destination device pointer |
| * \param dstPitch - Pitch of destination device pointer, in bytes |
| * \param uc - Value to set |
| * \param Width - Width of row, in 8-bit elements |
| * \param Height - Number of rows |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset2D |
| */ |
| CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); |
| |
| /** |
| * \brief Initializes device memory |
| * |
| * Sets the 2D memory range of \p Width 16-bit values to the specified value |
| * \p us. \p Height specifies the number of rows to set, and \p dstPitch |
| * specifies the number of bytes between each row. The \p dstDevice pointer |
| * and \p dstPitch offset must be two-byte aligned. This function performs |
| * fastest when the pitch is one that has been passed back by |
| * ::cuMemAllocPitch(). |
| * |
| * \param dstDevice - Destination device pointer |
| * \param dstPitch - Pitch of destination device pointer, in bytes |
| * \param us - Value to set |
| * \param Width - Width of row, in 16-bit elements |
| * \param Height - Number of rows |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset2D |
| */ |
| CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); |
| |
| /** |
| * \brief Initializes device memory |
| * |
| * Sets the 2D memory range of \p Width 32-bit values to the specified value |
| * \p ui. \p Height specifies the number of rows to set, and \p dstPitch |
| * specifies the number of bytes between each row. The \p dstDevice pointer |
| * and \p dstPitch offset must be four-byte aligned. This function performs |
| * fastest when the pitch is one that has been passed back by |
| * ::cuMemAllocPitch(). |
| * |
| * \param dstDevice - Destination device pointer |
| * \param dstPitch - Pitch of destination device pointer, in bytes |
| * \param ui - Value to set |
| * \param Width - Width of row, in 32-bit elements |
| * \param Height - Number of rows |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset2D |
| */ |
| CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); |
| |
| /** |
| * \brief Sets device memory |
| * |
| * Sets the memory range of \p N 8-bit values starting at \p dstDevice to the |
| * specified value \p uc. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param uc - Value to set |
| * \param N - Number of 8-bit elements to set |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemsetAsync |
| */ |
| CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); |
| |
| /** |
| * \brief Sets device memory |
| * |
| * Sets the memory range of \p N 16-bit values starting at \p dstDevice to the |
| * specified value \p us. The \p dstDevice pointer must be two-byte aligned. |
| * |
| * \param dstDevice - Destination device pointer |
| * \param us - Value to set |
| * \param N - Number of 16-bit elements to set |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemsetAsync |
| */ |
| CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); |
| |
| /** |
| * \brief Sets device memory |
| * |
| * Sets the memory range of \p N 32-bit values to the specified value |
| * \p ui. The \p dstDevice pointer must be four byte aligned. |
| * |
| * \param dstDevice - Destination device pointer (must be four byte aligned) |
| * \param ui - 32-bit value to set |
| * \param N - Number of 32-bit elements to set |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, |
| * ::cudaMemsetAsync |
| */ |
| CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); |
| |
| /** |
| * \brief Sets device memory |
| * |
| * Sets the 2D memory range of \p Width 8-bit values to the specified value |
| * \p uc. \p Height specifies the number of rows to set, and \p dstPitch |
| * specifies the number of bytes between each row. This function performs |
| * fastest when the pitch is one that has been passed back by |
| * ::cuMemAllocPitch(). |
| * |
| * \param dstDevice - Destination device pointer |
| * \param dstPitch - Pitch of destination device pointer (number of bytes between each row) |
| * \param uc - 8-bit value to set |
| * \param Width - Width of row, in 8-bit elements |
| * \param Height - Number of rows to set |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset2DAsync |
| */ |
| CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); |
| |
| /** |
| * \brief Sets device memory |
| * |
| * Sets the 2D memory range of \p Width 16-bit values to the specified value |
| * \p us. \p Height specifies the number of rows to set, and \p dstPitch |
| * specifies the number of bytes between each row. The \p dstDevice pointer |
| * and \p dstPitch offset must be two byte aligned. This function performs |
| * fastest when the pitch is one that has been passed back by |
| * ::cuMemAllocPitch(). |
| * |
| * \param dstDevice - Destination device pointer (must be two byte aligned) |
| * \param dstPitch - Pitch of destination device pointer (number of bytes between each row; must be two byte aligned) |
| * \param us - 16-bit value to set |
| * \param Width - Width of row, in 16-bit elements |
| * \param Height - Number of rows to set |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset2DAsync |
| */ |
| CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); |
| |
| /** |
| * \brief Sets device memory |
| * |
| * Sets the 2D memory range of \p Width 32-bit values to the specified value |
| * \p ui. \p Height specifies the number of rows to set, and \p dstPitch |
| * specifies the number of bytes between each row. The \p dstDevice pointer |
| * and \p dstPitch offset must be four byte aligned. This function performs |
| * fastest when the pitch is one that has been passed back by |
| * ::cuMemAllocPitch(). |
| * |
| * \param dstDevice - Destination device pointer (must be four byte aligned) |
| * \param dstPitch - Pitch of destination device pointer (number of bytes between each row; must be four byte aligned) |
| * \param ui - 32-bit value to set |
| * \param Width - Width of row, in 32-bit elements |
| * \param Height - Number of rows to set |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * \note_memset |
| * \note_null_stream |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, |
| * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, |
| * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, |
| * ::cuMemsetD32, ::cuMemsetD32Async, |
| * ::cudaMemset2DAsync |
| */ |
| CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); |
| |
| /** |
| * \brief Creates a 1D or 2D CUDA array |
| * |
| * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure |
| * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. |
| * The ::CUDA_ARRAY_DESCRIPTOR is defined as: |
| * |
| * \code |
| typedef struct { |
| unsigned int Width; |
| unsigned int Height; |
| CUarray_format Format; |
| unsigned int NumChannels; |
| } CUDA_ARRAY_DESCRIPTOR; |
| * \endcode |
| * where: |
| * |
| * - \p Width and \p Height are the width and height of the CUDA array (in |
| * elements); the CUDA array is one-dimensional if \p Height is 0, two-dimensional |
| * otherwise; |
| * - ::Format specifies the format of the elements; ::CUarray_format is |
| * defined as: |
| * \code |
| typedef enum CUarray_format_enum { |
| CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, |
| CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, |
| CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, |
| CU_AD_FORMAT_SIGNED_INT8 = 0x08, |
| CU_AD_FORMAT_SIGNED_INT16 = 0x09, |
| CU_AD_FORMAT_SIGNED_INT32 = 0x0a, |
| CU_AD_FORMAT_HALF = 0x10, |
| CU_AD_FORMAT_FLOAT = 0x20 |
| } CUarray_format; |
| * \endcode |
| * - \p NumChannels specifies the number of packed components per CUDA array |
| * element; it may be 1, 2, or 4; |
| * |
| * Here are examples of CUDA array descriptions: |
| * |
| * Description for a CUDA array of 2048 floats: |
| * \code |
| CUDA_ARRAY_DESCRIPTOR desc; |
| desc.Format = CU_AD_FORMAT_FLOAT; |
| desc.NumChannels = 1; |
| desc.Width = 2048; |
| desc.Height = 1; |
| * \endcode |
| * |
| * Description for a 64 x 64 CUDA array of floats: |
| * \code |
| CUDA_ARRAY_DESCRIPTOR desc; |
| desc.Format = CU_AD_FORMAT_FLOAT; |
| desc.NumChannels = 1; |
| desc.Width = 64; |
| desc.Height = 64; |
| * \endcode |
| * |
| * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit |
| * float16's: |
| * \code |
| CUDA_ARRAY_DESCRIPTOR desc; |
| desc.Format = CU_AD_FORMAT_HALF; |
| desc.NumChannels = 4; |
| desc.Width = width; |
| desc.Height = height; |
| * \endcode |
| * |
| * Description for a \p width x \p height CUDA array of 16-bit elements, each |
| * of which is two 8-bit unsigned chars: |
| * \code |
| CUDA_ARRAY_DESCRIPTOR desc; |
| desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; |
| desc.NumChannels = 2; |
| desc.Width = width; |
| desc.Height = height; |
| * \endcode |
| * |
| * \param pHandle - Returned array |
| * \param pAllocateArray - Array descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMallocArray |
| */ |
| CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); |
| |
| /** |
| * \brief Get a 1D or 2D CUDA array descriptor |
| * |
| * Returns in \p *pArrayDescriptor a descriptor containing information on the |
| * format and dimensions of the CUDA array \p hArray. It is useful for |
| * subroutines that have been passed a CUDA array, but need to know the CUDA |
| * array parameters for validation or other purposes. |
| * |
| * \param pArrayDescriptor - Returned array descriptor (format and dimensions of \p hArray) |
| * \param hArray - CUDA array to get descriptor of |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaArrayGetInfo |
| */ |
| CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); |
| |
| /** |
| * \brief Destroys a CUDA array |
| * |
| * Destroys the CUDA array \p hArray. |
| * |
| * \param hArray - CUDA array to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_ARRAY_IS_MAPPED, |
| * ::CUDA_ERROR_CONTEXT_IS_DESTROYED |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaFreeArray |
| */ |
| CUresult CUDAAPI cuArrayDestroy(CUarray hArray); |
| |
| /** |
| * \brief Creates a 3D CUDA array |
| * |
| * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure |
| * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. |
| * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: |
| * |
| * \code |
| typedef struct { |
| unsigned int Width; |
| unsigned int Height; |
| unsigned int Depth; |
| CUarray_format Format; |
| unsigned int NumChannels; |
| unsigned int Flags; |
| } CUDA_ARRAY3D_DESCRIPTOR; |
| * \endcode |
| * where: |
| * |
| * - \p Width, \p Height, and \p Depth are the width, height, and depth of the |
| * CUDA array (in elements); the following types of CUDA arrays can be allocated: |
| * - A 1D array is allocated if \p Height and \p Depth extents are both zero. |
| * - A 2D array is allocated if only \p Depth extent is zero. |
| * - A 3D array is allocated if all three extents are non-zero. |
| * - A 1D layered CUDA array is allocated if only \p Height is zero and the |
| * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number |
| * of layers is determined by the depth extent. |
| * - A 2D layered CUDA array is allocated if all three extents are non-zero and |
| * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number |
| * of layers is determined by the depth extent. |
| * - A cubemap CUDA array is allocated if all three extents are non-zero and the |
| * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and |
| * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, |
| * where the six layers represent the six faces of a cube. The order of the six |
| * layers in memory is the same as that listed in ::CUarray_cubemap_face. |
| * - A cubemap layered CUDA array is allocated if all three extents are non-zero, |
| * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. |
| * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. |
| * A cubemap layered CUDA array is a special type of 2D layered CUDA array that |
| * consists of a collection of cubemaps. The first six layers represent the first |
| * cubemap, the next six layers form the second cubemap, and so on. |
| * |
| * - ::Format specifies the format of the elements; ::CUarray_format is |
| * defined as: |
| * \code |
| typedef enum CUarray_format_enum { |
| CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, |
| CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, |
| CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, |
| CU_AD_FORMAT_SIGNED_INT8 = 0x08, |
| CU_AD_FORMAT_SIGNED_INT16 = 0x09, |
| CU_AD_FORMAT_SIGNED_INT32 = 0x0a, |
| CU_AD_FORMAT_HALF = 0x10, |
| CU_AD_FORMAT_FLOAT = 0x20 |
| } CUarray_format; |
| * \endcode |
| * |
| * - \p NumChannels specifies the number of packed components per CUDA array |
| * element; it may be 1, 2, or 4; |
| * |
| * - ::Flags may be set to |
| * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, |
| * \p Depth specifies the number of layers, not the depth of a 3D array. |
| * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. |
| * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array |
| * to a surface reference. |
| * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be |
| * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, |
| * then \p Depth must be a multiple of six. |
| * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. |
| * Texture gather can only be performed on 2D CUDA arrays. |
| * |
| * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. |
| * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute |
| * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. |
| * |
| * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag |
| * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH |
| * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. |
| * |
| * <table> |
| * <tr><td><b>CUDA array type</b></td> |
| * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range), |
| * (depth range)}</b></td> |
| * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br> |
| * {(width range in elements), (height range), (depth range)}</b></td></tr> |
| * <tr><td>1D</td> |
| * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td> |
| * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr> |
| * <tr><td>2D</td> |
| * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td> |
| * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr> |
| * <tr><td>3D</td> |
| * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } |
| * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), |
| * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td> |
| * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), |
| * (1,SURFACE3D_DEPTH) }</small></td></tr> |
| * <tr><td>1D Layered</td> |
| * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, |
| * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td> |
| * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0, |
| * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr> |
| * <tr><td>2D Layered</td> |
| * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), |
| * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td> |
| * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), |
| * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr> |
| * <tr><td>Cubemap</td> |
| * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td> |
| * <td><small>{ (1,SURFACECUBEMAP_WIDTH), |
| * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr> |
| * <tr><td>Cubemap Layered</td> |
| * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), |
| * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td> |
| * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), |
| * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr> |
| * </table> |
| * |
| * Here are examples of CUDA array descriptions: |
| * |
| * Description for a CUDA array of 2048 floats: |
| * \code |
| CUDA_ARRAY3D_DESCRIPTOR desc; |
| desc.Format = CU_AD_FORMAT_FLOAT; |
| desc.NumChannels = 1; |
| desc.Width = 2048; |
| desc.Height = 0; |
| desc.Depth = 0; |
| * \endcode |
| * |
| * Description for a 64 x 64 CUDA array of floats: |
| * \code |
| CUDA_ARRAY3D_DESCRIPTOR desc; |
| desc.Format = CU_AD_FORMAT_FLOAT; |
| desc.NumChannels = 1; |
| desc.Width = 64; |
| desc.Height = 64; |
| desc.Depth = 0; |
| * \endcode |
| * |
| * Description for a \p width x \p height x \p depth CUDA array of 64-bit, |
| * 4x16-bit float16's: |
| * \code |
| CUDA_ARRAY3D_DESCRIPTOR desc; |
| desc.Format = CU_AD_FORMAT_HALF; |
| desc.NumChannels = 4; |
| desc.Width = width; |
| desc.Height = height; |
| desc.Depth = depth; |
| * \endcode |
| * |
| * \param pHandle - Returned array |
| * \param pAllocateArray - 3D array descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaMalloc3DArray |
| */ |
| CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); |
| |
| /** |
| * \brief Get a 3D CUDA array descriptor |
| * |
| * Returns in \p *pArrayDescriptor a descriptor containing information on the |
| * format and dimensions of the CUDA array \p hArray. It is useful for |
| * subroutines that have been passed a CUDA array, but need to know the CUDA |
| * array parameters for validation or other purposes. |
| * |
| * This function may be called on 1D and 2D arrays, in which case the \p Height |
| * and/or \p Depth members of the descriptor struct will be set to 0. |
| * |
| * \param pArrayDescriptor - Returned 3D array descriptor |
| * \param hArray - CUDA array (1D, 2D or 3D) to get descriptor of |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_CONTEXT_IS_DESTROYED |
| * \notefnerr |
| * |
| * \sa ::cuArray3DCreate, ::cuArrayCreate, |
| * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, |
| * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, |
| * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, |
| * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, |
| * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, |
| * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, |
| * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, |
| * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, |
| * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, |
| * ::cudaArrayGetInfo |
| */ |
| CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); |
| |
| /** |
| * \brief Creates a CUDA mipmapped array |
| * |
| * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure |
| * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. |
| * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is |
| * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. |
| * |
| * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: |
| * |
| * \code |
| typedef struct { |
| unsigned int Width; |
| unsigned int Height; |
| unsigned int Depth; |
| CUarray_format Format; |
| unsigned int NumChannels; |
| unsigned int Flags; |
| } CUDA_ARRAY3D_DESCRIPTOR; |
| * \endcode |
| * where: |
| * |
| * - \p Width, \p Height, and \p Depth are the width, height, and depth of the |
| * CUDA array (in elements); the following types of CUDA arrays can be allocated: |
| * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. |
| * - A 2D mipmapped array is allocated if only \p Depth extent is zero. |
| * - A 3D mipmapped array is allocated if all three extents are non-zero. |
| * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the |
| * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number |
| * of layers is determined by the depth extent. |
| * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and |
| * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number |
| * of layers is determined by the depth extent. |
| * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the |
| * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and |
| * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, |
| * where the six layers represent the six faces of a cube. The order of the six |
| * layers in memory is the same as that listed in ::CUarray_cubemap_face. |
| * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, |
| * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. |
| * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. |
| * A cubemap layered CUDA array is a special type of 2D layered CUDA array that |
| * consists of a collection of cubemaps. The first six layers represent the first |
| * cubemap, the next six layers form the second cubemap, and so on. |
| * |
| * - ::Format specifies the format of the elements; ::CUarray_format is |
| * defined as: |
| * \code |
| typedef enum CUarray_format_enum { |
| CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, |
| CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, |
| CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, |
| CU_AD_FORMAT_SIGNED_INT8 = 0x08, |
| CU_AD_FORMAT_SIGNED_INT16 = 0x09, |
| CU_AD_FORMAT_SIGNED_INT32 = 0x0a, |
| CU_AD_FORMAT_HALF = 0x10, |
| CU_AD_FORMAT_FLOAT = 0x20 |
| } CUarray_format; |
| * \endcode |
| * |
| * - \p NumChannels specifies the number of packed components per CUDA array |
| * element; it may be 1, 2, or 4; |
| * |
| * - ::Flags may be set to |
| * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, |
| * \p Depth specifies the number of layers, not the depth of a 3D array. |
| * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of |
| * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to |
| * bind a mipmap level of the CUDA mipmapped array to a surface reference. |
| * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be |
| * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, |
| * then \p Depth must be a multiple of six. |
| * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. |
| * Texture gather can only be performed on 2D CUDA mipmapped arrays. |
| * |
| * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. |
| * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute |
| * is not specified. For example, TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. |
| * |
| * <table> |
| * <tr><td><b>CUDA array type</b></td> |
| * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range), |
| * (depth range)}</b></td> |
| * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br> |
| * {(width range in elements), (height range), (depth range)}</b></td></tr> |
| * <tr><td>1D</td> |
| * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td> |
| * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr> |
| * <tr><td>2D</td> |
| * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td> |
| * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr> |
| * <tr><td>3D</td> |
| * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } |
| * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), |
| * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td> |
| * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), |
| * (1,SURFACE3D_DEPTH) }</small></td></tr> |
| * <tr><td>1D Layered</td> |
| * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, |
| * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td> |
| * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0, |
| * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr> |
| * <tr><td>2D Layered</td> |
| * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), |
| * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td> |
| * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), |
| * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr> |
| * <tr><td>Cubemap</td> |
| * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td> |
| * <td><small>{ (1,SURFACECUBEMAP_WIDTH), |
| * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr> |
| * <tr><td>Cubemap Layered</td> |
| * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), |
| * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td> |
| * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), |
| * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr> |
| * </table> |
| * |
| * |
| * \param pHandle - Returned mipmapped array |
| * \param pMipmappedArrayDesc - Mipmapped array descriptor |
| * \param numMipmapLevels - Number of mipmap levels to allocate |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMipmappedArrayDestroy, |
| * ::cuMipmappedArrayGetLevel, |
| * ::cuArrayCreate, |
| * ::cudaMallocMipmappedArray |
| */ |
| CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); |
| |
| /** |
| * \brief Gets a mipmap level of a CUDA mipmapped array |
| * |
| * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level |
| * of the CUDA mipmapped array \p hMipmappedArray. |
| * |
| * If \p level is greater than the maximum number of levels in this mipmapped array, |
| * ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * \param pLevelArray - Returned mipmap level CUDA array |
| * \param hMipmappedArray - CUDA mipmapped array to get the level from |
| * \param level - Mipmap level to return |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMipmappedArrayCreate, |
| * ::cuMipmappedArrayDestroy, |
| * ::cuArrayCreate, |
| * ::cudaGetMipmappedArrayLevel |
| */ |
| CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); |
| |
| /** |
| * \brief Destroys a CUDA mipmapped array |
| * |
| * Destroys the CUDA mipmapped array \p hMipmappedArray. |
| * |
| * \param[in] hMipmappedArray - Mipmapped array to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_ARRAY_IS_MAPPED, |
| * ::CUDA_ERROR_CONTEXT_IS_DESTROYED |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMipmappedArrayCreate, |
| * ::cuMipmappedArrayGetLevel, |
| * ::cuArrayCreate, |
| * ::cudaFreeMipmappedArray |
| */ |
| CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); |
| |
| /** @} */ /* END CUDA_MEM */ |
| |
| /** |
| * \defgroup CUDA_VA Virtual Memory Management |
| * |
| * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the virtual memory management functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Allocate an address range reservation. |
| * |
| * Reserves a virtual address range based on the given parameters, giving |
| * the starting address of the range in \p ptr. This API requires a system that |
| * supports UVA. The \p size and \p addr parameters must each be a multiple of |
| * the host page size, and \p alignment must be a power of two or zero for |
| * default alignment. |
| * |
| * \param[out] ptr - Resulting pointer to start of virtual address range allocated |
| * \param[in] size - Size of the reserved virtual address range requested |
| * \param[in] alignment - Alignment of the reserved virtual address range requested |
| * \param[in] addr - Fixed starting address of the range requested |
| * \param[in] flags - Currently unused, must be zero |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemAddressFree |
| */ |
| CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); |
| |
| /** |
| * \brief Free an address range reservation. |
| * |
| * Frees a virtual address range reserved by ::cuMemAddressReserve. The \p size |
| * must match what was given to ::cuMemAddressReserve and the \p ptr given must |
| * match what was returned from ::cuMemAddressReserve. |
| * |
| * \param[in] ptr - Starting address of the virtual address range to free |
| * \param[in] size - Size of the virtual address region to free |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemAddressReserve |
| */ |
| CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); |
| |
| /** |
| * \brief Create a shareable memory handle representing a memory allocation of a given size described by the given properties |
| * |
| * This creates a memory allocation on the target device specified through the |
| * \p prop structure. The created allocation will not have any device or host |
| * mappings. The generic memory \p handle for the allocation can be |
| * mapped to the address space of the calling process via ::cuMemMap. This handle |
| * cannot be transmitted directly to other processes (see |
| * ::cuMemExportToShareableHandle). On Windows, the caller must also pass |
| * an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which |
| * limits or allows access to this handle for a recipient process (see |
| * ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this |
| * allocation must be a multiple of the value given via |
| * ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM |
| * flag. |
| * |
| * \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. |
| * \param[in] size - Size of the allocation requested |
| * \param[in] prop - Properties of the allocation to create. |
| * \param[in] flags - flags for future use, must be zero now. |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle |
| */ |
| CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); |
| |
| /** |
| * \brief Release a memory handle representing a memory allocation which was previously allocated through ::cuMemCreate. |
| * |
| * Frees the memory that was allocated on a device through ::cuMemCreate. |
| * |
| * The memory allocation will be freed when all outstanding mappings to the memory |
| * are unmapped and when all outstanding references to the handle (including its |
| * shareable counterparts) are also released. The generic memory handle can be |
| * freed when there are still outstanding mappings made with this handle. Each |
| * time a recipient process imports a shareable handle, it needs to pair it with |
| * ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle |
| * the behavior is undefined. |
| * |
| * \param[in] handle - Value of handle which was returned previously by ::cuMemCreate. |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuMemCreate |
| */ |
| CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); |
| |
| /** |
| * \brief Maps an allocation handle to a reserved virtual address range. |
| * |
| * Maps \p size bytes of the memory represented by \p handle, starting from byte |
| * \p offset, to the address range [\p ptr, \p ptr + \p size]. This range must be an |
| * address reservation previously reserved with ::cuMemAddressReserve, and |
| * \p offset + \p size must be less than the size of the memory allocation. |
| * \p ptr, \p size, and \p offset must each be a multiple of the value given via |
| * ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. |
| * |
| * Please note calling ::cuMemMap does not make the address accessible, |
| * the caller needs to update accessibility of a contiguous mapped VA |
| * range by calling ::cuMemSetAccess. |
| * |
| * Once a recipient process obtains a shareable memory handle |
| * from ::cuMemImportFromShareableHandle, the process must |
| * use ::cuMemMap to map the memory into its address ranges before |
| * setting accessibility with ::cuMemSetAccess. |
| * |
| * ::cuMemMap can only create mappings on VA range reservations |
| * that are not currently mapped. |
| * |
| * \param[in] ptr - Address where memory will be mapped. |
| * \param[in] size - Size of the memory mapping. |
| * \param[in] offset - Offset into the memory represented by |
| * - \p handle from which to start mapping |
| * - Note: currently must be zero. |
| * \param[in] handle - Handle to a shareable memory |
| * \param[in] flags - flags for future use, must be zero now. |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle |
| */ |
| CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); |
| |
| /** |
| * \brief Unmap the backing memory of a given address range. |
| * |
| * The range must be the entire contiguous address range that was mapped to. In |
| * other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped |
| * by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed |
| * if there are no existing mappings and there are no unreleased memory handles. |
| * |
| * When ::cuMemUnmap returns successfully the address range is converted to an |
| * address reservation and can be used for future calls to ::cuMemMap. Any new |
| * mapping to this virtual address will need to have access granted through |
| * ::cuMemSetAccess, as all mappings start with no accessibility set up. |
| * |
| * \param[in] ptr - Starting address for the virtual address range to unmap |
| * \param[in] size - Size of the virtual address range to unmap |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuMemCreate, ::cuMemMap, ::cuMemAddressReserve |
| */ |
| CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); |
| |
| /** |
| * \brief Set the access flags for each location specified in \p desc for the given virtual address range |
| * |
| * Given the virtual address range via \p ptr and \p size, and the locations |
| * in the array given by \p desc and \p count, set the access flags for the |
| * target locations. The range must be a fully mapped address range |
| * containing all allocations created by ::cuMemMap / ::cuMemCreate. |
| * |
| * \param[in] ptr - Starting address for the virtual address range |
| * \param[in] size - Length of the virtual address range |
| * \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the |
| * - mapping for each location specified |
| * \param[in] count - Number of ::CUmemAccessDesc in \p desc |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * \note_sync |
| * |
| * \sa ::cuMemGetAccess, ::cuMemCreate, ::cuMemMap |
| */ |
| CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); |
| |
| /** |
| * \brief Get the access \p flags set for the given \p location and \p ptr |
| * |
| * \param[out] flags - Flags set for this location |
| * \param[in] location - Location for which to check the flags |
| * \param[in] ptr - Address for which to check the access flags |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemSetAccess |
| */ |
| CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); |
| |
| /** |
| * \brief Exports an allocation to a requested shareable handle type |
| * |
| * Given a CUDA memory handle, create a shareable memory |
| * allocation handle that can be used to share the memory with other |
| * processes. The recipient process can convert the shareable handle back into a |
| * CUDA memory handle using ::cuMemImportFromShareableHandle and map |
| * it with ::cuMemMap. The implementation of what this handle is and how it |
| * can be transferred is defined by the requested handle type in \p handleType. |
| * |
| * Once all shareable handles are closed and the allocation is released, the allocated |
| * memory referenced will be released back to the OS and uses of the CUDA handle afterward |
| * will lead to undefined behavior. |
| * |
| * This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) |
| * that support importing memory from the shareable type. |
| * |
| * \param[out] shareableHandle - Pointer to the location in which to store the requested handle type |
| * \param[in] handle - CUDA handle for the memory allocation |
| * \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) |
| * \param[in] flags - Reserved, must be zero |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemImportFromShareableHandle |
| */ |
| CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); |
| |
| /** |
| * \brief Imports an allocation from a requested shareable handle type. |
| * |
| * If the current process cannot support the memory described by this shareable |
| * handle, this API will error as ::CUDA_ERROR_NOT_SUPPORTED. |
| * |
| * \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc) |
| * created on devices under an SLI group may not be supported, and thus this API will |
| * return ::CUDA_ERROR_NOT_SUPPORTED. |
| * There is no guarantee that the contents of \p handle will be the same CUDA memory handle |
| * for the same given OS shareable handle, or the same underlying allocation. |
| * |
| * \param[out] handle - CUDA Memory handle for the memory allocation. |
| * \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. |
| * \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease |
| */ |
| CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); |
| |
| /** |
| * \brief Calculates either the minimal or recommended granularity |
| * |
| * Calculates either the minimal or recommended granularity |
| * for a given allocation specification and returns it in \p granularity. This |
| * granularity can be used as a multiple for alignment, size, or address mapping. |
| * |
| * \param[out] granularity - Returned granularity. |
| * \param[in] prop - Property for which to determine the granularity |
| * \param[in] option - Determines which granularity to return |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemCreate, ::cuMemMap |
| */ |
| CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); |
| |
| /** |
| * \brief Retrieve the contents of the property structure defining properties for this handle |
| * |
| * \param[out] prop - Pointer to a properties structure which will hold the information about this handle |
| * \param[in] handle - Handle on which to perform the query |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemCreate, ::cuMemImportFromShareableHandle |
| */ |
| CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); |
| |
| /** |
| * \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. |
| * |
| * The handle is guaranteed to be the same handle value used to map the memory. If the address |
| * requested is not mapped, the function will fail. The returned handle must be released with |
| * corresponding number of calls to ::cuMemRelease. |
| * |
| * \note The address \p addr can be any address in a range previously mapped |
| * by ::cuMemMap, and not necessarily the start address. |
| * |
| * \param[out] handle - CUDA Memory handle for the backing memory allocation. |
| * \param[in] addr - Memory address to query, that has been mapped previously. |
| * \returns |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_PERMITTED, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * |
| * \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap |
| */ |
| CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); |
| |
| /** @} */ /* END CUDA_VA */ |
| |
| /** |
| * \defgroup CUDA_UNIFIED Unified Addressing |
| * |
| * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver |
| * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the unified addressing functions of the |
| * low-level CUDA driver application programming interface. |
| * |
| * @{ |
| * |
| * \section CUDA_UNIFIED_overview Overview |
| * |
| * CUDA devices can share a unified address space with the host. |
| * For these devices there is no distinction between a device |
| * pointer and a host pointer -- the same pointer value may be |
| * used to access memory from the host program and from a kernel |
| * running on the device (with exceptions enumerated below). |
| * |
| * \section CUDA_UNIFIED_support Supported Platforms |
| * |
| * Whether or not a device supports unified addressing may be |
| * queried by calling ::cuDeviceGetAttribute() with the device |
| * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. |
| * |
| * Unified addressing is automatically enabled in 64-bit processes. |
| * |
| * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values |
| * |
| * It is possible to look up information about the memory which backs a |
| * pointer value. For instance, one may want to know if a pointer points |
| * to host or device memory. As another example, in the case of device |
| * memory, one may want to know on which CUDA device the memory |
| * resides. These properties may be queried using the function |
| * ::cuPointerGetAttribute(). |
| * |
| * Since pointers are unique, it is not necessary to specify information |
| * about the pointers specified to the various copy functions in the |
| * CUDA API. The function ::cuMemcpy() may be used to perform a copy |
| * between two pointers, ignoring whether they point to host or device |
| * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() |
| * unnecessary for devices supporting unified addressing). For |
| * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be |
| * used to specify that the CUDA driver should infer the location of the |
| * pointer from its value. |
| * |
| * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory |
| * |
| * All host memory allocated in all contexts using ::cuMemAllocHost() and |
| * ::cuMemHostAlloc() is always directly accessible from all contexts on |
| * all devices that support unified addressing. This is the case regardless |
| * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and |
| * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. |
| * |
| * The pointer value through which allocated host memory may be accessed |
| * in kernels on all devices that support unified addressing is the same |
| * as the pointer value through which that memory is accessed on the host, |
| * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device |
| * pointer for these allocations. |
| * |
| * Note that this is not the case for memory allocated using the flag |
| * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. |
| * |
| * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory |
| * |
| * Upon enabling direct access from a context that supports unified addressing |
| * to another peer context that supports unified addressing using |
| * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using |
| * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible |
| * by the current context. The device pointer value through |
| * which any peer memory may be accessed in the current context |
| * is the same pointer value through which that memory may be |
| * accessed in the peer context. |
| * |
| * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing |
| * |
| * Not all memory may be accessed on devices through the same pointer |
| * value through which they are accessed on the host. These exceptions |
| * are host memory registered using ::cuMemHostRegister() and host memory |
| * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these |
| * exceptions, there exists a distinct host and device address for the |
| * memory. The device address is guaranteed to not overlap any valid host |
| * pointer range and is guaranteed to have the same value across all |
| * contexts that support unified addressing. |
| * |
| * This device address may be queried using ::cuMemHostGetDevicePointer() |
| * when a context using unified addressing is current. Either the host |
| * or the unified device pointer value may be used to refer to this memory |
| * through ::cuMemcpy() and similar functions using the |
| * ::CU_MEMORYTYPE_UNIFIED memory type. |
| * |
| */ |
| |
| /** |
| * \brief Returns information about a pointer |
| * |
| * The supported attributes are: |
| * |
| * - ::CU_POINTER_ATTRIBUTE_CONTEXT: |
| * |
| * Returns in \p *data the ::CUcontext in which \p ptr was allocated or |
| * registered. |
| * The type of \p data must be ::CUcontext *. |
| * |
| * If \p ptr was not allocated by, mapped by, or registered with |
| * a ::CUcontext which uses unified virtual addressing then |
| * ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: |
| * |
| * Returns in \p *data the physical memory type of the memory that |
| * \p ptr addresses as a ::CUmemorytype enumerated value. |
| * The type of \p data must be unsigned int. |
| * |
| * If \p ptr addresses device memory then \p *data is set to |
| * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the |
| * memory resides is the ::CUdevice of the ::CUcontext returned by the |
| * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. |
| * |
| * If \p ptr addresses host memory then \p *data is set to |
| * ::CU_MEMORYTYPE_HOST. |
| * |
| * If \p ptr was not allocated by, mapped by, or registered with |
| * a ::CUcontext which uses unified virtual addressing then |
| * ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * If the current ::CUcontext does not support unified virtual |
| * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: |
| * |
| * Returns in \p *data the device pointer value through which |
| * \p ptr may be accessed by kernels running in the current |
| * ::CUcontext. |
| * The type of \p data must be CUdeviceptr *. |
| * |
| * If there exists no device pointer value through which |
| * kernels running in the current ::CUcontext may access |
| * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * If there is no current ::CUcontext then |
| * ::CUDA_ERROR_INVALID_CONTEXT is returned. |
| * |
| * Except in the exceptional disjoint addressing cases discussed |
| * below, the value returned in \p *data will equal the input |
| * value \p ptr. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: |
| * |
| * Returns in \p *data the host pointer value through which |
| * \p ptr may be accessed by the host program. |
| * The type of \p data must be void **. |
| * If there exists no host pointer value through which |
| * the host program may directly access \p ptr then |
| * ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * Except in the exceptional disjoint addressing cases discussed |
| * below, the value returned in \p *data will equal the input |
| * value \p ptr. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: |
| * |
| * Returns in \p *data two tokens for use with the nv-p2p.h Linux |
| * kernel interface. \p data must be a struct of type |
| * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. |
| * |
| * \p ptr must be a pointer to memory obtained from ::cuMemAlloc(). |
| * Note that p2pToken and vaSpaceToken are only valid for the |
| * lifetime of the source allocation. A subsequent allocation at |
| * the same address may return completely different tokens. |
| * Querying this attribute has a side effect of setting the attribute |
| * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that |
| * \p ptr points to. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: |
| * |
| * A boolean attribute which when set, ensures that synchronous memory operations |
| * initiated on the region of memory that \p ptr points to will always synchronize. |
| * See further documentation in the section titled "API synchronization behavior" |
| * to learn more about cases when synchronous memory operations can |
| * exhibit asynchronous behavior. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: |
| * |
| * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. |
| * \p data must point to an unsigned long long. |
| * |
| * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. |
| * Every memory allocation from any of the CUDA memory allocation APIs will |
| * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs |
| * from previous freed allocations. IDs are only unique within a single process. |
| * |
| * |
| * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: |
| * |
| * Returns in \p *data a boolean that indicates whether the pointer points to |
| * managed memory or not. |
| * |
| * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: |
| * |
| * Returns in \p *data an integer representing a device ordinal of a device against |
| * which the memory was allocated or registered. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: |
| * |
| * Returns in \p *data a boolean that indicates if this pointer maps to |
| * an allocation that is suitable for ::cudaIpcGetMemHandle. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: |
| * |
| * Returns in \p *data the starting address for the allocation referenced |
| * by the device pointer \p ptr. Note that this is not necessarily the |
| * address of the mapped region, but the address of the mappable address |
| * range \p ptr references (e.g. from ::cuMemAddressReserve). |
| * |
| * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: |
| * |
| * Returns in \p *data the size for the allocation referenced by the device |
| * pointer \p ptr. Note that this is not necessarily the size of the mapped |
| * region, but the size of the mappable address range \p ptr references |
| * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped |
| * region, see ::cuMemGetAllocationPropertyForAddress. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_MAPPED: |
| * |
| * Returns in \p *data a boolean that indicates if this pointer is in a |
| * valid address range that is mapped to a backing allocation. |
| * |
| * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: |
| * |
| * Returns a bitmask of the allowed handle types for an allocation that may |
| * be passed to ::cuMemExportToShareableHandle. |
| * |
| * \par |
| * |
| * Note that for most allocations in the unified virtual address space |
| * the host and device pointer for accessing the allocation will be the |
| * same. The exceptions to this are |
| * - user memory registered using ::cuMemHostRegister |
| * - host memory allocated using ::cuMemHostAlloc with the |
| * ::CU_MEMHOSTALLOC_WRITECOMBINED flag |
| * For these types of allocation there will exist separate, disjoint host |
| * and device addresses for accessing the allocation. In particular |
| * - The host address will correspond to an invalid unmapped device address |
| * (which will result in an exception if accessed from the device) |
| * - The device address will correspond to an invalid unmapped host address |
| * (which will result in an exception if accessed from the host). |
| * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER |
| * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host |
| * and device addresses from either address. |
| * |
| * \param data - Returned pointer attribute value |
| * \param attribute - Pointer attribute to query |
| * \param ptr - Pointer |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuPointerSetAttribute, |
| * ::cuMemAlloc, |
| * ::cuMemFree, |
| * ::cuMemAllocHost, |
| * ::cuMemFreeHost, |
| * ::cuMemHostAlloc, |
| * ::cuMemHostRegister, |
| * ::cuMemHostUnregister, |
| * ::cudaPointerGetAttributes |
| */ |
| CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); |
| |
| /** |
| * \brief Prefetches memory to the specified destination device |
| * |
| * Prefetches memory to the specified destination device. \p devPtr is the |
| * base device pointer of the memory to be prefetched and \p dstDevice is the |
| * destination device. \p count specifies the number of bytes to copy. \p hStream |
| * is the stream in which the operation is enqueued. The memory range must refer |
| * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. |
| * |
| * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If |
| * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS |
| * must be non-zero. Additionally, \p hStream must be associated with a device that has a |
| * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
| * |
| * The start address and end address of the memory range will be rounded down and rounded up |
| * respectively to be aligned to CPU page size before the prefetch operation is enqueued |
| * in the stream. |
| * |
| * If no physical memory has been allocated for this region, then this memory region |
| * will be populated and mapped on the destination device. If there's insufficient |
| * memory to prefetch the desired region, the Unified Memory driver may evict pages from other |
| * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory |
| * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. |
| * |
| * By default, any mappings to the previous location of the migrated pages are removed and |
| * mappings for the new location are only set up on \p dstDevice. The exact behavior however |
| * also depends on the settings applied to this memory range via ::cuMemAdvise as described |
| * below: |
| * |
| * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, |
| * then that subset will create a read-only copy of the pages on \p dstDevice. |
| * |
| * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory |
| * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the |
| * preferred location of any pages in the memory range. |
| * |
| * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, |
| * then mappings to those pages from all the appropriate processors are updated to |
| * refer to the new location if establishing such a mapping is possible. Otherwise, |
| * those mappings are cleared. |
| * |
| * Note that this API is not required for functionality and only serves to improve performance |
| * by allowing the application to migrate data to a suitable location before it is accessed. |
| * Memory accesses to this range are always coherent and are allowed even when the data is |
| * actively being migrated. |
| * |
| * Note that this function is asynchronous with respect to the host and all work |
| * on other devices. |
| * |
| * \param devPtr - Pointer to be prefetched |
| * \param count - Size in bytes |
| * \param dstDevice - Destination device to prefetch to |
| * \param hStream - Stream to enqueue prefetch operation |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, |
| * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, |
| * ::cudaMemPrefetchAsync |
| */ |
| CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); |
| |
| /** |
| * \brief Advise about the usage of a given memory range |
| * |
| * Advise the Unified Memory subsystem about the usage pattern for the memory range |
| * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory |
| * range will be rounded down and rounded up respectively to be aligned to CPU page size before the |
| * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged |
| * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable |
| * memory provided it represents a valid, host-accessible region of memory and all additional constraints |
| * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable |
| * memory range results in an error being returned. |
| * |
| * The \p advice parameter can take the following values: |
| * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read |
| * from and only occasionally written to. Any read accesses from any processor to this region will create a |
| * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync |
| * is called on this region, it will create a read-only copy of the data on the destination processor. |
| * If any processor writes to this region, all copies of the corresponding page will be invalidated |
| * except for the one where the write occurred. The \p device argument is ignored for this advice. |
| * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU |
| * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
| * Also, if a context is created on a device that does not have the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until |
| * all such contexts are destroyed. |
| * If the memory region refers to valid system-allocated pageable memory, then the accessing device must |
| * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only |
| * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the |
| * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice |
| * will not create a read-only copy when that device accesses this memory region. |
| * |
| * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the |
| * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated |
| * copies of the data will be collapsed into a single copy. The location for the collapsed |
| * copy will be the preferred location if the page has a preferred location and one of the read-duplicated |
| * copies was resident at that location. Otherwise, the location chosen is arbitrary. |
| * |
| * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the |
| * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the |
| * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the |
| * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location |
| * does not cause data to migrate to that location immediately. Instead, it guides the migration policy |
| * when a fault occurs on that memory region. If the data is already in its preferred location and the |
| * faulting processor can establish a mapping without requiring the data to be migrated, then |
| * data migration will be avoided. On the other hand, if the data is not in its preferred location |
| * or if a direct mapping cannot be established, then it will be migrated to the processor accessing |
| * it. It is important to note that setting the preferred location does not prevent data prefetching |
| * done using ::cuMemPrefetchAsync. |
| * Having a preferred location can override the page thrash detection and resolution logic in the Unified |
| * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device |
| * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But |
| * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. |
| * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the |
| * policies associated with that advice will override the policies of this advice, unless read accesses from |
| * \p device will not result in a read-only copy being created on that device as outlined in the description for |
| * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. |
| * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero |
| * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has |
| * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, |
| * then this call has no effect. Note however that this behavior may change in the future. |
| * |
| * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION |
| * and changes the preferred location to none. |
| * |
| * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. |
| * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then |
| * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. |
| * This advice does not cause data migration and has no impact on the location of the data per se. Instead, |
| * it causes the data to always be mapped in the specified processor's page tables, as long as the |
| * location of the data permits a mapping to be established. If the data gets migrated for any reason, |
| * the mappings are updated accordingly. |
| * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. |
| * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the |
| * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data |
| * over to the other GPUs is not as important because the accesses are infrequent and the overhead of |
| * migration may be too high. But preventing faults can still help improve performance, and so having |
| * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated |
| * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the |
| * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the |
| * page in host memory. |
| * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the |
| * policies associated with that advice will override the policies of this advice. Additionally, if the |
| * preferred location of this memory region or any subset of it is also \p device, then the policies |
| * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. |
| * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero |
| * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has |
| * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, |
| * then this call has no effect. |
| * |
| * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to |
| * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. |
| * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero |
| * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has |
| * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, |
| * then this call has no effect. |
| * |
| * \param devPtr - Pointer to memory to set the advice for |
| * \param count - Size in bytes of the memory range |
| * \param advice - Advice to be applied for the specified memory range |
| * \param device - Device to apply the advice for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, |
| * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, |
| * ::cudaMemAdvise |
| */ |
| CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); |
| |
| /** |
| * \brief Query an attribute of a given memory range |
| * |
| * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The |
| * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via |
| * __managed__ variables. |
| * |
| * The \p attribute parameter can take the following values: |
| * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted |
| * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given |
| * memory range have read-duplication enabled, or 0 otherwise. |
| * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be |
| * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device |
| * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU |
| * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID |
| * if either all the pages don't have the same preferred location or some of the pages don't have a |
| * preferred location at all. Note that the actual location of the pages in the memory range at the time of |
| * the query may be different from the preferred location. |
| * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted |
| * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned |
| * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. |
| * If any device does not have that advice set for the entire memory range, that device will not be included. |
| * If \p data is larger than the number of devices that have that advice set for that memory range, |
| * CU_DEVICE_INVALID will be returned in all the extra space provided. For example, if \p dataSize is 12 |
| * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be |
| * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have |
| * that advice set, then only as many devices will be returned as can fit in the array. There is no |
| * guarantee on which specific devices will be returned, however. |
| * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be |
| * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location |
| * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be |
| * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU |
| * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not |
| * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the |
| * last location that the application requested to prefetch the memory range to. It gives no indication as to |
| * whether the prefetch operation to that location has completed or even begun. |
| * |
| * \param data - A pointer to a memory location where the result |
| * of the attribute query will be written to. |
| * \param dataSize - Size of \p data in bytes |
| * \param attribute - The attribute to query |
| * \param devPtr - Start of the range to query |
| * \param count - Size of the range to query |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * \note_async |
| * \note_null_stream |
| * |
| * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, |
| * ::cuMemAdvise, |
| * ::cudaMemRangeGetAttribute |
| */ |
| CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); |
| |
| /** |
| * \brief Query attributes of a given memory range. |
| * |
| * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The |
| * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via |
| * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes |
| * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. |
| * The results of the query will be stored in \p data. |
| * |
| * The list of supported attributes is given below. Please refer to ::cuMemRangeGetAttribute for |
| * attribute descriptions and restrictions. |
| * |
| * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY |
| * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION |
| * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY |
| * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION |
| * |
| * \param data - A two-dimensional array containing pointers to memory |
| * locations where the result of each attribute query will be written to. |
| * \param dataSizes - Array containing the sizes of each result |
| * \param attributes - An array of attributes to query |
| * (numAttributes and the number of attributes in this array should match) |
| * \param numAttributes - Number of attributes to query |
| * \param devPtr - Start of the range to query |
| * \param count - Size of the range to query |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, |
| * ::cuMemPrefetchAsync, |
| * ::cudaMemRangeGetAttributes |
| */ |
| CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); |
| |
| /** |
| * \brief Set attributes on a previously allocated memory region |
| * |
| * The supported attributes are: |
| * |
| * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: |
| * |
| * A boolean attribute that can either be set (1) or unset (0). When set, |
| * the region of memory that \p ptr points to is guaranteed to always synchronize |
| * memory operations that are synchronous. If there are some previously initiated |
| * synchronous memory operations that are pending when this attribute is set, the |
| * function does not return until those memory operations are complete. |
| * See further documentation in the section titled "API synchronization behavior" |
| * to learn more about cases when synchronous memory operations can |
| * exhibit asynchronous behavior. |
| * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. |
| * |
| * \param value - Pointer to memory containing the value to be set |
| * \param attribute - Pointer attribute to set |
| * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa ::cuPointerGetAttribute, |
| * ::cuPointerGetAttributes, |
| * ::cuMemAlloc, |
| * ::cuMemFree, |
| * ::cuMemAllocHost, |
| * ::cuMemFreeHost, |
| * ::cuMemHostAlloc, |
| * ::cuMemHostRegister, |
| * ::cuMemHostUnregister |
| */ |
| CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); |
| |
| /** |
| * \brief Returns information about a pointer. |
| * |
| * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): |
| * |
| * - ::CU_POINTER_ATTRIBUTE_CONTEXT |
| * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE |
| * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER |
| * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER |
| * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS |
| * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID |
| * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED |
| * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL |
| * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR |
| * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE |
| * - ::CU_POINTER_ATTRIBUTE_MAPPED |
| * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE |
| * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES |
| * |
| * \param numAttributes - Number of attributes to query |
| * \param attributes - An array of attributes to query |
| * (numAttributes and the number of attributes in this array should match) |
| * \param data - A two-dimensional array containing pointers to memory |
| * locations where the result of each attribute query will be written to. |
| * \param ptr - Pointer to query |
| * |
| * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr |
| * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values |
| * and CUDA_SUCCESS is returned. |
| * |
| * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA |
| * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuPointerGetAttribute, |
| * ::cuPointerSetAttribute, |
| * ::cudaPointerGetAttributes |
| */ |
| CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); |
| |
| /** @} */ /* END CUDA_UNIFIED */ |
| |
| /** |
| * \defgroup CUDA_STREAM Stream Management |
| * |
| * ___MANBRIEF___ stream management functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the stream management functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Create a stream |
| * |
| * Creates a stream and returns a handle in \p phStream. The \p Flags argument |
| * determines behaviors of the stream. |
| * |
| * Valid values for \p Flags are: |
| * - ::CU_STREAM_DEFAULT: Default stream creation flag. |
| * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created |
| * stream may run concurrently with work in stream 0 (the NULL stream), and that |
| * the created stream should perform no implicit synchronization with stream 0. |
| * |
| * \param phStream - Returned newly created stream |
| * \param Flags - Parameters for stream creation |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuStreamDestroy, |
| * ::cuStreamCreateWithPriority, |
| * ::cuStreamGetPriority, |
| * ::cuStreamGetFlags, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamAddCallback, |
| * ::cudaStreamCreate, |
| * ::cudaStreamCreateWithFlags |
| */ |
| CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); |
| |
| /** |
| * \brief Create a stream with the given priority |
| * |
| * Creates a stream with the specified priority and returns a handle in \p phStream. |
| * This API alters the scheduler priority of work in the stream. Work in a higher |
| * priority stream may preempt work already executing in a low priority stream. |
| * |
| * \p priority follows a convention where lower numbers represent higher priorities. |
| * '0' represents default priority. The range of meaningful numerical priorities can |
| * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is |
| * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, |
| * it will automatically be clamped to the lowest or the highest number in the range. |
| * |
| * \param phStream - Returned newly created stream |
| * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of |
| * valid flags |
| * \param priority - Stream priority. Lower numbers represent higher priorities. |
| * See ::cuCtxGetStreamPriorityRange for more information about |
| * meaningful stream priorities that can be passed. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \note Stream priorities are supported only on GPUs |
| * with compute capability 3.5 or higher. |
| * |
| * \note In the current implementation, only compute kernels launched in |
| * priority streams are affected by the stream's priority. Stream priorities have |
| * no effect on host-to-device and device-to-host memory operations. |
| * |
| * \sa ::cuStreamDestroy, |
| * ::cuStreamCreate, |
| * ::cuStreamGetPriority, |
| * ::cuCtxGetStreamPriorityRange, |
| * ::cuStreamGetFlags, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamAddCallback, |
| * ::cudaStreamCreateWithPriority |
| */ |
| CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); |
| |
| |
| /** |
| * \brief Query the priority of a given stream |
| * |
| * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority |
| * and return the priority in \p priority. Note that if the stream was created with a |
| * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, |
| * this function returns the clamped priority. |
| * See ::cuStreamCreateWithPriority for details about priority clamping. |
| * |
| * \param hStream - Handle to the stream to be queried |
| * \param priority - Pointer to a signed integer in which the stream's priority is returned |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuStreamDestroy, |
| * ::cuStreamCreate, |
| * ::cuStreamCreateWithPriority, |
| * ::cuCtxGetStreamPriorityRange, |
| * ::cuStreamGetFlags, |
| * ::cudaStreamGetPriority |
| */ |
| CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); |
| |
| /** |
| * \brief Query the flags of a given stream |
| * |
| * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority |
| * and return the flags in \p flags. |
| * |
| * \param hStream - Handle to the stream to be queried |
| * \param flags - Pointer to an unsigned integer in which the stream's flags are returned |
| * The value returned in \p flags is a logical 'OR' of all flags that |
| * were used while creating this stream. See ::cuStreamCreate for the list |
| * of valid flags |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa ::cuStreamDestroy, |
| * ::cuStreamCreate, |
| * ::cuStreamGetPriority, |
| * ::cudaStreamGetFlags |
| */ |
| CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); |
| |
| /** |
| * \brief Query the context associated with a stream |
| * |
| * Returns the CUDA context that the stream is associated with. |
| * |
| * The stream handle \p hStream can refer to any of the following: |
| * <ul> |
| * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate |
| * and ::cuStreamCreateWithPriority, or their runtime API equivalents such as |
| * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. |
| * The returned context is the context that was active in the calling thread when the |
| * stream was created. Passing an invalid handle will result in undefined behavior.</li> |
| * <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and |
| * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, |
| * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. |
| * Specifying any of the special handles will return the context current to the |
| * calling thread. If no context is current to the calling thread, |
| * ::CUDA_ERROR_INVALID_CONTEXT is returned.</li> |
| * </ul> |
| * |
| * \param hStream - Handle to the stream to be queried |
| * \param pctx - Returned context associated with the stream |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuStreamDestroy, |
| * ::cuStreamCreateWithPriority, |
| * ::cuStreamGetPriority, |
| * ::cuStreamGetFlags, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamAddCallback, |
| * ::cudaStreamCreate, |
| * ::cudaStreamCreateWithFlags |
| */ |
| CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); |
| |
| /** |
| * \brief Make a compute stream wait on an event |
| * |
| * Makes all future work submitted to \p hStream wait for all work captured in |
| * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. |
| * The synchronization will be performed efficiently on the device when applicable. |
| * \p hEvent may be from a different context or device than \p hStream. |
| * |
| * \param hStream - Stream to wait |
| * \param hEvent - Event to wait on (may not be NULL) |
| * \param Flags - Parameters for the operation (must be 0) |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreate, |
| * ::cuEventRecord, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamAddCallback, |
| * ::cuStreamDestroy, |
| * ::cudaStreamWaitEvent |
| */ |
| CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); |
| |
| /** |
| * \brief Add a callback to a compute stream |
| * |
| * \note This function is slated for eventual deprecation and removal. If |
| * you do not require the callback to execute in case of a device error, |
| * consider using ::cuLaunchHostFunc. Additionally, this function is not |
| * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike |
| * ::cuLaunchHostFunc. |
| * |
| * Adds a callback to be called on the host after all currently enqueued |
| * items in the stream have completed. For each |
| * cuStreamAddCallback call, the callback will be executed exactly once. |
| * The callback will block later work in the stream until it is finished. |
| * |
| * The callback may be passed ::CUDA_SUCCESS or an error code. In the event |
| * of a device error, all subsequently executed callbacks will receive an |
| * appropriate ::CUresult. |
| * |
| * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API |
| * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any |
| * synchronization that may depend on outstanding device work or other callbacks |
| * that are not mandated to run earlier. Callbacks without a mandated order |
| * (in independent streams) execute in undefined order and may be serialized. |
| * |
| * For the purposes of Unified Memory, callback execution makes a number of |
| * guarantees: |
| * <ul> |
| * <li>The callback stream is considered idle for the duration of the |
| * callback. Thus, for example, a callback may always use memory attached |
| * to the callback stream.</li> |
| * <li>The start of execution of a callback has the same effect as |
| * synchronizing an event recorded in the same stream immediately prior to |
| * the callback. It thus synchronizes streams which have been "joined" |
| * prior to the callback.</li> |
| * <li>Adding device work to any stream does not have the effect of making |
| * the stream active until all preceding host functions and stream callbacks |
| * have executed. Thus, for |
| * example, a callback might use global attached memory even if work has |
| * been added to another stream, if the work has been ordered behind the |
| * callback with an event.</li> |
| * <li>Completion of a callback does not cause a stream to become |
| * active except as described above. The callback stream will remain idle |
| * if no device work follows the callback, and will remain idle across |
| * consecutive callbacks without device work in between. Thus, for example, |
| * stream synchronization can be done by signaling from a callback at the |
| * end of the stream.</li> |
| * </ul> |
| * |
| * \param hStream - Stream to add callback to |
| * \param callback - The function to call once preceding stream operations are complete |
| * \param userData - User specified data to be passed to the callback function |
| * \param flags - Reserved for future use, must be 0 |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreate, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamDestroy, |
| * ::cuMemAllocManaged, |
| * ::cuStreamAttachMemAsync, |
| * ::cuStreamLaunchHostFunc, |
| * ::cudaStreamAddCallback |
| */ |
| CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); |
| |
| /** |
| * \brief Begins graph capture on a stream |
| * |
| * Begin graph capture on \p hStream. When a stream is in capture mode, all operations |
| * pushed into the stream will not be executed, but will instead be captured into |
| * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated |
| * if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the same stream in which |
| * it was initiated, and it may only be initiated if the stream is not already in capture |
| * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id |
| * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. |
| * |
| * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be |
| * called on this stream from the same thread. |
| * |
| * \param hStream - Stream in which to initiate capture |
| * \param mode - Controls the interaction of this capture sequence with other API |
| * calls that are potentially unsafe. For more details see |
| * ::cuThreadExchangeStreamCaptureMode. |
| * |
| * \note Kernels captured using this API must not use texture and surface references. |
| * Reading or writing through any texture or surface reference is undefined |
| * behavior. This restriction does not apply to texture and surface objects. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuStreamCreate, |
| * ::cuStreamIsCapturing, |
| * ::cuStreamEndCapture, |
| * ::cuThreadExchangeStreamCaptureMode |
| */ |
| CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); |
| |
| /** |
| * \brief Swaps the stream capture interaction mode for a thread |
| * |
| * Sets the calling thread's stream capture interaction mode to the value contained |
| * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To |
| * facilitate deterministic behavior across function or module boundaries, callers |
| * are encouraged to use this API in a push-pop fashion: \code |
| CUstreamCaptureMode mode = desiredMode; |
| cuThreadExchangeStreamCaptureMode(&mode); |
| ... |
| cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode |
| * \endcode |
| * |
| * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call |
| * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is |
| * not enqueued asynchronously to a stream, and is not observed by stream capture. |
| * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture |
| * depended on the allocation being replayed whenever the graph is launched, the |
| * captured graph would be invalid. |
| * |
| * Therefore, stream capture places restrictions on API calls that can be made within |
| * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This |
| * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. |
| * |
| * A thread's mode is one of the following: |
| * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has |
| * an ongoing capture sequence that was not initiated with |
| * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread |
| * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, |
| * this thread is prohibited from potentially unsafe API calls. |
| * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture |
| * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited |
| * from potentially unsafe API calls. Concurrent capture sequences in other threads |
| * are ignored. |
| * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially |
| * unsafe API calls. Note that the thread is still prohibited from API calls which |
| * necessarily conflict with stream capture, for example, attempting ::cuEventQuery |
| * on an event that was last recorded inside a capture sequence. |
| * |
| * \param mode - Pointer to mode value to swap with the current mode |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuStreamBeginCapture |
| */ |
| CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); |
| |
| /** |
| * \brief Ends capture on a stream, returning the captured graph |
| * |
| * End capture on \p hStream, returning the captured graph via \p phGraph. |
| * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. |
| * If capture was invalidated, due to a violation of the rules of stream capture, then |
| * a NULL graph will be returned. |
| * |
| * If the \p mode argument to ::cuStreamBeginCapture was not |
| * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as |
| * ::cuStreamBeginCapture. |
| * |
| * \param hStream - Stream on which to end capture |
| * \param phGraph - Returns the captured graph, or NULL if capture was invalidated |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD |
| * \notefnerr |
| * |
| * \sa |
| * ::cuStreamCreate, |
| * ::cuStreamBeginCapture, |
| * ::cuStreamIsCapturing |
| */ |
| CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); |
| |
| /** |
| * \brief Returns a stream's capture status |
| * |
| * Return the capture status of \p hStream via \p captureStatus. After a successful |
| * call, \p *captureStatus will contain one of the following: |
| * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. |
| * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. |
| * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error |
| * has invalidated the capture sequence. The capture sequence must be terminated |
| * with ::cuStreamEndCapture on the stream where it was initiated in order to |
| * continue using \p hStream. |
| * |
| * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while |
| * a blocking stream in the same context is capturing, it will return |
| * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified |
| * after the call. The blocking stream capture is not invalidated. |
| * |
| * When a blocking stream is capturing, the legacy stream is in an |
| * unusable state until the blocking stream capture is terminated. The legacy |
| * stream is not supported for stream capture, but attempted use would have an |
| * implicit dependency on the capturing stream(s). |
| * |
| * \param hStream - Stream to query |
| * \param captureStatus - Returns the stream's capture status |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT |
| * \notefnerr |
| * |
| * \sa |
| * ::cuStreamCreate, |
| * ::cuStreamBeginCapture, |
| * ::cuStreamEndCapture |
| */ |
| CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); |
| |
| /** |
| * \brief Query capture status of a stream |
| * |
| * Query the capture status of a stream and get an id for |
| * the capture sequence, which is unique over the lifetime of the process. |
| * |
| * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created |
| * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. |
| * |
| * A valid id is returned only if both of the following are true: |
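| * - the call returns ::CUDA_SUCCESS |
| * - \p *captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE |
| * |
| * \param hStream - Stream to query |
| * \param captureStatus - Returns the stream's capture status |
| * \param id - Returns the id of the capture sequence; valid only under the conditions listed above |
| * |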
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT |
| * \notefnerr |
| * |
| * \sa |
| * ::cuStreamBeginCapture, |
| * ::cuStreamIsCapturing |
| */ |
| CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus, cuuint64_t *id); |
| |
| /** |
| * \brief Attach memory to a stream asynchronously |
| * |
| * Enqueues an operation in \p hStream to specify stream association of |
| * \p length bytes of memory starting from \p dptr. This function is a |
| * stream-ordered operation, meaning that it is dependent on, and will |
| * only take effect when, previous work in the stream has completed. Any |
| * previous association is automatically replaced. |
| * |
| * \p dptr must point to one of the following types of memories: |
| * - managed memory declared using the __managed__ keyword or allocated with |
| * ::cuMemAllocManaged. |
| * - a valid host-accessible region of system-allocated pageable memory. This |
| * type of memory may only be specified if the device associated with the |
| * stream reports a non-zero value for the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. |
| * |
| * For managed allocations, \p length must be either zero or the entire |
| * allocation's size. Both indicate that the entire allocation's stream |
| * association is being changed. Currently, it is not possible to change stream |
| * association for a portion of a managed allocation. |
| * |
| * For pageable host allocations, \p length must be non-zero. |
| * |
| * The stream association is specified using \p flags which must be |
| * one of ::CUmemAttach_flags. |
| * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed |
| * by any stream on any device. |
| * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee |
| * that it won't access the memory on the device from any stream on a device that |
| * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. |
| * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with |
| * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, |
| * the program makes a guarantee that it will only access the memory on the device |
| * from \p hStream. It is illegal to attach singly to the NULL stream, because the |
| * NULL stream is a virtual global stream and not a specific stream. An error will |
| * be returned in this case. |
| * |
| * When memory is associated with a single stream, the Unified Memory system will |
| * allow CPU access to this memory region so long as all operations in \p hStream |
| * have completed, regardless of whether other streams are active. In effect, |
| * this constrains exclusive ownership of the managed memory region by |
| * an active GPU to per-stream activity instead of whole-GPU activity. |
| * |
| * Accessing memory on the device from streams that are not associated with |
| * it will produce undefined results. No error checking is performed by the |
| * Unified Memory system to ensure that kernels launched into other streams |
| * do not access this region. |
| * |
| * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync |
| * via events, synchronization or other means to ensure legal access to memory |
| * at all times. Data visibility and coherency will be changed appropriately |
| * for all kernels which follow a stream-association change. |
| * |
| * If \p hStream is destroyed while data is associated with it, the association is |
| * removed and the association reverts to the default visibility of the allocation |
| * as specified at ::cuMemAllocManaged. For __managed__ variables, the default |
| * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an |
| * asynchronous operation, and as a result, the change to default association won't |
| * happen until all work in the stream has completed. |
| * |
| * \param hStream - Stream in which to enqueue the attach operation |
| * \param dptr - Pointer to memory (must be a pointer to managed memory or |
| * to a valid host-accessible region of system-allocated |
| * pageable memory) |
| * \param length - Length of memory |
| * \param flags - Must be one of ::CUmemAttach_flags |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreate, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamDestroy, |
| * ::cuMemAllocManaged, |
| * ::cudaStreamAttachMemAsync |
| */ |
| CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); |
| |
| /** |
| * \brief Determine status of a compute stream |
| * |
| * Returns ::CUDA_SUCCESS if all operations in the stream specified by |
| * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. |
| * |
| * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS |
| * is equivalent to having called ::cuStreamSynchronize(). |
| * |
| * \param hStream - Stream to query status of |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_READY |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreate, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamDestroy, |
| * ::cuStreamSynchronize, |
| * ::cuStreamAddCallback, |
| * ::cudaStreamQuery |
| */ |
| CUresult CUDAAPI cuStreamQuery(CUstream hStream); |
| |
| /** |
| * \brief Wait until a stream's tasks are completed |
| * |
| * Waits until the device has completed all operations in the stream specified |
| * by \p hStream. If the context was created with the |
| * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the |
| * stream is finished with all of its tasks. |
| * |
| * \param hStream - Stream to wait for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreate, |
| * ::cuStreamDestroy, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamQuery, |
| * ::cuStreamAddCallback, |
| * ::cudaStreamSynchronize |
| */ |
| CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); |
| |
| /** |
| * \brief Destroys a stream |
| * |
| * Destroys the stream specified by \p hStream. |
| * |
| * In case the device is still doing work in the stream \p hStream |
| * when ::cuStreamDestroy() is called, the function will return immediately |
| * and the resources associated with \p hStream will be released automatically |
| * once the device has completed all work in \p hStream. |
| * |
| * \param hStream - Stream to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreate, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamAddCallback, |
| * ::cudaStreamDestroy |
| */ |
| CUresult CUDAAPI cuStreamDestroy(CUstream hStream); |
| |
| /** |
| * \brief Copies attributes from source stream to destination stream |
| * |
| * Copies attributes from source stream \p src to destination stream \p dst. |
| * Both streams must have the same context. |
| * |
| * \param[out] dst Destination stream |
| * \param[in] src Source stream |
| * For list of attributes see ::CUstreamAttrID |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::CUaccessPolicyWindow |
| */ |
| CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); |
| |
| /** |
| * \brief Queries stream attribute. |
| * |
| * Queries attribute \p attr from \p hStream and stores it in corresponding |
| * member of \p value_out. |
| * |
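| * \param[in] hStream Stream to query |
| * \param[in] attr Attribute to query (see ::CUstreamAttrID) |
| * \param[out] value_out Receives the queried value in the member corresponding to \p attr |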
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa |
| * ::CUaccessPolicyWindow |
| */ |
| CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, |
| CUstreamAttrValue *value_out); |
| |
| /** |
| * \brief Sets stream attribute. |
| * |
| * Sets attribute \p attr on \p hStream from corresponding attribute of |
| * \p value. The updated attribute will be applied to subsequent work |
| * submitted to the stream. It will not affect previously submitted work. |
| * |
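| * \param[out] hStream Stream on which to set the attribute |
| * \param[in] attr Attribute to set (see ::CUstreamAttrID) |
| * \param[in] value New value, read from the member corresponding to \p attr |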
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa |
| * ::CUaccessPolicyWindow |
| */ |
| CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, |
| const CUstreamAttrValue *value); |
| |
| /** @} */ /* END CUDA_STREAM */ |
| |
| |
| /** |
| * \defgroup CUDA_EVENT Event Management |
| * |
| * ___MANBRIEF___ event management functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the event management functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Creates an event |
| * |
| * Creates an event *phEvent for the current context with the flags specified via |
| * \p Flags. Valid flags include: |
| * - ::CU_EVENT_DEFAULT: Default event creation flag. |
| * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking |
| * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on |
| * an event created with this flag will block until the event has actually |
| * been recorded. |
| * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need |
| * to record timing data. Events created with this flag specified and |
| * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best |
| * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). |
| * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an |
| * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must |
| * be specified along with ::CU_EVENT_DISABLE_TIMING. |
| * |
| * \param phEvent - Returns newly created event |
| * \param Flags - Event creation flags |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \notefnerr |
| * |
| * \sa |
| * ::cuEventRecord, |
| * ::cuEventQuery, |
| * ::cuEventSynchronize, |
| * ::cuEventDestroy, |
| * ::cuEventElapsedTime, |
| * ::cudaEventCreate, |
| * ::cudaEventCreateWithFlags |
| */ |
| CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); |
| |
| /** |
| * \brief Records an event |
| * |
| * Captures in \p hEvent the contents of \p hStream at the time of this call. |
| * \p hEvent and \p hStream must be from the same context. |
| * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then |
| * examine or wait for completion of the work that was captured. Uses of |
| * \p hStream after this call do not modify \p hEvent. See note on default |
| * stream behavior for what is captured in the default case. |
| * |
| * ::cuEventRecord() can be called multiple times on the same event and |
| * will overwrite the previously captured state. Other APIs such as |
| * ::cuStreamWaitEvent() use the most recently captured state at the time |
| * of the API call, and are not affected by later calls to |
| * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an |
| * event represents an empty set of work, so for example ::cuEventQuery() |
| * would return ::CUDA_SUCCESS. |
| * |
| * \param hEvent - Event to record |
| * \param hStream - Stream to record event for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuEventCreate, |
| * ::cuEventQuery, |
| * ::cuEventSynchronize, |
| * ::cuStreamWaitEvent, |
| * ::cuEventDestroy, |
| * ::cuEventElapsedTime, |
| * ::cudaEventRecord |
| */ |
| CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); |
| |
| /** |
| * \brief Queries an event's status |
| * |
| * Queries the status of all work currently captured by \p hEvent. See |
| * ::cuEventRecord() for details on what is captured by an event. |
| * |
| * Returns ::CUDA_SUCCESS if all captured work has been completed, or |
| * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. |
| * |
| * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS |
| * is equivalent to having called ::cuEventSynchronize(). |
| * |
| * \param hEvent - Event to query |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_READY |
| * \notefnerr |
| * |
| * \sa ::cuEventCreate, |
| * ::cuEventRecord, |
| * ::cuEventSynchronize, |
| * ::cuEventDestroy, |
| * ::cuEventElapsedTime, |
| * ::cudaEventQuery |
| */ |
| CUresult CUDAAPI cuEventQuery(CUevent hEvent); |
| |
| /** |
| * \brief Waits for an event to complete |
| * |
| * Waits until the completion of all work currently captured in \p hEvent. |
| * See ::cuEventRecord() for details on what is captured by an event. |
| * |
| * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC |
| * flag will cause the calling CPU thread to block until the event has |
| * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has |
| * not been set, then the CPU thread will busy-wait until the event has |
| * been completed by the device. |
| * |
| * \param hEvent - Event to wait for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuEventCreate, |
| * ::cuEventRecord, |
| * ::cuEventQuery, |
| * ::cuEventDestroy, |
| * ::cuEventElapsedTime, |
| * ::cudaEventSynchronize |
| */ |
| CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); |
| |
| /** |
| * \brief Destroys an event |
| * |
| * Destroys the event specified by \p hEvent. |
| * |
| * An event may be destroyed before it is complete (i.e., while |
| * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the |
| * call does not block on completion of the event, and any associated |
| * resources will automatically be released asynchronously at completion. |
| * |
| * \param hEvent - Event to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuEventCreate, |
| * ::cuEventRecord, |
| * ::cuEventQuery, |
| * ::cuEventSynchronize, |
| * ::cuEventElapsedTime, |
| * ::cudaEventDestroy |
| */ |
| CUresult CUDAAPI cuEventDestroy(CUevent hEvent); |
| |
| /** |
| * \brief Computes the elapsed time between two events |
| * |
| * Computes the elapsed time between two events (in milliseconds with a |
| * resolution of around 0.5 microseconds). |
| * |
| * If either event was last recorded in a non-NULL stream, the resulting time |
| * may be greater than expected (even if both used the same stream handle). This |
| * happens because the ::cuEventRecord() operation takes place asynchronously |
| * and there is no guarantee that the measured latency is actually just between |
| * the two events. Any number of other different stream operations could execute |
| * in between the two measured events, thus altering the timing in a significant |
| * way. |
| * |
| * If ::cuEventRecord() has not been called on either event then |
| * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called |
| * on both events but one or both of them has not yet been completed (that is, |
| * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the |
| * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with |
| * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return |
| * ::CUDA_ERROR_INVALID_HANDLE. |
| * |
| * \param pMilliseconds - Time between \p hStart and \p hEnd in ms |
| * \param hStart - Starting event |
| * \param hEnd - Ending event |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_READY |
| * \notefnerr |
| * |
| * \sa ::cuEventCreate, |
| * ::cuEventRecord, |
| * ::cuEventQuery, |
| * ::cuEventSynchronize, |
| * ::cuEventDestroy, |
| * ::cudaEventElapsedTime |
| */ |
| CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); |
| |
| /** @} */ /* END CUDA_EVENT */ |
| |
| /** |
| * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability |
| * |
| * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the external resource interoperability functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Imports an external memory object |
| * |
| * Imports an externally allocated memory object and returns |
| * a handle to that in \p extMem_out. |
| * |
| * The properties of the handle being imported must be described in |
| * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure |
| * is defined as follows: |
| * |
| * \code |
| typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { |
| CUexternalMemoryHandleType type; |
| union { |
| int fd; |
| struct { |
| void *handle; |
| const void *name; |
| } win32; |
| const void *nvSciBufObject; |
| } handle; |
| unsigned long long size; |
| unsigned int flags; |
| } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; |
| * \endcode |
| * |
| * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type |
| * of handle being imported. ::CUexternalMemoryHandleType is |
| * defined as: |
| * |
| * \code |
| typedef enum CUexternalMemoryHandleType_enum { |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, |
| CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 |
| } CUexternalMemoryHandleType; |
| * \endcode |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid |
| * file descriptor referencing a memory object. Ownership of |
| * the file descriptor is transferred to the CUDA driver when the |
| * handle is imported successfully. Performing any operations on the |
| * file descriptor after it is imported results in undefined behavior. |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one |
| * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be |
| * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle |
| * is not NULL, then it must represent a valid shared NT handle that |
| * references a memory object. Ownership of this handle is |
| * not transferred to CUDA after the import operation, so the |
| * application must release the handle using the appropriate system |
| * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must point to a NULL-terminated array of |
| * UTF-16 characters that refers to a memory object. |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must |
| * be non-NULL and |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
| * must be NULL. The handle specified must be a globally shared KMT |
| * handle. This handle does not hold a reference to the underlying |
| * object, and thus will be invalid when all references to the |
| * memory object are destroyed. |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one |
| * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be |
| * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle |
| * is not NULL, then it must represent a valid shared NT handle that |
| * is returned by ID3D12Device::CreateSharedHandle when referring to a |
| * ID3D12Heap object. This handle holds a reference to the underlying |
| * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must point to a NULL-terminated array of |
| * UTF-16 characters that refers to a ID3D12Heap object. |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one |
| * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be |
| * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle |
| * is not NULL, then it must represent a valid shared NT handle that |
| * is returned by ID3D12Device::CreateSharedHandle when referring to a |
| * ID3D12Resource object. This handle holds a reference to the |
| * underlying object. If |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must point to a NULL-terminated array of |
| * UTF-16 characters that refers to a ID3D12Resource object. |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must |
| * represent a valid shared NT handle that is returned by |
| * IDXGIResource1::CreateSharedHandle when referring to a |
| * ID3D11Resource object. If |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must point to a NULL-terminated array of |
| * UTF-16 characters that refers to a ID3D11Resource object. |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must |
| * represent a valid shared KMT handle that is returned by |
| * IDXGIResource::GetSharedHandle when referring to a |
| * ID3D11Resource object and |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name |
| * must be NULL. |
| * |
| * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL |
| * and reference a valid NvSciBuf object. |
| * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the |
| * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync |
| * as appropriate barriers to maintain coherence between CUDA and the other drivers. |
| * |
| * The size of the memory object must be specified in |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. |
| * |
| * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in |
| * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the |
| * resource is a dedicated resource. The definition of what constitutes a |
| * dedicated resource is outside the scope of this extension. |
| * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type |
| * is one of the following: |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT |
| * |
| * \param extMem_out - Returned handle to an external memory object |
| * \param memHandleDesc - Memory import handle descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the |
| * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges |
| * as well as appropriate Vulkan pipeline barriers to maintain coherence between |
| * CPU and GPU. For more information on these APIs, please refer to "Synchronization |
| * and Cache Control" chapter from Vulkan specification. |
| * |
| * \sa ::cuDestroyExternalMemory, |
| * ::cuExternalMemoryGetMappedBuffer, |
| * ::cuExternalMemoryGetMappedMipmappedArray |
| */ |
| CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); |
| |
| /** |
| * \brief Maps a buffer onto an imported memory object |
| * |
| * Maps a buffer onto an imported memory object and returns a device |
| * pointer in \p devPtr. |
| * |
| * The properties of the buffer being mapped must be described in |
| * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is |
| * defined as follows: |
| * |
| * \code |
| typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { |
| unsigned long long offset; |
| unsigned long long size; |
| unsigned int flags; |
| } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; |
| * \endcode |
| * |
| * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in |
| * the memory object where the buffer's base address is. |
| * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. |
| * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. |
| * |
| * The offset and size have to be suitably aligned to match the |
| * requirements of the external API. Mapping two buffers whose ranges |
| * overlap may or may not result in the same virtual address being |
| * returned for the overlapped portion. In such cases, the application |
| * must ensure that all accesses to that region from the GPU are |
| * volatile. Otherwise writes made via one address are not guaranteed |
| * to be visible via the other address, even if they're issued by the |
| * same thread. It is recommended that applications map the combined |
| * range instead of mapping separate buffers and then apply the |
| * appropriate offsets to the returned pointer to derive the |
| * individual buffers. |
| * |
| * The returned pointer \p devPtr must be freed using ::cuMemFree. |
| * |
| * \param devPtr - Returned device pointer to buffer |
| * \param extMem - Handle to external memory object |
| * \param bufferDesc - Buffer descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuImportExternalMemory, |
| * ::cuDestroyExternalMemory, |
| * ::cuExternalMemoryGetMappedMipmappedArray |
| */ |
| CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); |
| |
| /** |
| * \brief Maps a CUDA mipmapped array onto an external memory object |
| * |
| * Maps a CUDA mipmapped array onto an external memory object and |
| * returns a handle to it in \p mipmap. |
| * |
| * The properties of the CUDA mipmapped array being mapped must be |
| * described in \p mipmapDesc. The structure |
| * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: |
| * |
| * \code |
| typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { |
| unsigned long long offset; |
| CUDA_ARRAY3D_DESCRIPTOR arrayDesc; |
| unsigned int numLevels; |
| } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; |
| * \endcode |
| * |
| * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the |
| * offset in the memory object where the base level of the mipmap |
| * chain is. |
| * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes |
| * the format, dimensions and type of the base level of the mipmap |
| * chain. For further details on these parameters, please refer to the |
| * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped |
| * array is bound as a color target in the graphics API, then the flag |
| * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in |
| * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. |
| * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies |
| * the total number of levels in the mipmap chain. |
| * |
| * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then |
| * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. |
| * |
| * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. |
| * |
| * \param mipmap - Returned CUDA mipmapped array |
| * \param extMem - Handle to external memory object |
| * \param mipmapDesc - CUDA mipmapped array descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuImportExternalMemory, |
| * ::cuDestroyExternalMemory, |
| * ::cuExternalMemoryGetMappedBuffer |
| */ |
| CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); |
| |
| /** |
| * \brief Destroys an external memory object. |
| * |
| * Destroys the specified external memory object. Any existing buffers |
| * and CUDA mipmapped arrays mapped onto this object must no longer be |
| * used and must be explicitly freed using ::cuMemFree and |
| * ::cuMipmappedArrayDestroy respectively. |
| * |
| * \param extMem - External memory object to be destroyed |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuImportExternalMemory, |
| * ::cuExternalMemoryGetMappedBuffer, |
| * ::cuExternalMemoryGetMappedMipmappedArray |
| */ |
| CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); |
| |
| /** |
| * \brief Imports an external semaphore |
| * |
| * Imports an externally allocated synchronization object and returns |
| * a handle to that in \p extSem_out. |
| * |
| * The properties of the handle being imported must be described in |
| * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC structure |
| * is defined as follows: |
| * |
| * \code |
| typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { |
| CUexternalSemaphoreHandleType type; |
| union { |
| int fd; |
| struct { |
| void *handle; |
| const void *name; |
| } win32; |
| const void* nvSciSyncObj; |
| } handle; |
| unsigned int flags; |
| } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; |
| * \endcode |
| * |
| * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of |
| * handle being imported. ::CUexternalSemaphoreHandleType is defined |
| * as: |
| * |
| * \code |
| typedef enum CUexternalSemaphoreHandleType_enum { |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, |
| CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8 |
| } CUexternalSemaphoreHandleType; |
| * \endcode |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid |
| * file descriptor referencing a synchronization object. Ownership of |
| * the file descriptor is transferred to the CUDA driver when the |
| * handle is imported successfully. Performing any operations on the |
| * file descriptor after it is imported results in undefined behavior. |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one |
| * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be |
| * NULL. If |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
| * is not NULL, then it must represent a valid shared NT handle that |
| * references a synchronization object. Ownership of this handle is |
| * not transferred to CUDA after the import operation, so the |
| * application must release the handle using the appropriate system |
| * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must name a valid synchronization object. |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must |
| * be non-NULL and |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
| * must be NULL. The handle specified must be a globally shared KMT |
| * handle. This handle does not hold a reference to the underlying |
| * object, and thus will be invalid when all references to the |
| * synchronization object are destroyed. |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one |
| * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be |
| * NULL. If |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
| * is not NULL, then it must represent a valid shared NT handle that |
| * is returned by ID3D12Device::CreateSharedHandle when referring to a |
| * ID3D12Fence object. This handle holds a reference to the underlying |
| * object. If |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must name a valid synchronization object that |
| * refers to a valid ID3D12Fence object. |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
| * represents a valid shared NT handle that is returned by |
| * ID3D11Fence::CreateSharedHandle. If |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must name a valid synchronization object that |
| * refers to a valid ID3D11Fence object. |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj |
| * represents a valid NvSciSyncObj. |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
| * represents a valid shared NT handle that |
| * is returned by IDXGIResource1::CreateSharedHandle when referring to |
| * a IDXGIKeyedMutex object. If |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name |
| * is not NULL, then it must name a valid synchronization object that |
| * refers to a valid IDXGIKeyedMutex object. |
| * |
| * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle |
| * represents a valid shared KMT handle that |
| * is returned by IDXGIResource::GetSharedHandle when referring to |
| * a IDXGIKeyedMutex object and |
| * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. |
| * |
| * \param extSem_out - Returned handle to an external semaphore |
| * \param semHandleDesc - Semaphore import handle descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_NOT_SUPPORTED, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuDestroyExternalSemaphore, |
| * ::cuSignalExternalSemaphoresAsync, |
| * ::cuWaitExternalSemaphoresAsync |
| */ |
| CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); |
| |
| /** |
| * \brief Signals a set of external semaphore objects |
| * |
| * Enqueues a signal operation on a set of externally allocated |
| * semaphore objects in the specified stream. The operations will be |
| * executed when all prior operations in the stream complete. |
| * |
| * The exact semantics of signaling a semaphore depends on the type of |
| * the object. |
| * |
| * If the semaphore object is any one of the following types: |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
| * then signaling the semaphore will set it to the signaled state. |
| * |
| * If the semaphore object is any one of the following types: |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE |
| * then the semaphore will be set to the value specified in |
| * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. |
| * |
| * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC |
| * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence |
| * to a value that can be used by subsequent waiters of the same NvSciSync object |
| * to order operations with those currently submitted in \p stream. Such an update |
| * will overwrite previous contents of |
| * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, |
| * signaling such an external semaphore object causes appropriate memory synchronization |
| * operations to be performed over all external memory objects that are imported as |
| * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses |
| * made by other importers of the same set of NvSciBuf memory object(s) are coherent. |
| * These operations can be skipped by specifying the flag |
| * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a |
| * performance optimization when data coherency is not required. But specifying this |
| * flag in scenarios where data coherency is required results in undefined behavior. |
| * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, |
| * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in |
| * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return |
| * CUDA_ERROR_NOT_SUPPORTED. |
| * |
| * If the semaphore object is any one of the following types: |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT |
| * then the keyed mutex will be released with the key specified in |
| * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::keyedmutex::key. |
| * |
| * \param extSemArray - Set of external semaphores to be signaled |
| * \param paramsArray - Array of semaphore parameters |
| * \param numExtSems - Number of semaphores to signal |
| * \param stream - Stream to enqueue the signal operations in |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuImportExternalSemaphore, |
| * ::cuDestroyExternalSemaphore, |
| * ::cuWaitExternalSemaphoresAsync |
| */ |
| CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
| |
| /** |
| * \brief Waits on a set of external semaphore objects |
| * |
| * Enqueues a wait operation on a set of externally allocated |
| * semaphore objects in the specified stream. The operations will be |
| * executed when all prior operations in the stream complete. |
| * |
| * The exact semantics of waiting on a semaphore depends on the type |
| * of the object. |
| * |
| * If the semaphore object is any one of the following types: |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT |
| * then waiting on the semaphore will wait until the semaphore reaches |
| * the signaled state. The semaphore will then be reset to the |
| * unsignaled state. Therefore for every signal operation, there can |
| * only be one wait operation. |
| * |
| * If the semaphore object is any one of the following types: |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE |
| * then waiting on the semaphore will wait until the value of the |
| * semaphore is greater than or equal to |
| * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. |
| * |
| * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC |
| * then, waiting on the semaphore will wait until the |
| * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the |
| * signaler of the NvSciSyncObj that was associated with this semaphore object. |
| * By default, waiting on such an external semaphore object causes appropriate |
| * memory synchronization operations to be performed over all external memory objects |
| * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that |
| * any subsequent accesses made by other importers of the same set of NvSciBuf memory |
| * object(s) are coherent. These operations can be skipped by specifying the flag |
| * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a |
| * performance optimization when data coherency is not required. But specifying this |
| * flag in scenarios where data coherency is required results in undefined behavior. |
| * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, |
| * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in |
| * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return |
| * CUDA_ERROR_NOT_SUPPORTED. |
| * |
| * If the semaphore object is any one of the following types: |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, |
| * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT |
| * then the keyed mutex will be acquired when it is released with the key |
| * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key |
| * or until the timeout specified by |
| * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs |
| * has elapsed. The timeout interval can either be a finite value |
| * specified in milliseconds or an infinite value. In case an infinite |
| * value is specified the timeout never elapses. The Windows INFINITE |
| * macro must be used to specify infinite timeout. |
| * |
| * \param extSemArray - External semaphores to be waited on |
| * \param paramsArray - Array of semaphore parameters |
| * \param numExtSems - Number of semaphores to wait on |
| * \param stream - Stream to enqueue the wait operations in |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_SUPPORTED, |
| * ::CUDA_ERROR_TIMEOUT |
| * \notefnerr |
| * |
| * \sa ::cuImportExternalSemaphore, |
| * ::cuDestroyExternalSemaphore, |
| * ::cuSignalExternalSemaphoresAsync |
| */ |
| CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
| |
| /** |
| * \brief Destroys an external semaphore |
| * |
| * Destroys the external semaphore object \p extSem and releases any |
| * references to the underlying resource. Any outstanding signals or |
| * waits must have completed before the semaphore is destroyed. |
| * |
| * \param extSem - External semaphore to be destroyed |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa ::cuImportExternalSemaphore, |
| * ::cuSignalExternalSemaphoresAsync, |
| * ::cuWaitExternalSemaphoresAsync |
| */ |
| CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); |
| |
| /** @} */ /* END CUDA_EXTRES_INTEROP */ |
| |
| /** |
| * \defgroup CUDA_MEMOP Stream memory operations |
| * |
| * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the stream memory operations of the low-level CUDA |
| * driver application programming interface. |
| * |
| * The whole set of operations is disabled by default. Users are required |
| * to explicitly enable them, e.g. on Linux by passing the kernel module |
| * parameter shown below: |
| * modprobe nvidia NVreg_EnableStreamMemOPs=1 |
| * There is currently no way to enable these operations on other operating |
| * systems. |
| * |
| * Users can programmatically query whether the device supports these |
| * operations with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. |
| * |
| * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. |
| * |
| * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() |
| * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and |
| * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. |
| * |
| * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and |
| * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform |
| * hardware features and can be queried with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. |
| * |
| * Note that all memory pointers passed as parameters to these operations |
| * are device pointers. Where necessary a device pointer should be |
| * obtained, for example with ::cuMemHostGetDevicePointer(). |
| * |
| * None of the operations accepts pointers to managed memory buffers |
| * (::cuMemAllocManaged). |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Wait on a memory location |
| * |
| * Enqueues a synchronization of the stream on the given memory location. Work |
| * ordered after the operation will block until the given condition on the |
| * memory is satisfied. By default, the condition is to wait for |
| * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. |
| * Other condition types can be specified via \p flags. |
| * |
| * If the memory was registered via ::cuMemHostRegister(), the device pointer |
| * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot |
| * be used with managed memory (::cuMemAllocManaged). |
| * |
| * Support for this can be queried with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. |
| * |
| * Support for ::CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. |
| * |
| * \param stream The stream to synchronize on the memory location. |
| * \param addr The memory location to wait on. |
| * \param value The value to compare with the memory location. |
| * \param flags See ::CUstreamWaitValue_flags. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuStreamWaitValue64, |
| * ::cuStreamWriteValue32, |
| * ::cuStreamWriteValue64, |
| * ::cuStreamBatchMemOp, |
| * ::cuMemHostRegister, |
| * ::cuStreamWaitEvent |
| */ |
| CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
| |
| /** |
| * \brief Wait on a memory location |
| * |
| * Enqueues a synchronization of the stream on the given memory location. Work |
| * ordered after the operation will block until the given condition on the |
| * memory is satisfied. By default, the condition is to wait for |
| * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. |
| * Other condition types can be specified via \p flags. |
| * |
| * If the memory was registered via ::cuMemHostRegister(), the device pointer |
| * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot |
| * be used with managed memory (::cuMemAllocManaged). |
| * |
| * Support for this can be queried with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. |
| * |
| * \param stream The stream to synchronize on the memory location. |
| * \param addr The memory location to wait on. |
| * \param value The value to compare with the memory location. |
| * \param flags See ::CUstreamWaitValue_flags. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuStreamWaitValue32, |
| * ::cuStreamWriteValue32, |
| * ::cuStreamWriteValue64, |
| * ::cuStreamBatchMemOp, |
| * ::cuMemHostRegister, |
| * ::cuStreamWaitEvent |
| */ |
| CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
| |
| /** |
| * \brief Write a value to memory |
| * |
| * Writes \p value to the memory location \p addr. Unless the |
| * ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER flag is passed, the write is |
| * preceded by a system-wide memory fence, equivalent to a |
| * __threadfence_system() but scoped to the stream rather than a CUDA thread. |
| * |
| * If the memory was registered via ::cuMemHostRegister(), the device pointer |
| * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot |
| * be used with managed memory (::cuMemAllocManaged). |
| * |
| * Support for this can be queried with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. |
| * |
| * \param stream The stream to do the write in. |
| * \param addr The device address to write to. |
| * \param value The value to write. |
| * \param flags See ::CUstreamWriteValue_flags. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuStreamWriteValue64, |
| * ::cuStreamWaitValue32, |
| * ::cuStreamWaitValue64, |
| * ::cuStreamBatchMemOp, |
| * ::cuMemHostRegister, |
| * ::cuEventRecord |
| */ |
| CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
| |
| /** |
| * \brief Write a value to memory |
| * |
| * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER |
| * flag is passed, the write is preceded by a system-wide memory fence, |
| * equivalent to a __threadfence_system() but scoped to the stream |
| * rather than a CUDA thread. |
| * |
| * If the memory was registered via ::cuMemHostRegister(), the device pointer |
| * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot |
| * be used with managed memory (::cuMemAllocManaged). |
| * |
| * Support for this can be queried with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. |
| * |
| * \param stream The stream to do the write in. |
| * \param addr The device address to write to. |
| * \param value The value to write. |
| * \param flags See ::CUstreamWriteValue_flags. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuStreamWriteValue32, |
| * ::cuStreamWaitValue32, |
| * ::cuStreamWaitValue64, |
| * ::cuStreamBatchMemOp, |
| * ::cuMemHostRegister, |
| * ::cuEventRecord |
| */ |
| CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
| |
| /** |
| * \brief Batch operations to synchronize the stream via memory operations |
| * |
| * This is a batch version of ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), |
| * ::cuStreamWriteValue32() and ::cuStreamWriteValue64(). |
| * Batching operations may avoid some performance overhead in both the API call |
| * and the device execution versus adding them to the stream in separate API |
| * calls. The operations are enqueued in the order they appear in the array. |
| * |
| * See ::CUstreamBatchMemOpType for the full set of supported operations, and |
| * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), |
| * and ::cuStreamWriteValue64() for details of specific operations. |
| * |
| * Basic support for this can be queried with ::cuDeviceGetAttribute() and |
| * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details |
| * on querying support for specific operations. |
| * |
| * None of the batched operations accepts pointers to managed memory buffers |
| * (::cuMemAllocManaged). |
| * |
| * \param stream The stream to enqueue the operations in. |
| * \param count The number of operations in the array. Must be less than 256. |
| * \param paramArray The types and parameters of the individual operations. |
| * \param flags Reserved for future expansion; must be 0. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \notefnerr |
| * |
| * \sa ::cuStreamWaitValue32, |
| * ::cuStreamWaitValue64, |
| * ::cuStreamWriteValue32, |
| * ::cuStreamWriteValue64, |
| * ::cuMemHostRegister |
| */ |
| CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); |
| |
| /** @} */ /* END CUDA_MEMOP */ |
| |
| /** |
| * \defgroup CUDA_EXEC Execution Control |
| * |
| * ___MANBRIEF___ execution control functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the execution control functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Returns information about a function |
| * |
| * Returns in \p *pi the integer value of the attribute \p attrib on the kernel |
| * given by \p hfunc. The supported attributes are: |
| * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads |
| * per block, beyond which a launch of the function would fail. This number |
| * depends on both the function and the device on which the function is |
| * currently loaded. |
| * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of |
| * statically-allocated shared memory per block required by this function. |
| * This does not include dynamically-allocated shared memory requested by |
| * the user at runtime. |
| * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated |
| * constant memory required by this function. |
| * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory |
| * used by each thread of this function. |
| * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread |
| * of this function. |
| * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for |
| * which the function was compiled. This value is the major PTX version * 10 |
| * + the minor PTX version, so a PTX version 1.3 function would return the |
| * value 13. Note that this may return the undefined value of 0 for cubins |
| * compiled prior to CUDA 3.0. |
| * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for |
| * which the function was compiled. This value is the major binary |
| * version * 10 + the minor binary version, so a binary version 1.3 function |
| * would return the value 13. Note that this will return a value of 10 for |
| * legacy cubins that do not have a properly-encoded binary architecture |
| * version. |
| * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the |
| * function has been compiled with user specified option "-Xptxas --dlcm=ca" set. |
| * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of |
| * dynamically-allocated shared memory. |
| * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 |
| * cache split ratio in percent of total shared memory. |
| * |
| * \param pi - Returned attribute value |
| * \param attrib - Attribute requested |
| * \param hfunc - Function to query attribute of |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxGetCacheConfig, |
| * ::cuCtxSetCacheConfig, |
| * ::cuFuncSetCacheConfig, |
| * ::cuLaunchKernel, |
| * ::cudaFuncGetAttributes, |
| * ::cudaFuncSetAttribute |
| */ |
| CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); |
| |
| /** |
| * \brief Sets information about a function |
| * |
| * This call sets the value of a specified attribute \p attrib on the kernel given |
| * by \p hfunc to an integer value specified by \p value. |
| * This function returns ::CUDA_SUCCESS if the new value of the attribute could be |
| * successfully set. If the set fails, this call will return an error. |
| * Not all attributes can have values set. Attempting to set a value on a read-only |
| * attribute will result in an error (::CUDA_ERROR_INVALID_VALUE). |
| * |
| * Supported attributes for the cuFuncSetAttribute call are: |
| * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of |
| * dynamically-allocated shared memory. The value should contain the requested |
| * maximum size of dynamically-allocated shared memory. The sum of this value and |
| * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the |
| * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. |
| * The maximal size of requestable dynamic shared memory may differ by GPU |
| * architecture. |
| * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 |
| * cache and shared memory use the same hardware resources, this sets the shared memory |
| * carveout preference, in percent of the total shared memory. |
| * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. |
| * This is only a hint, and the driver can choose a different ratio if required to execute the function. |
| * |
| * \param hfunc - Function to set attribute of |
| * \param attrib - Attribute to set |
| * \param value - The value to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuCtxGetCacheConfig, |
| * ::cuCtxSetCacheConfig, |
| * ::cuFuncSetCacheConfig, |
| * ::cuLaunchKernel, |
| * ::cudaFuncGetAttributes, |
| * ::cudaFuncSetAttribute |
| */ |
| CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); |
| |
| /** |
| * \brief Sets the preferred cache configuration for a device function |
| * |
| * On devices where the L1 cache and shared memory use the same hardware |
| * resources, this sets through \p config the preferred cache configuration for |
| * the device function \p hfunc. This is only a preference. The driver will use |
| * the requested configuration if possible, but it is free to choose a different |
| * configuration if required to execute \p hfunc. Any context-wide preference |
| * set via ::cuCtxSetCacheConfig() will be overridden by this per-function |
| * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In |
| * that case, the current context-wide setting will be used. |
| * |
| * This setting does nothing on devices where the size of the L1 cache and |
| * shared memory are fixed. |
| * |
| * Launching a kernel with a different preference than the most recent |
| * preference setting may insert a device-side synchronization point. |
| * |
| * The supported cache configurations are: |
| * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) |
| * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache |
| * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory |
| * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory |
| * |
| * \param hfunc - Kernel to configure cache for |
| * \param config - Requested cache configuration |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa ::cuCtxGetCacheConfig, |
| * ::cuCtxSetCacheConfig, |
| * ::cuFuncGetAttribute, |
| * ::cuLaunchKernel, |
| * ::cudaFuncSetCacheConfig |
| */ |
| CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); |
| |
| /** |
| * \brief Sets the shared memory configuration for a device function. |
| * |
| * On devices with configurable shared memory banks, this function will |
| * force all subsequent launches of the specified device function to have |
| * the given shared memory bank size configuration. On any given launch of the |
| * function, the shared memory configuration of the device will be temporarily |
| * changed if needed to suit the function's preferred configuration. Changes in |
| * shared memory configuration between subsequent launches of functions |
| * may introduce a device-side synchronization point. |
| * |
| * Any per-function setting of shared memory bank size set via |
| * ::cuFuncSetSharedMemConfig will override the context-wide setting set with |
| * ::cuCtxSetSharedMemConfig. |
| * |
| * Changing the shared memory bank size will not increase shared memory usage |
| * or affect occupancy of kernels, but may have major effects on performance. |
| * Larger bank sizes will allow for greater potential bandwidth to shared memory, |
| * but will change what kinds of accesses to shared memory will result in bank |
| * conflicts. |
| * |
| * This function will do nothing on devices with fixed shared memory bank size. |
| * |
| * The supported bank configurations are: |
| * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory |
| * configuration when launching this function. |
| * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to |
| * be natively four bytes when launching this function. |
| * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to |
| * be natively eight bytes when launching this function. |
| * |
| * \param hfunc - kernel to be given a shared memory config |
| * \param config - requested shared memory configuration |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa ::cuCtxGetCacheConfig, |
| * ::cuCtxSetCacheConfig, |
| * ::cuCtxGetSharedMemConfig, |
| * ::cuCtxSetSharedMemConfig, |
| * ::cuFuncGetAttribute, |
| * ::cuLaunchKernel, |
| * ::cudaFuncSetSharedMemConfig |
| */ |
| CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); |
| |
| /** |
| * \brief Launches a CUDA function |
| * |
| * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ |
| * grid of blocks. Each block contains \p blockDimX x \p blockDimY x |
| * \p blockDimZ threads. |
| * |
| * \p sharedMemBytes sets the amount of dynamic shared memory that will be |
| * available to each thread block. |
| * |
| * Kernel parameters to \p f can be specified in one of two ways: |
| * |
| * 1) Kernel parameters can be specified via \p kernelParams. If \p f |
| * has N parameters, then \p kernelParams needs to be an array of N |
| * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] |
| * must point to a region of memory from which the actual kernel |
| * parameter will be copied. The number of kernel parameters and their |
| * offsets and sizes do not need to be specified as that information is |
| * retrieved directly from the kernel's image. |
| * |
| * 2) Kernel parameters can also be packaged by the application into |
| * a single buffer that is passed in via the \p extra parameter. |
| * This places the burden on the application of knowing each kernel |
| * parameter's size and alignment/padding within the buffer. Here is |
| * an example of using the \p extra parameter in this manner: |
| * \code |
| size_t argBufferSize; |
| char argBuffer[256]; |
| |
| // populate argBuffer and argBufferSize |
| |
| void *config[] = { |
| CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, |
| CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, |
| CU_LAUNCH_PARAM_END |
| }; |
| status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); |
| * \endcode |
| * |
| * The \p extra parameter exists to allow ::cuLaunchKernel to take |
| * additional less commonly used arguments. \p extra specifies a list of |
| * names of extra settings and their corresponding values. Each extra |
| * setting name is immediately followed by the corresponding value. The |
| * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. |
| * |
| * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra |
| * array; |
| * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next |
| * value in \p extra will be a pointer to a buffer containing all |
| * the kernel parameters for launching kernel \p f; |
| * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next |
| * value in \p extra will be a pointer to a size_t containing the |
| * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; |
| * |
| * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel |
| * parameters are specified with both \p kernelParams and \p extra |
| * (i.e. both \p kernelParams and \p extra are non-NULL). |
| * |
| * Calling ::cuLaunchKernel() invalidates the persistent function state |
| * set through the following deprecated APIs: |
| * ::cuFuncSetBlockShape(), |
| * ::cuFuncSetSharedSize(), |
| * ::cuParamSetSize(), |
| * ::cuParamSeti(), |
| * ::cuParamSetf(), |
| * ::cuParamSetv(). |
| * |
| * Note that to use ::cuLaunchKernel(), the kernel \p f must either have |
| * been compiled with toolchain version 3.2 or later so that it will |
| * contain kernel parameter information, or have no kernel parameters. |
| * If neither of these conditions is met, then ::cuLaunchKernel() will |
| * return ::CUDA_ERROR_INVALID_IMAGE. |
| * |
| * \param f - Kernel to launch |
| * \param gridDimX - Width of grid in blocks |
| * \param gridDimY - Height of grid in blocks |
| * \param gridDimZ - Depth of grid in blocks |
| * \param blockDimX - X dimension of each thread block |
| * \param blockDimY - Y dimension of each thread block |
| * \param blockDimZ - Z dimension of each thread block |
| * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes |
| * \param hStream - Stream identifier |
| * \param kernelParams - Array of pointers to kernel parameters |
| * \param extra - Extra options |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_IMAGE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_LAUNCH_FAILED, |
| * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
| * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
| * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuCtxGetCacheConfig, |
| * ::cuCtxSetCacheConfig, |
| * ::cuFuncSetCacheConfig, |
| * ::cuFuncGetAttribute, |
| * ::cudaLaunchKernel |
| */ |
| CUresult CUDAAPI cuLaunchKernel(CUfunction f, |
| unsigned int gridDimX, |
| unsigned int gridDimY, |
| unsigned int gridDimZ, |
| unsigned int blockDimX, |
| unsigned int blockDimY, |
| unsigned int blockDimZ, |
| unsigned int sharedMemBytes, |
| CUstream hStream, |
| void **kernelParams, |
| void **extra); |
| |
| /** |
| * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute |
| * |
| * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ |
| * grid of blocks. Each block contains \p blockDimX x \p blockDimY x |
| * \p blockDimZ threads. |
| * |
| * \p sharedMemBytes sets the amount of dynamic shared memory that will be |
| * available to each thread block. |
| * |
| * The device on which this kernel is invoked must have a non-zero value for |
| * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. |
| * |
| * The total number of blocks launched cannot exceed the maximum number of blocks per |
| * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or |
| * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors |
| * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. |
| * |
| * The kernel cannot make use of CUDA dynamic parallelism. |
| * |
| * Kernel parameters must be specified via \p kernelParams. If \p f |
| * has N parameters, then \p kernelParams needs to be an array of N |
| * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] |
| * must point to a region of memory from which the actual kernel |
| * parameter will be copied. The number of kernel parameters and their |
| * offsets and sizes do not need to be specified as that information is |
| * retrieved directly from the kernel's image. |
| * |
| * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is |
| * the same as function state set through the ::cuLaunchKernel API. |
| * |
| * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous |
| * block shape, shared size and parameter info associated with \p f |
| * is overwritten. |
| * |
| * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have |
| * been compiled with toolchain version 3.2 or later so that it will |
| * contain kernel parameter information, or have no kernel parameters. |
| * If neither of these conditions is met, then ::cuLaunchCooperativeKernel() will |
| * return ::CUDA_ERROR_INVALID_IMAGE. |
| * |
| * \param f - Kernel to launch |
| * \param gridDimX - Width of grid in blocks |
| * \param gridDimY - Height of grid in blocks |
| * \param gridDimZ - Depth of grid in blocks |
| * \param blockDimX - X dimension of each thread block |
| * \param blockDimY - Y dimension of each thread block |
| * \param blockDimZ - Z dimension of each thread block |
| * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes |
| * \param hStream - Stream identifier |
| * \param kernelParams - Array of pointers to kernel parameters |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_IMAGE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_LAUNCH_FAILED, |
| * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
| * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
| * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
| * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuCtxGetCacheConfig, |
| * ::cuCtxSetCacheConfig, |
| * ::cuFuncSetCacheConfig, |
| * ::cuFuncGetAttribute, |
| * ::cuLaunchCooperativeKernelMultiDevice, |
| * ::cudaLaunchCooperativeKernel |
| */ |
| CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, |
| unsigned int gridDimX, |
| unsigned int gridDimY, |
| unsigned int gridDimZ, |
| unsigned int blockDimX, |
| unsigned int blockDimY, |
| unsigned int blockDimZ, |
| unsigned int sharedMemBytes, |
| CUstream hStream, |
| void **kernelParams); |
| |
| /** |
| * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute |
| * |
| * Invokes kernels as specified in the \p launchParamsList array where each element |
| * of the array specifies all the parameters required to perform a single kernel launch. |
| * These kernels can cooperate and synchronize as they execute. The size of the array is |
| * specified by \p numDevices. |
| * |
| * No two kernels can be launched on the same device. All the devices targeted by this |
| * multi-device launch must be identical. All devices must have a non-zero value for the |
| * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. |
| * |
| * All kernels launched must be identical with respect to the compiled code. Note that |
| * any __device__, __constant__ or __managed__ variables present in the module that owns |
| * the kernel launched on each device, are independently instantiated on every device. |
| * It is the application's responsibility to ensure these variables are initialized and |
| * used appropriately. |
| * |
| * The size of the grids as specified in blocks, the size of the blocks themselves |
| * and the amount of shared memory used by each thread block must also match across |
| * all launched kernels. |
| * |
| * The streams used to launch these kernels must have been created via either ::cuStreamCreate |
| * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD |
| * cannot be used. |
| * |
| * The total number of blocks launched per kernel cannot exceed the maximum number of blocks |
| * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or |
| * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors |
| * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the |
| * total number of blocks launched per device has to match across all devices, the maximum |
| * number of blocks that can be launched per device will be limited by the device with the |
| * least number of multiprocessors. |
| * |
| * The kernels cannot make use of CUDA dynamic parallelism. |
| * |
| * The ::CUDA_LAUNCH_PARAMS structure is defined as: |
| * \code |
| typedef struct CUDA_LAUNCH_PARAMS_st |
| { |
| CUfunction function; |
| unsigned int gridDimX; |
| unsigned int gridDimY; |
| unsigned int gridDimZ; |
| unsigned int blockDimX; |
| unsigned int blockDimY; |
| unsigned int blockDimZ; |
| unsigned int sharedMemBytes; |
| CUstream hStream; |
| void **kernelParams; |
| } CUDA_LAUNCH_PARAMS; |
| * \endcode |
| * where: |
| * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must |
| * be identical with respect to the compiled code. |
| * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across |
| * all kernels launched. |
| * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across |
| * all kernels launched. |
| * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across |
| * all kernels launched. |
| * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across |
| * all kernels launched. |
| * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across |
| * all kernels launched. |
| * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across |
| * all kernels launched. |
| * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. |
| * This must match across all kernels launched. |
| * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot |
| * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated |
| * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. |
| * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If |
| * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams |
| * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through |
| * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual |
| * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes |
| * do not need to be specified as that information is retrieved directly from the kernel's image. |
| * |
| * By default, the kernel won't begin execution on any GPU until all prior work in all the specified |
| * streams has completed. This behavior can be overridden by specifying the flag |
| * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel |
| * will only wait for prior work in the stream corresponding to that GPU to complete before it begins |
| * execution. |
| * |
| * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin |
| * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying |
| * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, |
| * any subsequent work pushed in any of the specified streams will only wait for the kernel launched |
| * on the GPU corresponding to that stream to complete before it begins execution. |
| * |
| * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is |
| * the same as function state set through ::cuLaunchKernel API when called individually for each |
| * element in \p launchParamsList. |
| * |
| * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous |
| * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function |
| * in \p launchParamsList is overwritten. |
| * |
| * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have |
| * been compiled with toolchain version 3.2 or later so that they will |
| * contain kernel parameter information, or have no kernel parameters. |
| * If neither of these conditions is met, then ::cuLaunchCooperativeKernelMultiDevice() will |
| * return ::CUDA_ERROR_INVALID_IMAGE. |
| * |
| * \param launchParamsList - List of launch parameters, one per device |
| * \param numDevices - Size of the \p launchParamsList array |
| * \param flags - Flags to control launch behavior |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_IMAGE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_LAUNCH_FAILED, |
| * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
| * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
| * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
| * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuCtxGetCacheConfig, |
| * ::cuCtxSetCacheConfig, |
| * ::cuFuncSetCacheConfig, |
| * ::cuFuncGetAttribute, |
| * ::cuLaunchCooperativeKernel, |
| * ::cudaLaunchCooperativeKernelMultiDevice |
| */ |
| CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); |
| |
| /** |
| * \brief Enqueues a host function call in a stream |
| * |
| * Enqueues a host function to run in a stream. The function will be called |
| * after currently enqueued work and will block work added after it. |
| * |
| * The host function must not make any CUDA API calls. Attempting to use a |
| * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. |
| * The host function must not perform any synchronization that may depend on |
| * outstanding CUDA work not mandated to run earlier. Host functions without a |
| * mandated order (such as in independent streams) execute in undefined order |
| * and may be serialized. |
| * |
| * For the purposes of Unified Memory, execution makes a number of guarantees: |
| * <ul> |
| * <li>The stream is considered idle for the duration of the function's |
| * execution. Thus, for example, the function may always use memory attached |
| * to the stream it was enqueued in.</li> |
| * <li>The start of execution of the function has the same effect as |
| * synchronizing an event recorded in the same stream immediately prior to |
| * the function. It thus synchronizes streams which have been "joined" |
| * prior to the function.</li> |
| * <li>Adding device work to any stream does not have the effect of making |
| * the stream active until all preceding host functions and stream callbacks |
| * have executed. Thus, for example, a function might use global attached |
| * memory even if work has been added to another stream, if the work has |
| * been ordered behind the function call with an event.</li> |
| * <li>Completion of the function does not cause a stream to become |
| * active except as described above. The stream will remain idle |
| * if no device work follows the function, and will remain idle across |
| * consecutive host functions or stream callbacks without device work in |
| * between. Thus, for example, stream synchronization can be done by |
| * signaling from a host function at the end of the stream.</li> |
| * </ul> |
| * |
| * Note that, in contrast to ::cuStreamAddCallback, the function will not be |
| * called in the event of an error in the CUDA context. |
| * |
| * \param hStream - Stream to enqueue function call in |
| * \param fn - The function to call once preceding stream operations are complete |
| * \param userData - User-specified data to be passed to the function |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_SUPPORTED |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuStreamCreate, |
| * ::cuStreamQuery, |
| * ::cuStreamSynchronize, |
| * ::cuStreamWaitEvent, |
| * ::cuStreamDestroy, |
| * ::cuMemAllocManaged, |
| * ::cuStreamAttachMemAsync, |
| * ::cuStreamAddCallback |
| */ |
| CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); |
| |
| /** @} */ /* END CUDA_EXEC */ |
| |
| /** |
| * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] |
| * |
| * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA |
| * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the deprecated execution control functions of the |
| * low-level CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Sets the block-dimensions for the function |
| * |
| * \deprecated |
| * |
| * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are |
| * created when the kernel given by \p hfunc is launched. |
| * |
| * \param hfunc - Kernel to specify dimensions of |
| * \param x - X dimension of each thread block |
| * \param y - Y dimension of each thread block |
| * \param z - Z dimension of each thread block |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetSharedSize, |
| * ::cuFuncSetCacheConfig, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSeti, |
| * ::cuParamSetf, |
| * ::cuParamSetv, |
| * ::cuLaunch, |
| * ::cuLaunchGrid, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); |
| |
| /** |
| * \brief Sets the dynamic shared-memory size for the function |
| * |
| * \deprecated |
| * |
| * Sets through \p bytes the amount of dynamic shared memory that will be |
| * available to each thread block when the kernel given by \p hfunc is launched. |
| * |
| * \param hfunc - Kernel to specify dynamic shared-memory size for |
| * \param bytes - Dynamic shared-memory size per thread block in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetCacheConfig, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSeti, |
| * ::cuParamSetf, |
| * ::cuParamSetv, |
| * ::cuLaunch, |
| * ::cuLaunchGrid, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); |
| |
| /** |
| * \brief Sets the parameter size for the function |
| * |
| * \deprecated |
| * |
| * Sets through \p numbytes the total size in bytes needed by the function |
| * parameters of the kernel corresponding to \p hfunc. |
| * |
| * \param hfunc - Kernel to set parameter size for |
| * \param numbytes - Total size of the parameter list in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetSharedSize, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetf, |
| * ::cuParamSeti, |
| * ::cuParamSetv, |
| * ::cuLaunch, |
| * ::cuLaunchGrid, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); |
| |
| /** |
| * \brief Adds an integer parameter to the function's argument list |
| * |
| * \deprecated |
| * |
| * Sets an integer parameter that will be specified the next time the |
| * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. |
| * |
| * \param hfunc - Kernel to add parameter to |
| * \param offset - Byte offset in the argument list at which to add the parameter |
| * \param value - Value of parameter |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetSharedSize, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSetf, |
| * ::cuParamSetv, |
| * ::cuLaunch, |
| * ::cuLaunchGrid, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); |
| |
| /** |
| * \brief Adds a floating-point parameter to the function's argument list |
| * |
| * \deprecated |
| * |
| * Sets a floating-point parameter that will be specified the next time the |
| * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. |
| * |
| * \param hfunc - Kernel to add parameter to |
| * \param offset - Byte offset in the argument list at which to add the parameter |
| * \param value - Value of parameter |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetSharedSize, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSeti, |
| * ::cuParamSetv, |
| * ::cuLaunch, |
| * ::cuLaunchGrid, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); |
| |
| /** |
| * \brief Adds arbitrary data to the function's argument list |
| * |
| * \deprecated |
| * |
| * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr |
| * into the parameter space of the kernel corresponding to \p hfunc. \p offset |
| * is a byte offset. |
| * |
| * \param hfunc - Kernel to add data to |
| * \param offset - Byte offset in the argument list at which to add the data |
| * \param ptr - Pointer to arbitrary data |
| * \param numbytes - Size of data to copy in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetSharedSize, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSetf, |
| * ::cuParamSeti, |
| * ::cuLaunch, |
| * ::cuLaunchGrid, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); |
| |
| /** |
| * \brief Launches a CUDA function |
| * |
| * \deprecated |
| * |
| * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block |
| * contains the number of threads specified by a previous call to |
| * ::cuFuncSetBlockShape(). |
| * |
| * The block shape, dynamic shared memory size, and parameter information |
| * must be set using |
| * ::cuFuncSetBlockShape(), |
| * ::cuFuncSetSharedSize(), |
| * ::cuParamSetSize(), |
| * ::cuParamSeti(), |
| * ::cuParamSetf(), and |
| * ::cuParamSetv() |
| * prior to calling this function. |
| * |
| * Launching a function via ::cuLaunchKernel() invalidates the function's |
| * block shape, dynamic shared memory size, and parameter information. After |
| * launching via ::cuLaunchKernel(), this state must be re-initialized prior to |
| * calling this function. Failure to do so results in undefined behavior. |
| * |
| * \param f - Kernel to launch |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_LAUNCH_FAILED, |
| * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
| * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
| * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetSharedSize, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSetf, |
| * ::cuParamSeti, |
| * ::cuParamSetv, |
| * ::cuLaunchGrid, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); |
| |
| /** |
| * \brief Launches a CUDA function |
| * |
| * \deprecated |
| * |
| * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of |
| * blocks. Each block contains the number of threads specified by a previous |
| * call to ::cuFuncSetBlockShape(). |
| * |
| * The block shape, dynamic shared memory size, and parameter information |
| * must be set using |
| * ::cuFuncSetBlockShape(), |
| * ::cuFuncSetSharedSize(), |
| * ::cuParamSetSize(), |
| * ::cuParamSeti(), |
| * ::cuParamSetf(), and |
| * ::cuParamSetv() |
| * prior to calling this function. |
| * |
| * Launching a function via ::cuLaunchKernel() invalidates the function's |
| * block shape, dynamic shared memory size, and parameter information. After |
| * launching via ::cuLaunchKernel(), this state must be re-initialized prior to |
| * calling this function. Failure to do so results in undefined behavior. |
| * |
| * \param f - Kernel to launch |
| * \param grid_width - Width of grid in blocks |
| * \param grid_height - Height of grid in blocks |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_LAUNCH_FAILED, |
| * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
| * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
| * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetSharedSize, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSetf, |
| * ::cuParamSeti, |
| * ::cuParamSetv, |
| * ::cuLaunch, |
| * ::cuLaunchGridAsync, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); |
| |
| /** |
| * \brief Launches a CUDA function |
| * |
| * \deprecated |
| * |
| * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of |
| * blocks. Each block contains the number of threads specified by a previous |
| * call to ::cuFuncSetBlockShape(). |
| * |
| * The block shape, dynamic shared memory size, and parameter information |
| * must be set using |
| * ::cuFuncSetBlockShape(), |
| * ::cuFuncSetSharedSize(), |
| * ::cuParamSetSize(), |
| * ::cuParamSeti(), |
| * ::cuParamSetf(), and |
| * ::cuParamSetv() |
| * prior to calling this function. |
| * |
| * Launching a function via ::cuLaunchKernel() invalidates the function's |
| * block shape, dynamic shared memory size, and parameter information. After |
| * launching via ::cuLaunchKernel(), this state must be re-initialized prior to |
| * calling this function. Failure to do so results in undefined behavior. |
| * |
| * \param f - Kernel to launch |
| * \param grid_width - Width of grid in blocks |
| * \param grid_height - Height of grid in blocks |
| * \param hStream - Stream identifier |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_LAUNCH_FAILED, |
| * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, |
| * ::CUDA_ERROR_LAUNCH_TIMEOUT, |
| * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, |
| * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED |
| * |
| * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), |
| * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by |
| * growing the per-thread stack as needed per launch and not shrinking it afterwards. |
| * |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa ::cuFuncSetBlockShape, |
| * ::cuFuncSetSharedSize, |
| * ::cuFuncGetAttribute, |
| * ::cuParamSetSize, |
| * ::cuParamSetf, |
| * ::cuParamSeti, |
| * ::cuParamSetv, |
| * ::cuLaunch, |
| * ::cuLaunchGrid, |
| * ::cuLaunchKernel |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); |
| |
| |
| /** |
| * \brief Adds a texture-reference to the function's argument list |
| * |
| * \deprecated |
| * |
| * Makes the CUDA array or linear memory bound to the texture reference |
| * \p hTexRef available to a device program as a texture. In this version of |
| * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and |
| * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. |
| * |
| * \param hfunc - Kernel to add texture-reference to |
| * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) |
| * \param hTexRef - Texture-reference to add to argument list |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); |
| /** @} */ /* END CUDA_EXEC_DEPRECATED */ |
| |
| /** |
| * \defgroup CUDA_GRAPH Graph Management |
| * |
| * ___MANBRIEF___ graph management functions of the low-level CUDA driver API |
| * (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the graph management functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Creates a graph |
| * |
| * Creates an empty graph, which is returned via \p phGraph. |
| * |
| * \param phGraph - Returns newly created graph |
| * \param flags - Graph creation flags, must be 0 |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphAddEmptyNode, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphAddMemsetNode, |
| * ::cuGraphInstantiate, |
| * ::cuGraphDestroy, |
| * ::cuGraphGetNodes, |
| * ::cuGraphGetRootNodes, |
| * ::cuGraphGetEdges, |
| * ::cuGraphClone |
| */ |
| CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); |
| |
| /** |
| * \brief Creates a kernel execution node and adds it to a graph |
| * |
| * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies |
| * dependencies specified via \p dependencies and arguments specified in \p nodeParams. |
| * It is possible for \p numDependencies to be 0, in which case the node will be placed |
| * at the root of the graph. \p dependencies may not have any duplicate entries. |
| * A handle to the new node will be returned in \p phGraphNode. |
| * |
| * The CUDA_KERNEL_NODE_PARAMS structure is defined as: |
| * |
| * \code |
| * typedef struct CUDA_KERNEL_NODE_PARAMS_st { |
| * CUfunction func; |
| * unsigned int gridDimX; |
| * unsigned int gridDimY; |
| * unsigned int gridDimZ; |
| * unsigned int blockDimX; |
| * unsigned int blockDimY; |
| * unsigned int blockDimZ; |
| * unsigned int sharedMemBytes; |
| * void **kernelParams; |
| * void **extra; |
| * } CUDA_KERNEL_NODE_PARAMS; |
| * \endcode |
| * |
| * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x |
| * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains |
| * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. |
| * |
| * \p sharedMemBytes sets the amount of dynamic shared memory that will be |
| * available to each thread block. |
| * |
| * Kernel parameters to \p func can be specified in one of two ways: |
| * |
| * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N |
| * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, |
| * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual |
| * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need |
| * to be specified as that information is retrieved directly from the kernel's image. |
| * |
| * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single |
| * buffer that is passed in via \p extra. This places the burden on the application of knowing each |
| * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists |
| * to allow this function to take additional less commonly used arguments. \p extra specifies |
| * a list of names of extra settings and their corresponding values. Each extra setting name is |
| * immediately followed by the corresponding value. The list must be terminated with either NULL or |
| * CU_LAUNCH_PARAM_END. |
| * |
| * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra |
| * array; |
| * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next |
| * value in \p extra will be a pointer to a buffer |
| * containing all the kernel parameters for launching kernel |
| * \p func; |
| * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next |
| * value in \p extra will be a pointer to a size_t |
| * containing the size of the buffer specified with |
| * ::CU_LAUNCH_PARAM_BUFFER_POINTER; |
| * |
| * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both |
| * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). |
| * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. |
| * |
| * The \p kernelParams or \p extra array, as well as the argument values it points to, |
| * are copied during this call. |
| * |
| * \note Kernels launched using graphs must not use texture and surface references. Reading or |
| * writing through any texture or surface reference is undefined behavior. |
| * This restriction does not apply to texture and surface objects. |
| * |
| * \param phGraphNode - Returns newly created node |
| * \param hGraph - Graph to which to add the node |
| * \param dependencies - Dependencies of the node |
| * \param numDependencies - Number of dependencies |
| * \param nodeParams - Parameters for the GPU execution node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuLaunchKernel, |
| * ::cuLaunchCooperativeKernel, |
| * ::cuGraphKernelNodeGetParams, |
| * ::cuGraphKernelNodeSetParams, |
| * ::cuGraphCreate, |
| * ::cuGraphDestroyNode, |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphAddEmptyNode, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphAddMemsetNode |
| */ |
| CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Returns a kernel node's parameters |
| * |
| * Returns the parameters of kernel node \p hNode in \p nodeParams. |
| * The \p kernelParams or \p extra array returned in \p nodeParams, |
| * as well as the argument values it points to, are owned by the node. |
| * This memory remains valid until the node is destroyed or its |
| * parameters are modified, and should not be modified |
| * directly. Use ::cuGraphKernelNodeSetParams to update the |
| * parameters of this node. |
| * |
| * The params will contain either \p kernelParams or \p extra, |
| * according to which of these was most recently set on the node. |
| * |
| * \param hNode - Node to get the parameters for |
| * \param nodeParams - Pointer to return the parameters |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuLaunchKernel, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphKernelNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Sets a kernel node's parameters |
| * |
| * Sets the parameters of kernel node \p hNode to \p nodeParams. |
| * |
| * \param hNode - Node to set the parameters for |
| * \param nodeParams - Parameters to copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuLaunchKernel, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphKernelNodeGetParams |
| */ |
| CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Creates a memcpy node and adds it to a graph |
| * |
| * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies |
| * dependencies specified via \p dependencies. |
| * It is possible for \p numDependencies to be 0, in which case the node will be placed |
| * at the root of the graph. \p dependencies may not have any duplicate entries. |
| * A handle to the new node will be returned in \p phGraphNode. |
| * |
| * When the graph is launched, the node will perform the memcpy described by \p copyParams. |
| * See ::cuMemcpy3D() for a description of the structure and its restrictions. |
| * |
| * Memcpy nodes have some additional restrictions with regards to managed memory, if the |
| * system contains at least one device which has a zero value for the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer |
| * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed |
| * for those operand(s). The managed memory will be treated as residing on either the |
| * host or the device, depending on which memory type is specified. |
| * |
| * \param phGraphNode - Returns newly created node |
| * \param hGraph - Graph to which to add the node |
| * \param dependencies - Dependencies of the node |
| * \param numDependencies - Number of dependencies |
| * \param copyParams - Parameters for the memory copy |
| * \param ctx - Context on which to run the node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemcpy3D, |
| * ::cuGraphMemcpyNodeGetParams, |
| * ::cuGraphMemcpyNodeSetParams, |
| * ::cuGraphCreate, |
| * ::cuGraphDestroyNode, |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphAddEmptyNode, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphAddMemsetNode |
| */ |
| CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); |
| |
| /** |
| * \brief Returns a memcpy node's parameters |
| * |
| * Returns the parameters of memcpy node \p hNode in \p nodeParams. |
| * |
| * \param hNode - Node to get the parameters for |
| * \param nodeParams - Pointer to return the parameters |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemcpy3D, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphMemcpyNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); |
| |
| /** |
| * \brief Sets a memcpy node's parameters |
| * |
| * Sets the parameters of memcpy node \p hNode to \p nodeParams. |
| * |
| * \param hNode - Node to set the parameters for |
| * \param nodeParams - Parameters to copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemcpy3D, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphMemcpyNodeGetParams |
| */ |
| CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); |
| |
| /** |
| * \brief Creates a memset node and adds it to a graph |
| * |
| * Creates a new memset node and adds it to \p hGraph with \p numDependencies |
| * dependencies specified via \p dependencies. |
| * It is possible for \p numDependencies to be 0, in which case the node will be placed |
| * at the root of the graph. \p dependencies may not have any duplicate entries. |
| * A handle to the new node will be returned in \p phGraphNode. |
| * |
| * The element size must be 1, 2, or 4 bytes. |
| * When the graph is launched, the node will perform the memset described by \p memsetParams. |
| * |
| * \param phGraphNode - Returns newly created node |
| * \param hGraph - Graph to which to add the node |
| * \param dependencies - Dependencies of the node |
| * \param numDependencies - Number of dependencies |
| * \param memsetParams - Parameters for the memory set |
| * \param ctx - Context on which to run the node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemsetD2D32, |
| * ::cuGraphMemsetNodeGetParams, |
| * ::cuGraphMemsetNodeSetParams, |
| * ::cuGraphCreate, |
| * ::cuGraphDestroyNode, |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphAddEmptyNode, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphAddMemcpyNode |
| */ |
| CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); |
| |
| /** |
| * \brief Returns a memset node's parameters |
| * |
| * Returns the parameters of memset node \p hNode in \p nodeParams. |
| * |
| * \param hNode - Node to get the parameters for |
| * \param nodeParams - Pointer to return the parameters |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemsetD2D32, |
| * ::cuGraphAddMemsetNode, |
| * ::cuGraphMemsetNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Sets a memset node's parameters |
| * |
| * Sets the parameters of memset node \p hNode to \p nodeParams. |
| * |
| * \param hNode - Node to set the parameters for |
| * \param nodeParams - Parameters to copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuMemsetD2D32, |
| * ::cuGraphAddMemsetNode, |
| * ::cuGraphMemsetNodeGetParams |
| */ |
| CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Creates a host execution node and adds it to a graph |
| * |
| * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies |
| * dependencies specified via \p dependencies and arguments specified in \p nodeParams. |
| * It is possible for \p numDependencies to be 0, in which case the node will be placed |
| * at the root of the graph. \p dependencies may not have any duplicate entries. |
| * A handle to the new node will be returned in \p phGraphNode. |
| * |
| * When the graph is launched, the node will invoke the specified CPU function. |
| * Host nodes are not supported under MPS with pre-Volta GPUs. |
| * |
| * \param phGraphNode - Returns newly created node |
| * \param hGraph - Graph to which to add the node |
| * \param dependencies - Dependencies of the node |
| * \param numDependencies - Number of dependencies |
| * \param nodeParams - Parameters for the host node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_NOT_SUPPORTED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuLaunchHostFunc, |
| * ::cuGraphHostNodeGetParams, |
| * ::cuGraphHostNodeSetParams, |
| * ::cuGraphCreate, |
| * ::cuGraphDestroyNode, |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphAddEmptyNode, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphAddMemsetNode |
| */ |
| CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Returns a host node's parameters |
| * |
| * Returns the parameters of host node \p hNode in \p nodeParams. |
| * |
| * \param hNode - Node to get the parameters for |
| * \param nodeParams - Pointer to return the parameters |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuLaunchHostFunc, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphHostNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Sets a host node's parameters |
| * |
| * Sets the parameters of host node \p hNode to \p nodeParams. |
| * |
| * \param hNode - Node to set the parameters for |
| * \param nodeParams - Parameters to copy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuLaunchHostFunc, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphHostNodeGetParams |
| */ |
| CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Creates a child graph node and adds it to a graph |
| * |
| * Creates a new node which executes an embedded graph, and adds it to \p hGraph with |
| * \p numDependencies dependencies specified via \p dependencies. |
| * It is possible for \p numDependencies to be 0, in which case the node will be placed |
| * at the root of the graph. \p dependencies may not have any duplicate entries. |
| * A handle to the new node will be returned in \p phGraphNode. |
| * |
| * The node executes an embedded child graph. The child graph is cloned in this call. |
| * |
| * \param phGraphNode - Returns newly created node |
| * \param hGraph - Graph to which to add the node |
| * \param dependencies - Dependencies of the node |
| * \param numDependencies - Number of dependencies |
| * \param childGraph - The graph to clone into this node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphChildGraphNodeGetGraph, |
| * ::cuGraphCreate, |
| * ::cuGraphDestroyNode, |
| * ::cuGraphAddEmptyNode, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphAddMemsetNode, |
| * ::cuGraphClone |
| */ |
| CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); |
| |
| /** |
| * \brief Gets a handle to the embedded graph of a child graph node |
| * |
| * Gets a handle to the embedded graph in a child graph node. This call |
| * does not clone the graph. Changes to the graph will be reflected in |
| * the node, and the node retains ownership of the graph. |
| * |
| * \param hNode - Node to get the embedded graph for |
| * \param phGraph - Location to store a handle to the graph |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphNodeFindInClone |
| */ |
| CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); |
| |
| /** |
| * \brief Creates an empty node and adds it to a graph |
| * |
| * Creates a new node which performs no operation, and adds it to \p hGraph with |
| * \p numDependencies dependencies specified via \p dependencies. |
| * It is possible for \p numDependencies to be 0, in which case the node will be placed |
| * at the root of the graph. \p dependencies may not have any duplicate entries. |
| * A handle to the new node will be returned in \p phGraphNode. |
| * |
| * An empty node performs no operation during execution, but can be used for |
| * transitive ordering. For example, a phased execution graph with 2 groups of n |
| * nodes with a barrier between them can be represented using an empty node and |
| * 2*n dependency edges, rather than no empty node and n^2 dependency edges. |
| * |
| * \param phGraphNode - Returns newly created node |
| * \param hGraph - Graph to which to add the node |
| * \param dependencies - Dependencies of the node |
| * \param numDependencies - Number of dependencies |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphCreate, |
| * ::cuGraphDestroyNode, |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphAddMemsetNode |
| */ |
| CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); |
| |
| /** |
| * \brief Clones a graph |
| * |
| * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. |
| * All parameters are copied into the cloned graph. The original graph may be modified |
| * after this call without affecting the clone. |
| * |
| * Child graph nodes in the original graph are recursively copied into the clone. |
| * |
| * \param phGraphClone - Returns newly created cloned graph |
| * \param originalGraph - Graph to clone |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_OUT_OF_MEMORY |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphCreate, |
| * ::cuGraphNodeFindInClone |
| */ |
| CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); |
| |
| /** |
| * \brief Finds a cloned version of a node |
| * |
| * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode |
| * in the original graph. |
| * |
| * \p hClonedGraph must have been cloned from the original graph (the graph containing |
| * \p hOriginalNode) via ::cuGraphClone. \p hOriginalNode must have been in the original |
| * graph at the time of the call to ::cuGraphClone, and the corresponding cloned node in |
| * \p hClonedGraph must not have been removed. The cloned node is then returned via \p phNode. |
| * |
| * \param phNode - Returns handle to the cloned node |
| * \param hOriginalNode - Handle to the original node |
| * \param hClonedGraph - Cloned graph to query |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphClone |
| */ |
| CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); |
| |
| /** |
| * \brief Returns a node's type |
| * |
| * Returns the node type of \p hNode in \p type. |
| * |
| * \param hNode - Node to query |
| * \param type - Pointer to return the node type |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphGetNodes, |
| * ::cuGraphGetRootNodes, |
| * ::cuGraphChildGraphNodeGetGraph, |
| * ::cuGraphKernelNodeGetParams, |
| * ::cuGraphKernelNodeSetParams, |
| * ::cuGraphHostNodeGetParams, |
| * ::cuGraphHostNodeSetParams, |
| * ::cuGraphMemcpyNodeGetParams, |
| * ::cuGraphMemcpyNodeSetParams, |
| * ::cuGraphMemsetNodeGetParams, |
| * ::cuGraphMemsetNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); |
| |
| /** |
| * \brief Returns a graph's nodes |
| * |
| * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this |
| * function will return the number of nodes in \p numNodes. Otherwise, |
| * \p numNodes entries will be filled in. If \p numNodes is higher than the actual |
| * number of nodes, the remaining entries in \p nodes will be set to NULL, and the |
| * number of nodes actually obtained will be returned in \p numNodes. |
| * |
| * \param hGraph - Graph to query |
| * \param nodes - Pointer to return the nodes |
| * \param numNodes - See description |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphCreate, |
| * ::cuGraphGetRootNodes, |
| * ::cuGraphGetEdges, |
| * ::cuGraphNodeGetType, |
| * ::cuGraphNodeGetDependencies, |
| * ::cuGraphNodeGetDependentNodes |
| */ |
| CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); |
| |
| /** |
| * \brief Returns a graph's root nodes |
| * |
| * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this |
| * function will return the number of root nodes in \p numRootNodes. Otherwise, |
| * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual |
| * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the |
| * number of nodes actually obtained will be returned in \p numRootNodes. |
| * |
| * \param hGraph - Graph to query |
| * \param rootNodes - Pointer to return the root nodes |
| * \param numRootNodes - See description |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphCreate, |
| * ::cuGraphGetNodes, |
| * ::cuGraphGetEdges, |
| * ::cuGraphNodeGetType, |
| * ::cuGraphNodeGetDependencies, |
| * ::cuGraphNodeGetDependentNodes |
| */ |
| CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); |
| |
| /** |
| * \brief Returns a graph's dependency edges |
| * |
| * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding |
| * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the |
| * node in \p from[i]. \p from and \p to may both be NULL, in which |
| * case this function only returns the number of edges in \p numEdges. Otherwise, |
| * \p numEdges entries will be filled in. If \p numEdges is higher than the actual |
| * number of edges, the remaining entries in \p from and \p to will be set to NULL, and |
| * the number of edges actually returned will be written to \p numEdges. |
| * |
| * \param hGraph - Graph to get the edges from |
| * \param from - Location to return the edge source nodes (the nodes depended upon) |
| * \param to - Location to return the edge destination nodes (the dependent nodes) |
| * \param numEdges - See description |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphGetNodes, |
| * ::cuGraphGetRootNodes, |
| * ::cuGraphAddDependencies, |
| * ::cuGraphRemoveDependencies, |
| * ::cuGraphNodeGetDependencies, |
| * ::cuGraphNodeGetDependentNodes |
| */ |
| CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); |
| |
| /** |
| * \brief Returns a node's dependencies |
| * |
| * Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this |
| * function will return the number of dependencies in \p numDependencies. Otherwise, |
| * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual |
| * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the |
| * number of nodes actually obtained will be returned in \p numDependencies. |
| * |
| * \param hNode - Node to query |
| * \param dependencies - Pointer to return the dependencies |
| * \param numDependencies - See description |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphNodeGetDependentNodes, |
| * ::cuGraphGetNodes, |
| * ::cuGraphGetRootNodes, |
| * ::cuGraphGetEdges, |
| * ::cuGraphAddDependencies, |
| * ::cuGraphRemoveDependencies |
| */ |
| CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); |
| |
| /** |
| * \brief Returns a node's dependent nodes |
| * |
| * Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which |
| * case this function will return the number of dependent nodes in \p numDependentNodes. |
| * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is |
| * higher than the actual number of dependent nodes, the remaining entries in |
| * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will |
| * be returned in \p numDependentNodes. |
| * |
| * \param hNode - Node to query |
| * \param dependentNodes - Pointer to return the dependent nodes |
| * \param numDependentNodes - See description |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphNodeGetDependencies, |
| * ::cuGraphGetNodes, |
| * ::cuGraphGetRootNodes, |
| * ::cuGraphGetEdges, |
| * ::cuGraphAddDependencies, |
| * ::cuGraphRemoveDependencies |
| */ |
| CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); |
| |
| /** |
| * \brief Adds dependency edges to a graph |
| * |
| * The number of dependencies to be added is defined by \p numDependencies. |
| * Elements in \p from and \p to at corresponding indices define a dependency. |
| * Each node in \p from and \p to must belong to \p hGraph. |
| * |
| * If \p numDependencies is 0, elements in \p from and \p to will be ignored. |
| * Specifying an existing dependency will return an error. |
| * |
| * \param hGraph - Graph to which dependencies are added |
| * \param from - Array of nodes that provide the dependencies |
| * \param to - Array of dependent nodes |
| * \param numDependencies - Number of dependencies to be added |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphRemoveDependencies, |
| * ::cuGraphGetEdges, |
| * ::cuGraphNodeGetDependencies, |
| * ::cuGraphNodeGetDependentNodes |
| */ |
| CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); |
| |
| /** |
| * \brief Removes dependency edges from a graph |
| * |
| * The number of dependencies to be removed is defined by \p numDependencies. |
| * Elements in \p from and \p to at corresponding indices define a dependency. |
| * Each node in \p from and \p to must belong to \p hGraph. |
| * |
| * If \p numDependencies is 0, elements in \p from and \p to will be ignored. |
| * Specifying a non-existing dependency will return an error. |
| * |
| * \param hGraph - Graph from which to remove dependencies |
| * \param from - Array of nodes that provide the dependencies |
| * \param to - Array of dependent nodes |
| * \param numDependencies - Number of dependencies to be removed |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphAddDependencies, |
| * ::cuGraphGetEdges, |
| * ::cuGraphNodeGetDependencies, |
| * ::cuGraphNodeGetDependentNodes |
| */ |
| CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); |
| |
| /** |
| * \brief Removes a node from the graph |
| * |
| * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes |
| * on \p hNode and vice versa. |
| * |
| * \param hNode - Node to remove |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphAddChildGraphNode, |
| * ::cuGraphAddEmptyNode, |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphAddHostNode, |
| * ::cuGraphAddMemcpyNode, |
| * ::cuGraphAddMemsetNode |
| */ |
| CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); |
| |
| /** |
| * \brief Creates an executable graph from a graph |
| * |
| * Instantiates \p hGraph as an executable graph. The graph is validated for any |
| * structural constraints or intra-node constraints which were not previously |
| * validated. If instantiation is successful, a handle to the instantiated graph |
| * is returned in \p phGraphExec. |
| * |
| * If there are any errors, diagnostic information may be returned in \p phErrorNode and |
| * \p logBuffer. This is the primary way to inspect instantiation errors. The output |
| * will be null terminated unless the diagnostics overflow |
| * the buffer. In this case, they will be truncated, and the last byte can be |
| * inspected to determine if truncation occurred. |
| * |
| * \param phGraphExec - Returns instantiated graph |
| * \param hGraph - Graph to instantiate |
| * \param phErrorNode - In case of an instantiation error, this may be modified to |
| * indicate a node contributing to the error |
| * \param logBuffer - A character buffer to store diagnostic messages |
| * \param bufferSize - Size of the log buffer in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphCreate, |
| * ::cuGraphLaunch, |
| * ::cuGraphExecDestroy |
| */ |
| CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); |
| |
| /** |
| * \brief Sets the parameters for a kernel node in the given graphExec |
| * |
| * Sets the parameters of a kernel node in an executable graph \p hGraphExec. |
| * The node is identified by the corresponding node \p hNode in the |
| * non-executable graph, from which the executable graph was instantiated. |
| * |
| * \p hNode must not have been removed from the original graph. The \p func field |
| * of \p nodeParams cannot be modified and must match the original value. |
| * All other values can be modified. |
| * |
| * The modifications only affect future launches of \p hGraphExec. Already |
| * enqueued or running launches of \p hGraphExec are not affected by this call. |
| * \p hNode is also not modified by this call. |
| * |
| * \param hGraphExec - The executable graph in which to set the specified node |
| * \param hNode - Kernel node from the graph from which \p hGraphExec was instantiated |
| * \param nodeParams - Updated parameters to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphAddKernelNode, |
| * ::cuGraphKernelNodeSetParams, |
| * ::cuGraphInstantiate |
| */ |
| CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Sets the parameters for a memcpy node in the given graphExec. |
| * |
| * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had |
| * contained \p copyParams at instantiation. \p hNode must remain in the graph which was |
| * used to instantiate \p hGraphExec. Changed edges to and from \p hNode are ignored. |
| * |
| * The source and destination memory in \p copyParams must be allocated from the same |
| * contexts as the original source and destination memory. Both the instantiation-time |
| * memory operands and the memory operands in \p copyParams must be 1-dimensional. |
| * Zero-length operations are not supported. |
| * |
| * The modifications only affect future launches of \p hGraphExec. Already enqueued |
| * or running launches of \p hGraphExec are not affected by this call. \p hNode is also |
| * not modified by this call. |
| * |
| * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or |
| * either the original or new memory operands are multidimensional. |
| * |
| * \param hGraphExec - The executable graph in which to set the specified node |
| * \param hNode - Memcpy node from the graph which was used to instantiate \p hGraphExec |
| * \param copyParams - The updated parameters to set |
| * \param ctx - Context on which to run the node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphInstantiate, |
| * ::cuGraphExecKernelNodeSetParams, |
| * ::cuGraphExecMemsetNodeSetParams, |
| * ::cuGraphExecHostNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); |
| |
| /** |
| * \brief Sets the parameters for a memset node in the given graphExec. |
| * |
| * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had |
| * contained \p memsetParams at instantiation. \p hNode must remain in the graph which was |
| * used to instantiate \p hGraphExec. Changed edges to and from \p hNode are ignored. |
| * |
| * The destination memory in \p memsetParams must be allocated from the same |
| * contexts as the original destination memory. Both the instantiation-time |
| * memory operand and the memory operand in \p memsetParams must be 1-dimensional. |
| * Zero-length operations are not supported. |
| * |
| * The modifications only affect future launches of \p hGraphExec. Already enqueued |
| * or running launches of \p hGraphExec are not affected by this call. \p hNode is also |
| * not modified by this call. |
| * |
| * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or |
| * either the original or new memory operand is multidimensional. |
| * |
| * \param hGraphExec - The executable graph in which to set the specified node |
| * \param hNode - Memset node from the graph which was used to instantiate \p hGraphExec |
| * \param memsetParams - The updated parameters to set |
| * \param ctx - Context on which to run the node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphInstantiate, |
| * ::cuGraphExecKernelNodeSetParams, |
| * ::cuGraphExecMemcpyNodeSetParams, |
| * ::cuGraphExecHostNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); |
| |
| /** |
| * \brief Sets the parameters for a host node in the given graphExec. |
| * |
| * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had |
| * contained \p nodeParams at instantiation. \p hNode must remain in the graph which was |
| * used to instantiate \p hGraphExec. Changed edges to and from \p hNode are ignored. |
| * |
| * The modifications only affect future launches of \p hGraphExec. Already enqueued |
| * or running launches of \p hGraphExec are not affected by this call. \p hNode is also |
| * not modified by this call. |
| * |
| * \param hGraphExec - The executable graph in which to set the specified node |
| * \param hNode - Host node from the graph which was used to instantiate \p hGraphExec |
| * \param nodeParams - The updated parameters to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphInstantiate, |
| * ::cuGraphExecKernelNodeSetParams, |
| * ::cuGraphExecMemcpyNodeSetParams, |
| * ::cuGraphExecMemsetNodeSetParams |
| */ |
| CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); |
| |
| /** |
| * \brief Launches an executable graph in a stream |
| * |
| * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing |
| * at a time. Each launch is ordered behind both any previous work in \p hStream |
| * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be |
| * instantiated multiple times into multiple executable graphs. |
| * |
| * \param hGraphExec - Executable graph to launch |
| * \param hStream - Stream in which to launch the graph |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphInstantiate, |
| * ::cuGraphExecDestroy |
| */ |
| CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); |
| |
| /** |
| * \brief Destroys an executable graph |
| * |
| * Destroys the executable graph specified by \p hGraphExec, as well |
| * as all of its executable nodes. If the executable graph is |
| * in-flight, it will not be terminated, but rather freed |
| * asynchronously on completion. |
| * |
| * \param hGraphExec - Executable graph to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphInstantiate, |
| * ::cuGraphLaunch |
| */ |
| CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); |
| |
| /** |
| * \brief Destroys a graph |
| * |
| * Destroys the graph specified by \p hGraph, as well as all of its nodes. |
| * |
| * \param hGraph - Graph to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphCreate |
| */ |
| CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); |
| |
| /** |
| * \brief Check whether an executable graph can be updated with a graph and perform the update if possible |
| * |
| * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the |
| * node parameters in a topologically identical graph specified by \p hGraph. |
| * |
| * Limitations: |
| * |
| * - Kernel nodes: |
| * - The function must not change (same restriction as cuGraphExecKernelNodeSetParams()) |
| * - Memset and memcpy nodes: |
| * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. |
| * - The source/destination memory must be allocated from the same contexts as the original |
| * source/destination memory. |
| * - Only 1D memsets can be changed. |
| * - Additional memcpy node restrictions: |
| * - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE, |
| * CU_MEMORYTYPE_ARRAY, etc.) is not supported. |
| * |
| * Note: The API may add further restrictions in future releases. The return code should always be checked. |
| * |
| * Some node types are not currently supported: |
| * - Empty graph nodes (CU_GRAPH_NODE_TYPE_EMPTY) |
| * - Child graphs (CU_GRAPH_NODE_TYPE_GRAPH). |
| * |
| * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under |
| * the following conditions: |
| * |
| * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out |
| * is NULL. |
| * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out |
| * is NULL. |
| * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is |
| * the pairless node from \p hGraph. |
| * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph. |
| * |
| * cuGraphExecUpdate sets \p updateResult_out to: |
| * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value. |
| * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed. |
| * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case |
| * \p hErrorNode_out is set to the node from \p hGraph. |
| * - CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED if the func field of a kernel changed, in which |
| * case \p hErrorNode_out is set to the node from \p hGraph. |
| * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way |
| * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. |
| * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like |
| * the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph. |
| * |
| * If \p updateResult_out isn't set in one of the situations described above, the update check passes |
| * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens |
| * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise, |
| * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS. |
| * |
| * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns |
| * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included |
| * changes which violated constraints specific to instantiated graph update. |
| * |
| * \param hGraphExec The instantiated graph to be updated |
| * \param hGraph The graph containing the updated parameters |
| * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any |
| * \param updateResult_out Whether the graph update was permitted. If it was forbidden, the reason why |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE |
| * \note_graph_thread_safety |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphInstantiate |
| */ |
| CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); |
| |
| /** |
| * \brief Copies attributes from source node to destination node. |
| * |
| * Copies attributes from source node \p src to destination node \p dst. |
| * Both nodes must have the same context. |
| * For the list of attributes see ::CUkernelNodeAttrID. |
| * |
| * \param[out] dst Destination node |
| * \param[in] src Source node |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::CUaccessPolicyWindow |
| */ |
| CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src); |
| |
| /** |
| * \brief Queries node attribute. |
| * |
| * Queries attribute \p attr from node \p hNode and stores it in the corresponding |
| * member of \p value_out. |
| * |
| * \param[in] hNode Node to query |
| * \param[in] attr Attribute to query |
| * \param[out] value_out Location in which the queried attribute is stored |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa |
| * ::CUaccessPolicyWindow |
| */ |
| CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, |
| CUkernelNodeAttrValue *value_out); |
| |
| /** |
| * \brief Sets node attribute. |
| * |
| * Sets attribute \p attr on node \p hNode from the corresponding attribute of |
| * \p value. |
| * |
| * \param[out] hNode Node on which the attribute is set |
| * \param[in] attr Attribute to set |
| * \param[in] value Source of the attribute value (read-only; passed as a const pointer) |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE |
| * \notefnerr |
| * |
| * \sa |
| * ::CUaccessPolicyWindow |
| */ |
| CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, |
| const CUkernelNodeAttrValue *value); |
| |
| /** @} */ /* END CUDA_GRAPH */ |
| |
| /** |
| * \defgroup CUDA_OCCUPANCY Occupancy |
| * |
| * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver |
| * API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the occupancy calculation functions of the low-level CUDA |
| * driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Returns occupancy of a function |
| * |
| * Returns in \p *numBlocks the maximum number of active blocks per |
| * streaming multiprocessor. |
| * |
| * \param numBlocks - Returned occupancy |
| * \param func - Kernel for which occupancy is calculated |
| * \param blockSize - Block size the kernel is intended to be launched with |
| * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa |
| * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor |
| */ |
| CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); |
| |
| /** |
| * \brief Returns occupancy of a function |
| * |
| * Returns in \p *numBlocks the maximum number of active blocks per |
| * streaming multiprocessor. |
| * |
| * The \p flags parameter controls how special cases are handled. The |
| * valid flags are: |
| * |
| * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as |
| * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; |
| * |
| * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the |
| * default behavior on platforms where global caching affects |
| * occupancy. On such platforms, if caching is enabled, but |
| * per-block SM resource usage would result in zero occupancy, the |
| * occupancy calculator will calculate the occupancy as if caching |
| * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes |
| * the occupancy calculator return 0 in such cases. More information |
| * can be found about this feature in the "Unified L1/Texture Cache" |
| * section of the Maxwell tuning guide. |
| * |
| * \param numBlocks - Returned occupancy |
| * \param func - Kernel for which occupancy is calculated |
| * \param blockSize - Block size the kernel is intended to be launched with |
| * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes |
| * \param flags - Requested behavior for the occupancy calculator |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa |
| * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags |
| */ |
| CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); |
| |
| /** |
| * \brief Suggest a launch configuration with reasonable occupancy |
| * |
| * Returns in \p *blockSize a reasonable block size that can achieve |
| * the maximum occupancy (or, the maximum number of active warps with |
| * the fewest blocks per multiprocessor), and in \p *minGridSize the |
| * minimum grid size to achieve the maximum occupancy. |
| * |
| * If \p blockSizeLimit is 0, the configurator will use the maximum |
| * block size permitted by the device / function instead. |
| * |
| * If per-block dynamic shared memory allocation is not needed, the |
| * user should leave both \p blockSizeToDynamicSMemSize and \p |
| * dynamicSMemSize as 0. |
| * |
| * If per-block dynamic shared memory allocation is needed, then if |
| * the dynamic shared memory size is constant regardless of block |
| * size, the size should be passed through \p dynamicSMemSize, and \p |
| * blockSizeToDynamicSMemSize should be NULL. |
| * |
| * Otherwise, if the per-block dynamic shared memory size varies with |
| * different block sizes, the user needs to provide a unary function |
| * through \p blockSizeToDynamicSMemSize that computes the dynamic |
| * shared memory needed by \p func for any given block size. \p |
| * dynamicSMemSize is ignored. An example signature is: |
| * |
| * \code |
| * // Take block size, returns dynamic shared memory needed |
| * size_t blockToSmem(int blockSize); |
| * \endcode |
| * |
| * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy |
| * \param blockSize - Returned maximum block size that can achieve the maximum occupancy |
| * \param func - Kernel for which launch configuration is calculated |
| * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size |
| * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes |
| * \param blockSizeLimit - The maximum block size \p func is designed to handle |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa |
| * ::cudaOccupancyMaxPotentialBlockSize |
| */ |
| CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); |
| |
| /** |
| * \brief Suggest a launch configuration with reasonable occupancy |
| * |
| * An extended version of ::cuOccupancyMaxPotentialBlockSize. In |
| * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, |
| * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p flags |
| * parameter. |
| * |
| * The \p flags parameter controls how special cases are handled. The |
| * valid flags are: |
| * |
| * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as |
| * ::cuOccupancyMaxPotentialBlockSize; |
| * |
| * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the |
| * default behavior on platforms where global caching affects |
| * occupancy. On such platforms, the launch configuration that |
| * produces maximal occupancy might not support global |
| * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE |
| * guarantees that the produced launch configuration is global |
| * caching compatible at a potential cost of occupancy. More information |
| * can be found about this feature in the "Unified L1/Texture Cache" |
| * section of the Maxwell tuning guide. |
| * |
| * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy |
| * \param blockSize - Returned maximum block size that can achieve the maximum occupancy |
| * \param func - Kernel for which launch configuration is calculated |
| * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size |
| * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes |
| * \param blockSizeLimit - The maximum block size \p func is designed to handle |
| * \param flags - Options |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa |
| * ::cudaOccupancyMaxPotentialBlockSizeWithFlags |
| */ |
| CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); |
| |
| /** |
| * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on an SM |
| * |
| * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. |
| * |
| * \param dynamicSmemSize - Returned maximum dynamic shared memory |
| * \param func - Kernel function for which occupancy is calculated |
| * \param numBlocks - Number of blocks to fit on SM |
| * \param blockSize - Size of the blocks |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa |
| * ::cuOccupancyMaxPotentialBlockSize, |
| * ::cuOccupancyMaxPotentialBlockSizeWithFlags, |
| * ::cudaOccupancyAvailableDynamicSMemPerBlock |
| */ |
| CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); |
| |
| /** @} */ /* END CUDA_OCCUPANCY */ |
| |
| /** |
| * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] |
| * |
| * ___MANBRIEF___ deprecated texture reference management functions of the |
| * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the deprecated texture reference management |
| * functions of the low-level CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Binds an array as a texture reference |
| * |
| * \deprecated |
| * |
| * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any |
| * previous address or CUDA array state associated with the texture reference |
| * is superseded by this function. \p Flags must be set to |
| * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is |
| * unbound. |
| * |
| * \param hTexRef - Texture reference to bind |
| * \param hArray - Array to bind |
| * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTextureToArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); |
| |
| /** |
| * \brief Binds a mipmapped array to a texture reference |
| * |
| * \deprecated |
| * |
| * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. |
| * Any previous address or CUDA array state associated with the texture reference |
| * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. |
| * Any CUDA array previously bound to \p hTexRef is unbound. |
| * |
| * \param hTexRef - Texture reference to bind |
| * \param hMipmappedArray - Mipmapped array to bind |
| * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); |
| |
| /** |
| * \brief Binds an address as a texture reference |
| * |
| * \deprecated |
| * |
| * Binds a linear address range to the texture reference \p hTexRef. Any |
| * previous address or CUDA array state associated with the texture reference |
| * is superseded by this function. Any memory previously bound to \p hTexRef |
| * is unbound. |
| * |
| * Since the hardware enforces an alignment requirement on texture base |
| * addresses, ::cuTexRefSetAddress() passes back a byte offset in |
| * \p *ByteOffset that must be applied to texture fetches in order to read from |
| * the desired memory. This offset must be divided by the texel size and |
| * passed to kernels that read from the texture so that it can be applied to the |
| * ::tex1Dfetch() function. |
| * |
| * If the device memory pointer was returned from ::cuMemAlloc(), the offset |
| * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. |
| * |
| * The total number of elements (or texels) in the linear address range |
| * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. |
| * The number of elements is computed as (\p bytes / bytesPerElement), |
| * where bytesPerElement is determined from the data format and number of |
| * components set using ::cuTexRefSetFormat(). |
| * |
| * \param ByteOffset - Returned byte offset |
| * \param hTexRef - Texture reference to bind |
| * \param dptr - Device pointer to bind |
| * \param bytes - Size of memory to bind in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTexture |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); |
| |
| /** |
| * \brief Binds an address as a 2D texture reference |
| * |
| * \deprecated |
| * |
| * Binds a linear address range to the texture reference \p hTexRef. Any |
| * previous address or CUDA array state associated with the texture reference |
| * is superseded by this function. Any memory previously bound to \p hTexRef |
| * is unbound. |
| * |
| * Using a ::tex2D() function inside a kernel requires a call to either |
| * ::cuTexRefSetArray() to bind the corresponding texture reference to an |
| * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear |
| * memory. |
| * |
| * Function calls to ::cuTexRefSetFormat() cannot follow calls to |
| * ::cuTexRefSetAddress2D() for the same texture reference. |
| * |
| * It is required that \p dptr be aligned to the appropriate hardware-specific |
| * texture alignment. You can query this value using the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is |
| * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. |
| * This value can be queried using the device attribute |
| * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is |
| * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * Width and Height, which are specified in elements (or texels), cannot exceed |
| * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and |
| * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. |
| * \p Pitch, which is specified in bytes, cannot exceed |
| * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. |
| * |
| * \param hTexRef - Texture reference to bind |
| * \param desc - Descriptor of CUDA array |
| * \param dptr - Device pointer to bind |
| * \param Pitch - Line pitch in bytes |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTexture2D |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); |
| |
| /** |
| * \brief Sets the format for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the format of the data to be read by the texture reference |
| * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the |
| * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: |
| * They specify the format of each component and the number of components per |
| * array element. |
| * |
| * \param hTexRef - Texture reference |
| * \param fmt - Format to set |
| * \param NumPackedComponents - Number of components per array element |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaCreateChannelDesc, |
| * ::cudaBindTexture, |
| * ::cudaBindTexture2D, |
| * ::cudaBindTextureToArray, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); |
| |
| /** |
| * \brief Sets the addressing mode for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the addressing mode \p am for the given dimension \p dim of the |
| * texture reference \p hTexRef. If \p dim is zero, the addressing mode is |
| * applied to the first parameter of the functions used to fetch from the |
| * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined |
| * as: |
| * \code |
| typedef enum CUaddress_mode_enum { |
| CU_TR_ADDRESS_MODE_WRAP = 0, |
| CU_TR_ADDRESS_MODE_CLAMP = 1, |
| CU_TR_ADDRESS_MODE_MIRROR = 2, |
| CU_TR_ADDRESS_MODE_BORDER = 3 |
| } CUaddress_mode; |
| * \endcode |
| * |
| * Note that this call has no effect if \p hTexRef is bound to linear memory. |
| * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only |
| * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. |
| * |
| * \param hTexRef - Texture reference |
| * \param dim - Dimension |
| * \param am - Addressing mode to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTexture, |
| * ::cudaBindTexture2D, |
| * ::cudaBindTextureToArray, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); |
| |
| /** |
| * \brief Sets the filtering mode for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the filtering mode \p fm to be used when reading memory through |
| * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: |
| * |
| * \code |
| typedef enum CUfilter_mode_enum { |
| CU_TR_FILTER_MODE_POINT = 0, |
| CU_TR_FILTER_MODE_LINEAR = 1 |
| } CUfilter_mode; |
| * \endcode |
| * |
| * Note that this call has no effect if \p hTexRef is bound to linear memory. |
| * |
| * \param hTexRef - Texture reference |
| * \param fm - Filtering mode to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTextureToArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); |
| |
| /** |
| * \brief Sets the mipmap filtering mode for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the mipmap filtering mode \p fm to be used when reading memory through |
| * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: |
| * |
| * \code |
| typedef enum CUfilter_mode_enum { |
| CU_TR_FILTER_MODE_POINT = 0, |
| CU_TR_FILTER_MODE_LINEAR = 1 |
| } CUfilter_mode; |
| * \endcode |
| * |
| * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. |
| * |
| * \param hTexRef - Texture reference |
| * \param fm - Filtering mode to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); |
| |
| /** |
| * \brief Sets the mipmap level bias for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when |
| * reading memory through the texture reference \p hTexRef. |
| * |
| * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. |
| * |
| * \param hTexRef - Texture reference |
| * \param bias - Mipmap level bias |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); |
| |
| /** |
| * \brief Sets the min/max mipmap level clamps for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp |
| * respectively, to be used when reading memory through the texture reference |
| * \p hTexRef. |
| * |
| * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. |
| * |
| * \param hTexRef - Texture reference |
| * \param minMipmapLevelClamp - Mipmap min level clamp |
| * \param maxMipmapLevelClamp - Mipmap max level clamp |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); |
| |
| /** |
| * \brief Sets the maximum anisotropy for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through |
| * the texture reference \p hTexRef. |
| * |
| * Note that this call has no effect if \p hTexRef is bound to linear memory. |
| * |
| * \param hTexRef - Texture reference |
| * \param maxAniso - Maximum anisotropy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTextureToArray, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); |
| |
| /** |
| * \brief Sets the border color for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies the RGBA border color of the texture reference \p hTexRef via |
| * \p pBorderColor. The color value supports only float type and holds color components in |
| * the following sequence: |
| * pBorderColor[0] holds 'R' component |
| * pBorderColor[1] holds 'G' component |
| * pBorderColor[2] holds 'B' component |
| * pBorderColor[3] holds 'A' component |
| * |
| * Note that the color values can be set only when the address mode is set to |
| * ::CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. |
| * Applications using integer border color values have to "reinterpret_cast" their values to float. |
| * |
| * \param hTexRef - Texture reference |
| * \param pBorderColor - RGBA color |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddressMode, |
| * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, |
| * ::cudaBindTexture, |
| * ::cudaBindTexture2D, |
| * ::cudaBindTextureToArray, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); |
| |
| /** |
| * \brief Sets the flags for a texture reference |
| * |
| * \deprecated |
| * |
| * Specifies optional flags via \p Flags to specify the behavior of data |
| * returned through the texture reference \p hTexRef. The valid flags are: |
| * |
| * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of |
| * having the texture promote integer data to floating point data in the |
| * range [0, 1]. Note that textures with a 32-bit integer format |
| * are not promoted, regardless of whether or not this |
| * flag is specified; |
| * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the |
| * default behavior of having the texture coordinates range |
| * from [0, Dim) where Dim is the width or height of the CUDA |
| * array. Instead, the texture coordinates [0, 1.0) reference |
| * the entire breadth of the array dimension; |
| * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear |
| * filtering optimizations. Trilinear optimizations improve texture filtering |
| * performance by allowing bilinear filtering on textures in scenarios where |
| * it can closely approximate the expected results. |
| * |
| * \param hTexRef - Texture reference |
| * \param Flags - Optional flags to set |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, |
| * ::cudaBindTexture, |
| * ::cudaBindTexture2D, |
| * ::cudaBindTextureToArray, |
| * ::cudaBindTextureToMipmappedArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); |
| |
| /** |
| * \brief Gets the address associated with a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p *pdptr the base address bound to the texture reference |
| * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference |
| * is not bound to any device memory range. |
| * |
| * \param pdptr - Returned device address |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the array bound to a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p *phArray the CUDA array bound to the texture reference |
| * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference |
| * is not bound to any CUDA array. |
| * |
| * \param phArray - Returned array |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the mipmapped array bound to a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture |
| * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference |
| * is not bound to any CUDA mipmapped array. |
| * |
| * \param phMipmappedArray - Returned mipmapped array |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the addressing mode used by a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p *pam the addressing mode corresponding to the |
| * dimension \p dim of the texture reference \p hTexRef. Currently, the only |
| * valid values for \p dim are 0 and 1. |
| * |
| * \param pam - Returned addressing mode |
| * \param hTexRef - Texture reference |
| * \param dim - Dimension |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); |
| |
| /** |
| * \brief Gets the filter-mode used by a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p *pfm the filtering mode of the texture reference |
| * \p hTexRef. |
| * |
| * \param pfm - Returned filtering mode |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the format used by a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p *pFormat and \p *pNumChannels the format and number |
| * of components of the CUDA array bound to the texture reference \p hTexRef. |
| * If \p pFormat or \p pNumChannels is NULL, it will be ignored. |
| * |
| * \param pFormat - Returned format |
| * \param pNumChannels - Returned number of components |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the mipmap filtering mode for a texture reference |
| * |
| * \deprecated |
| * |
| * Returns the mipmap filtering mode in \p pfm that's used when reading memory through |
| * the texture reference \p hTexRef. |
| * |
| * \param pfm - Returned mipmap filtering mode |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the mipmap level bias for a texture reference |
| * |
| * \deprecated |
| * |
| * Returns the mipmap level bias in \p pbias that's added to the specified mipmap |
| * level when reading memory through the texture reference \p hTexRef. |
| * |
| * \param pbias - Returned mipmap level bias |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the min/max mipmap level clamps for a texture reference |
| * |
| * \deprecated |
| * |
| * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp |
| * that are used when reading memory through the texture reference \p hTexRef. |
| * |
| * \param pminMipmapLevelClamp - Returned mipmap min level clamp |
| * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the maximum anisotropy for a texture reference |
| * |
| * \deprecated |
| * |
| * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through |
| * the texture reference \p hTexRef. |
| * |
| * \param pmaxAniso - Returned maximum anisotropy |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the border color used by a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p pBorderColor the values of the RGBA color used by |
| * the texture reference \p hTexRef. |
| * The color value is of type float and holds color components in |
| * the following sequence: |
| * - pBorderColor[0] holds 'R' component |
| * - pBorderColor[1] holds 'G' component |
| * - pBorderColor[2] holds 'B' component |
| * - pBorderColor[3] holds 'A' component |
| * |
| * \param pBorderColor - Returned RGBA color as four float components |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddressMode, |
| * ::cuTexRefSetBorderColor |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); |
| |
| /** |
| * \brief Gets the flags used by a texture reference |
| * |
| * \deprecated |
| * |
| * Returns in \p *pFlags the flags of the texture reference \p hTexRef. |
| * |
| * \param pFlags - Returned flags |
| * \param hTexRef - Texture reference |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefSetAddress, |
| * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, |
| * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, |
| * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, |
| * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); |
| |
| /** |
| * \brief Creates a texture reference |
| * |
| * \deprecated |
| * |
| * Creates a texture reference and returns its handle in \p *pTexRef. Once |
| * created, the application must call ::cuTexRefSetArray() or |
| * ::cuTexRefSetAddress() to associate the reference with allocated memory. |
| * Other texture reference functions are used to specify the format and |
| * interpretation (addressing, filtering, etc.) to be used when the memory is |
| * read through this texture reference. |
| * |
| * \param pTexRef - Returned texture reference handle |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefDestroy |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); |
| |
| /** |
| * \brief Destroys a texture reference |
| * |
| * \deprecated |
| * |
| * Destroys the texture reference specified by \p hTexRef. |
| * |
| * \param hTexRef - Texture reference to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuTexRefCreate |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); |
| |
| /** @} */ /* END CUDA_TEXREF_DEPRECATED */ |
| |
| |
| /** |
| * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] |
| * |
| * ___MANBRIEF___ surface reference management functions of the low-level CUDA |
| * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the surface reference management functions of the |
| * low-level CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Sets the CUDA array for a surface reference. |
| * |
| * \deprecated |
| * |
| * Sets the CUDA array \p hArray to be read and written by the surface reference |
| * \p hSurfRef. Any previous CUDA array state associated with the surface |
| * reference is superseded by this function. \p Flags must be set to 0. |
| * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. |
| * Any CUDA array previously bound to \p hSurfRef is unbound. |
| * |
| * \param hSurfRef - Surface reference handle |
| * \param hArray - CUDA array handle |
| * \param Flags - Must be set to 0 |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuModuleGetSurfRef, |
| * ::cuSurfRefGetArray, |
| * ::cudaBindSurfaceToArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); |
| |
| /** |
| * \brief Passes back the CUDA array bound to a surface reference. |
| * |
| * \deprecated |
| * |
| * Returns in \p *phArray the CUDA array bound to the surface reference |
| * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference |
| * is not bound to any CUDA array. |
| * |
| * \param phArray - Returned CUDA array bound to the surface reference |
| * \param hSurfRef - Surface reference handle |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray |
| */ |
| __CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); |
| |
| /** @} */ /* END CUDA_SURFREF_DEPRECATED */ |
| |
| /** |
| * \defgroup CUDA_TEXOBJECT Texture Object Management |
| * |
| * ___MANBRIEF___ texture object management functions of the low-level CUDA |
| * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the texture object management functions of the |
| * low-level CUDA driver application programming interface. The texture |
| * object API is only supported on devices of compute capability 3.0 or higher. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Creates a texture object |
| * |
| * Creates a texture object and returns it in \p *pTexObject. \p pResDesc describes |
| * the data to texture from. \p pTexDesc describes how the data should be sampled. |
| * \p pResViewDesc is an optional argument that specifies an alternate format for |
| * the data described by \p pResDesc, and also describes the subresource region |
| * to restrict access to when texturing. \p pResViewDesc can only be specified if |
| * the type of resource is a CUDA array or a CUDA mipmapped array. |
| * |
| * Texture objects are only supported on devices of compute capability 3.0 or higher. |
| * Additionally, a texture object is an opaque value, and, as such, should only be |
| * accessed through CUDA API calls. |
| * |
| * The ::CUDA_RESOURCE_DESC structure is defined as: |
| * \code |
| typedef struct CUDA_RESOURCE_DESC_st |
| { |
| CUresourcetype resType; |

| union { |
| struct { |
| CUarray hArray; |
| } array; |
| struct { |
| CUmipmappedArray hMipmappedArray; |
| } mipmap; |
| struct { |
| CUdeviceptr devPtr; |
| CUarray_format format; |
| unsigned int numChannels; |
| size_t sizeInBytes; |
| } linear; |
| struct { |
| CUdeviceptr devPtr; |
| CUarray_format format; |
| unsigned int numChannels; |
| size_t width; |
| size_t height; |
| size_t pitchInBytes; |
| } pitch2D; |
| } res; |

| unsigned int flags; |
| } CUDA_RESOURCE_DESC; |

| * \endcode |
| * where: |
| * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. |
| * ::CUresourcetype is defined as: |
| * \code |
| typedef enum CUresourcetype_enum { |
| CU_RESOURCE_TYPE_ARRAY = 0x00, |
| CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, |
| CU_RESOURCE_TYPE_LINEAR = 0x02, |
| CU_RESOURCE_TYPE_PITCH2D = 0x03 |
| } CUresourcetype; |
| * \endcode |
| * |
| * \par |
| * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray |
| * must be set to a valid CUDA array handle. |
| * |
| * \par |
| * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray |
| * must be set to a valid CUDA mipmapped array handle. |
| * |
| * \par |
| * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr |
| * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. |
| * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels |
| * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes |
| * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed |
| * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). |
| * |
| * \par |
| * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr |
| * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. |
| * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels |
| * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width |
| * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed |
| * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. |
| * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to |
| * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. |
| * |
| * - ::CUDA_RESOURCE_DESC::flags must be set to zero. |
| * |
| * |
| * The ::CUDA_TEXTURE_DESC struct is defined as |
| * \code |
| typedef struct CUDA_TEXTURE_DESC_st { |
| CUaddress_mode addressMode[3]; |
| CUfilter_mode filterMode; |
| unsigned int flags; |
| unsigned int maxAnisotropy; |
| CUfilter_mode mipmapFilterMode; |
| float mipmapLevelBias; |
| float minMipmapLevelClamp; |
| float maxMipmapLevelClamp; |
| } CUDA_TEXTURE_DESC; |
| * \endcode |
| * where |
| * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: |
| * \code |
| typedef enum CUaddress_mode_enum { |
| CU_TR_ADDRESS_MODE_WRAP = 0, |
| CU_TR_ADDRESS_MODE_CLAMP = 1, |
| CU_TR_ADDRESS_MODE_MIRROR = 2, |
| CU_TR_ADDRESS_MODE_BORDER = 3 |
| } CUaddress_mode; |
| * \endcode |
| * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag ::CU_TRSF_NORMALIZED_COORDINATES |
| * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. |
| * |
| * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. ::CUfilter_mode is defined as: |
| * \code |
| typedef enum CUfilter_mode_enum { |
| CU_TR_FILTER_MODE_POINT = 0, |
| CU_TR_FILTER_MODE_LINEAR = 1 |
| } CUfilter_mode; |
| * \endcode |
| * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. |
| * |
| * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: |
| * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of |
| * having the texture promote integer data to floating point data in the |
| * range [0, 1]. Note that textures with a 32-bit integer format are not |
| * promoted, regardless of whether or not this flag is specified. |
| * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior |
| * of having the texture coordinates range from [0, Dim) where Dim is the |
| * width or height of the CUDA array. Instead, the texture coordinates |
| * [0, 1.0) reference the entire breadth of the array dimension. Note that |
| * for CUDA mipmapped arrays, this flag has to be set. |
| * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear |
| * filtering optimizations. Trilinear optimizations improve texture filtering |
| * performance by allowing bilinear filtering on textures in scenarios where |
| * it can closely approximate the expected results. |
| * |
| * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be |
| * clamped to the range [1,16]. |
| * |
| * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. |
| * |
| * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. |
| * |
| * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. |
| * |
| * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. |
| * |
| * |
| * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as |
| * \code |
| typedef struct CUDA_RESOURCE_VIEW_DESC_st |
| { |
| CUresourceViewFormat format; |
| size_t width; |
| size_t height; |
| size_t depth; |
| unsigned int firstMipmapLevel; |
| unsigned int lastMipmapLevel; |
| unsigned int firstLayer; |
| unsigned int lastLayer; |
| } CUDA_RESOURCE_VIEW_DESC; |
| * \endcode |
| * where: |
| * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should |
| * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block |
| * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32 |
| * with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have |
| * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base |
| * format but with 4 channels. |
| * |
| * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block |
| * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, |
| * this value has to be equal to that of the original resource. |
| * |
| * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block |
| * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, |
| * this value has to be equal to that of the original resource. |
| * |
| * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the |
| * original resource. |
| * |
| * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. |
| * For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp |
| * will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, |
| * then the actual minimum mipmap level clamp will be 3.2. |
| * |
| * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value |
| * has to be zero. |
| * |
| * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. |
| * For non-layered resources, this value has to be zero. |
| * |
| * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, |
| * this value has to be zero. |
| * |
| * |
| * \param pTexObject - Texture object to create |
| * \param pResDesc - Resource descriptor |
| * \param pTexDesc - Texture descriptor |
| * \param pResViewDesc - Optional resource view descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuTexObjectDestroy, |
| * ::cudaCreateTextureObject |
| */ |
| CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); |
| |
| /** |
| * \brief Destroys a texture object |
| * |
| * Destroys the texture object specified by \p texObject. |
| * |
| * \param texObject - Texture object to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuTexObjectCreate, |
| * ::cudaDestroyTextureObject |
| */ |
| CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); |
| |
| /** |
| * \brief Returns a texture object's resource descriptor |
| * |
| * Returns in \p *pResDesc the resource descriptor for the texture object |
| * specified by \p texObject. |
| * |
| * \param pResDesc - Returned resource descriptor |
| * \param texObject - Texture object |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuTexObjectCreate, |
| * ::cudaGetTextureObjectResourceDesc |
| */ |
| CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); |
| |
| /** |
| * \brief Returns a texture object's texture descriptor |
| * |
| * Returns in \p *pTexDesc the texture descriptor for the texture object |
| * specified by \p texObject. |
| * |
| * \param pTexDesc - Returned texture descriptor |
| * \param texObject - Texture object |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuTexObjectCreate, |
| * ::cudaGetTextureObjectTextureDesc |
| */ |
| CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); |
| |
| /** |
| * \brief Returns a texture object's resource view descriptor |
| * |
| * Returns in \p *pResViewDesc the resource view descriptor for the texture |
| * object specified by \p texObject. |
| * If no resource view was set for \p texObject, ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * \param pResViewDesc - Returned resource view descriptor |
| * \param texObject - Texture object |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuTexObjectCreate, |
| * ::cudaGetTextureObjectResourceViewDesc |
| */ |
| CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); |
| |
| /** @} */ /* END CUDA_TEXOBJECT */ |
| |
| /** |
| * \defgroup CUDA_SURFOBJECT Surface Object Management |
| * |
| * ___MANBRIEF___ surface object management functions of the low-level CUDA |
| * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the surface object management functions of the |
| * low-level CUDA driver application programming interface. The surface |
| * object API is only supported on devices of compute capability 3.0 or higher. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Creates a surface object |
| * |
| * Creates a surface object and returns it in \p *pSurfObject. \p pResDesc describes |
| * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be |
| * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray |
| * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. |
| * |
| * Surface objects are only supported on devices of compute capability 3.0 or higher. |
| * Additionally, a surface object is an opaque value, and, as such, should only be |
| * accessed through CUDA API calls. |
| * |
| * \param pSurfObject - Surface object to create |
| * \param pResDesc - Resource descriptor |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuSurfObjectDestroy, |
| * ::cudaCreateSurfaceObject |
| */ |
| CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); |
| |
| /** |
| * \brief Destroys a surface object |
| * |
| * Destroys the surface object specified by \p surfObject. |
| * |
| * \param surfObject - Surface object to destroy |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuSurfObjectCreate, |
| * ::cudaDestroySurfaceObject |
| */ |
| CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); |
| |
| /** |
| * \brief Returns a surface object's resource descriptor |
| * |
| * Returns in \p *pResDesc the resource descriptor for the surface object |
| * specified by \p surfObject. |
| * |
| * \param pResDesc - Returned resource descriptor |
| * \param surfObject - Surface object |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * |
| * \sa |
| * ::cuSurfObjectCreate, |
| * ::cudaGetSurfaceObjectResourceDesc |
| */ |
| CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); |
| |
| /** @} */ /* END CUDA_SURFOBJECT */ |
| |
| /** |
| * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access |
| * |
| * ___MANBRIEF___ direct peer context memory access functions of the low-level |
| * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the direct peer context memory access functions |
| * of the low-level CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Queries if a device may directly access a peer device's memory. |
| * |
| * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of |
| * directly accessing memory from contexts on \p peerDev and 0 otherwise. |
| * If direct access of \p peerDev from \p dev is possible, then access may be |
| * enabled between two specific contexts by calling ::cuCtxEnablePeerAccess(). |
| * |
| * \param canAccessPeer - Returned access capability |
| * \param dev - Device from which allocations on \p peerDev are to |
| * be directly accessed. |
| * \param peerDev - Device on which the allocations to be directly accessed |
| * by \p dev reside. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_DEVICE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuCtxEnablePeerAccess, |
| * ::cuCtxDisablePeerAccess, |
| * ::cudaDeviceCanAccessPeer |
| */ |
| CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); |
| |
| /** |
| * \brief Enables direct access to memory allocations in a peer context. |
| * |
| * If both the current context and \p peerContext are on devices which support unified |
| * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and have |
| * the same major compute capability, then on success all allocations from \p peerContext will |
| * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional |
| * details. |
| * |
| * Note that access granted by this call is unidirectional and that in order to access |
| * memory from the current context in \p peerContext, a separate symmetric call |
| * to ::cuCtxEnablePeerAccess() is required. |
| * |
| * Note that there are both device-wide and system-wide limitations per system |
| * configuration, as noted in the CUDA Programming Guide under the section |
| * "Peer-to-Peer Memory Access". |
| * |
| * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates |
| * that the ::CUdevice of the current context cannot directly access memory |
| * from the ::CUdevice of \p peerContext. |
| * |
| * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of |
| * \p peerContext from the current context has already been enabled. |
| * |
| * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible |
| * because hardware resources required for peer access have been exhausted. |
| * |
| * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext |
| * is not a valid context, or if the current context is \p peerContext. |
| * |
| * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. |
| * |
| * \param peerContext - Peer context to enable direct access to from the current context |
| * \param Flags - Reserved for future use and must be set to 0 |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, |
| * ::CUDA_ERROR_TOO_MANY_PEERS, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceCanAccessPeer, |
| * ::cuCtxDisablePeerAccess, |
| * ::cudaDeviceEnablePeerAccess |
| */ |
| CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); |
| |
| /** |
| * \brief Disables direct access to memory allocations in a peer context and |
| * unregisters any registered allocations. |
| * |
| * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has |
| * not yet been enabled from \p peerContext to the current context. |
| * |
| * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if |
| * \p peerContext is not a valid context. |
| * |
| * \param peerContext - Peer context to disable direct access to |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, |
| * ::CUDA_ERROR_INVALID_CONTEXT |
| * \notefnerr |
| * |
| * \sa |
| * ::cuDeviceCanAccessPeer, |
| * ::cuCtxEnablePeerAccess, |
| * ::cudaDeviceDisablePeerAccess |
| */ |
| CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); |
| |
| /** |
| * \brief Queries attributes of the link between two devices. |
| * |
| * Returns in \p *value the value of the requested attribute \p attrib of the |
| * link between \p srcDevice and \p dstDevice. The supported attributes are: |
| * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the |
| * performance of the link between two devices. |
| * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled. |
| * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if atomic operations over |
| * the link are supported. |
| * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can |
| * be accessed over the link. |
| * |
| * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid |
| * or if they represent the same device. |
| * |
| * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is |
| * a null pointer. |
| * |
| * \param value - Returned value of the requested attribute |
| * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. |
| * \param srcDevice - The source device of the target link. |
| * \param dstDevice - The destination device of the target link. |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_DEVICE, |
| * ::CUDA_ERROR_INVALID_VALUE |
| * \notefnerr |
| * |
| * \sa |
| * ::cuCtxEnablePeerAccess, |
| * ::cuCtxDisablePeerAccess, |
| * ::cuDeviceCanAccessPeer, |
| * ::cudaDeviceGetP2PAttribute |
| */ |
| CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); |
| |
| /** @} */ /* END CUDA_PEER_ACCESS */ |
| |
| /** |
| * \defgroup CUDA_GRAPHICS Graphics Interoperability |
| * |
| * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA |
| * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ |
| * |
| * This section describes the graphics interoperability functions of the |
| * low-level CUDA driver application programming interface. |
| * |
| * @{ |
| */ |
| |
| /** |
| * \brief Unregisters a graphics resource for access by CUDA |
| * |
| * Unregisters the graphics resource \p resource so it is not accessible by |
| * CUDA unless registered again. |
| * |
| * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is |
| * returned. |
| * |
| * \param resource - Resource to unregister |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_UNKNOWN |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphicsD3D9RegisterResource, |
| * ::cuGraphicsD3D10RegisterResource, |
| * ::cuGraphicsD3D11RegisterResource, |
| * ::cuGraphicsGLRegisterBuffer, |
| * ::cuGraphicsGLRegisterImage, |
| * ::cudaGraphicsUnregisterResource |
| */ |
| CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); |
| |
| /** |
| * \brief Get an array through which to access a subresource of a mapped graphics resource. |
| * |
| * Returns in \p *pArray an array through which the subresource of the mapped |
| * graphics resource \p resource which corresponds to array index \p arrayIndex |
| * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may |
| * change every time that \p resource is mapped. |
| * |
| * If \p resource is not a texture then it cannot be accessed via an array and |
| * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. |
| * If \p arrayIndex is not a valid array index for \p resource then |
| * ::CUDA_ERROR_INVALID_VALUE is returned. |
| * If \p mipLevel is not a valid mipmap level for \p resource then |
| * ::CUDA_ERROR_INVALID_VALUE is returned. |
| * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. |
| * |
| * \param pArray - Returned array through which a subresource of \p resource may be accessed |
| * \param resource - Mapped resource to access |
| * \param arrayIndex - Array index for array textures or cubemap face |
| * index as defined by ::CUarray_cubemap_face for |
| * cubemap textures for the subresource to access |
| * \param mipLevel - Mipmap level for the subresource to access |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_MAPPED, |
| * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphicsResourceGetMappedPointer, |
| * ::cudaGraphicsSubResourceGetMappedArray |
| */ |
| CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); |
| |
| /** |
| * \brief Get a mipmapped array through which to access a mapped graphics resource. |
| * |
| * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics |
| * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change |
| * every time that \p resource is mapped. |
| * |
| * If \p resource is not a texture then it cannot be accessed via a mipmapped array and |
| * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. |
| * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. |
| * |
| * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed |
| * \param resource - Mapped resource to access |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_MAPPED, |
| * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphicsResourceGetMappedPointer, |
| * ::cudaGraphicsResourceGetMappedMipmappedArray |
| */ |
| CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); |
| |
| /** |
| * \brief Get a device pointer through which to access a mapped graphics resource. |
| * |
| * Returns in \p *pDevPtr a pointer through which the mapped graphics resource |
| * \p resource may be accessed. |
| * Returns in \p *pSize the size of the memory in bytes which may be accessed from that pointer. |
| * The value set in \p *pDevPtr may change every time that \p resource is mapped. |
| * |
| * If \p resource is not a buffer then it cannot be accessed via a pointer and |
| * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. |
| * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. |
| * |
| * \param pDevPtr - Returned pointer through which \p resource may be accessed |
| * \param pSize - Returned size of the buffer accessible starting at \p *pDevPtr |
| * \param resource - Mapped resource to access |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_MAPPED, |
| * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphicsMapResources, |
| * ::cuGraphicsSubResourceGetMappedArray, |
| * ::cudaGraphicsResourceGetMappedPointer |
| */ |
| CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); |
| |
| /** |
| * \brief Set usage flags for mapping a graphics resource |
| * |
| * Set \p flags for mapping the graphics resource \p resource. |
| * |
| * Changes to \p flags will take effect the next time \p resource is mapped. |
| * The \p flags argument may be any of the following: |
| * |
| * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this |
| * resource will be used. It is therefore assumed that this resource will be |
| * read from and written to by CUDA kernels. This is the default value. |
| * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which |
| * access this resource will not write to this resource. |
| * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels |
| * which access this resource will not read from this resource and will |
| * write over the entire contents of the resource, so none of the data |
| * previously stored in the resource will be preserved. |
| * |
| * If \p resource is presently mapped for access by CUDA then |
| * ::CUDA_ERROR_ALREADY_MAPPED is returned. |
| * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. |
| * |
| * \param resource - Registered resource to set flags for |
| * \param flags - Parameters for resource mapping |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_ALREADY_MAPPED |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphicsMapResources, |
| * ::cudaGraphicsResourceSetMapFlags |
| */ |
| CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); |
| |
| /** |
| * \brief Map graphics resources for access by CUDA |
| * |
| * Maps the \p count graphics resources in \p resources for access by CUDA. |
| * |
| * The resources in \p resources may be accessed by CUDA until they |
| * are unmapped. The graphics API from which \p resources were registered |
| * should not access any resources while they are mapped by CUDA. If an |
| * application does so, the results are undefined. |
| * |
| * This function provides the synchronization guarantee that any graphics calls |
| * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA |
| * work issued in \p hStream begins. |
| * |
| * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. |
| * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. |
| * |
| * \param count - Number of resources to map |
| * \param resources - Resources to map for CUDA usage |
| * \param hStream - Stream with which to synchronize |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_ALREADY_MAPPED, |
| * ::CUDA_ERROR_UNKNOWN |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphicsResourceGetMappedPointer, |
| * ::cuGraphicsSubResourceGetMappedArray, |
| * ::cuGraphicsUnmapResources, |
| * ::cudaGraphicsMapResources |
| */ |
| CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
| |
| /** |
| * \brief Unmap graphics resources. |
| * |
| * Unmaps the \p count graphics resources in \p resources. |
| * |
| * Once unmapped, the resources in \p resources may not be accessed by CUDA |
| * until they are mapped again. |
| * |
| * This function provides the synchronization guarantee that any CUDA work issued |
| * in \p hStream before ::cuGraphicsUnmapResources() will complete before any |
| * subsequently issued graphics work begins. |
| * |
| * |
| * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. |
| * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. |
| * |
| * \param count - Number of resources to unmap |
| * \param resources - Resources to unmap |
| * \param hStream - Stream with which to synchronize |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_HANDLE, |
| * ::CUDA_ERROR_NOT_MAPPED, |
| * ::CUDA_ERROR_UNKNOWN |
| * \note_null_stream |
| * \notefnerr |
| * |
| * \sa |
| * ::cuGraphicsMapResources, |
| * ::cudaGraphicsUnmapResources |
| */ |
| CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
| |
| /** @} */ /* END CUDA_GRAPHICS */ |
| /* Declared after the CUDA_GRAPHICS group is closed and carries no Doxygen block. |
| * NOTE(review): presumably stores in *ppExportTable a table selected by the |
| * UUID pExportTableId; the semantics are not described in this header - confirm |
| * with the owner before relying on it. */ |
| CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); |
| |
| |
| /** |
| * \brief Returns a module handle |
| * |
| * Returns in \p *hmod the handle of the module that function \p hfunc |
| * is located in. The module remains valid for the lifetime of the context |
| * it was loaded in, or until the module is explicitly unloaded. |
| * |
| * The CUDA runtime manages its own modules loaded into the primary context. |
| * If the handle returned by this API refers to a module loaded by the CUDA runtime, |
| * calling ::cuModuleUnload() on that module will result in undefined behavior. |
| * |
| * \param hmod - Returned module handle |
| * \param hfunc - Function to retrieve module for |
| * |
| * \return |
| * ::CUDA_SUCCESS, |
| * ::CUDA_ERROR_DEINITIALIZED, |
| * ::CUDA_ERROR_NOT_INITIALIZED, |
| * ::CUDA_ERROR_INVALID_CONTEXT, |
| * ::CUDA_ERROR_INVALID_VALUE, |
| * ::CUDA_ERROR_NOT_FOUND |
| * \notefnerr |
| * |
| */ |
| CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); |
| |
| |
| /** |
| * CUDA API versioning support |
| * |
| * When __CUDA_API_VERSION_INTERNAL is defined, every name in the #undef list |
| * below is undefined and then re-declared further down as a plain prototype, |
| * alongside the legacy *_v1 types that some of those prototypes use. |
| * NOTE(review): presumably these names are #defined earlier in this header to |
| * suffixed entry points and the #undefs strip that mapping - confirm against |
| * the macro definitions near the top of the file. |
| */ |
| #if defined(__CUDA_API_VERSION_INTERNAL) |
| #undef cuMemHostRegister |
| #undef cuGraphicsResourceSetMapFlags |
| #undef cuLinkCreate |
| #undef cuLinkAddData |
| #undef cuLinkAddFile |
| #undef cuDeviceTotalMem |
| #undef cuCtxCreate |
| #undef cuModuleGetGlobal |
| #undef cuMemGetInfo |
| #undef cuMemAlloc |
| #undef cuMemAllocPitch |
| #undef cuMemFree |
| #undef cuMemGetAddressRange |
| #undef cuMemAllocHost |
| #undef cuMemHostGetDevicePointer |
| #undef cuMemcpyHtoD |
| #undef cuMemcpyDtoH |
| #undef cuMemcpyDtoD |
| #undef cuMemcpyDtoA |
| #undef cuMemcpyAtoD |
| #undef cuMemcpyHtoA |
| #undef cuMemcpyAtoH |
| #undef cuMemcpyAtoA |
| #undef cuMemcpyHtoAAsync |
| #undef cuMemcpyAtoHAsync |
| #undef cuMemcpy2D |
| #undef cuMemcpy2DUnaligned |
| #undef cuMemcpy3D |
| #undef cuMemcpyHtoDAsync |
| #undef cuMemcpyDtoHAsync |
| #undef cuMemcpyDtoDAsync |
| #undef cuMemcpy2DAsync |
| #undef cuMemcpy3DAsync |
| #undef cuMemsetD8 |
| #undef cuMemsetD16 |
| #undef cuMemsetD32 |
| #undef cuMemsetD2D8 |
| #undef cuMemsetD2D16 |
| #undef cuMemsetD2D32 |
| #undef cuArrayCreate |
| #undef cuArrayGetDescriptor |
| #undef cuArray3DCreate |
| #undef cuArray3DGetDescriptor |
| #undef cuTexRefSetAddress |
| #undef cuTexRefSetAddress2D |
| #undef cuTexRefGetAddress |
| #undef cuGraphicsResourceGetMappedPointer |
| #undef cuCtxDestroy |
| #undef cuCtxPopCurrent |
| #undef cuCtxPushCurrent |
| #undef cuStreamDestroy |
| #undef cuEventDestroy |
| #undef cuMemcpy |
| #undef cuMemcpyAsync |
| #undef cuMemcpyPeer |
| #undef cuMemcpyPeerAsync |
| #undef cuMemcpy3DPeer |
| #undef cuMemcpy3DPeerAsync |
| #undef cuMemsetD8Async |
| #undef cuMemsetD16Async |
| #undef cuMemsetD32Async |
| #undef cuMemsetD2D8Async |
| #undef cuMemsetD2D16Async |
| #undef cuMemsetD2D32Async |
| #undef cuStreamGetPriority |
| #undef cuStreamGetFlags |
| #undef cuStreamGetCtx |
| #undef cuStreamWaitEvent |
| #undef cuStreamAddCallback |
| #undef cuStreamAttachMemAsync |
| #undef cuStreamQuery |
| #undef cuStreamSynchronize |
| #undef cuEventRecord |
| #undef cuLaunchKernel |
| #undef cuLaunchHostFunc |
| #undef cuGraphicsMapResources |
| #undef cuGraphicsUnmapResources |
| #undef cuStreamWriteValue32 |
| #undef cuStreamWaitValue32 |
| #undef cuStreamWriteValue64 |
| #undef cuStreamWaitValue64 |
| #undef cuStreamBatchMemOp |
| #undef cuMemPrefetchAsync |
| #undef cuLaunchCooperativeKernel |
| #undef cuSignalExternalSemaphoresAsync |
| #undef cuWaitExternalSemaphoresAsync |
| #undef cuStreamBeginCapture |
| #undef cuStreamEndCapture |
| #undef cuStreamIsCapturing |
| #undef cuStreamGetCaptureInfo |
| #undef cuGraphLaunch |
| #undef cuDevicePrimaryCtxRelease |
| #undef cuDevicePrimaryCtxReset |
| #undef cuDevicePrimaryCtxSetFlags |
| #undef cuStreamCopyAttributes |
| #undef cuStreamSetAttribute |
| #undef cuStreamGetAttribute |
| #undef cuGraphInstantiate |
| /* Prototypes under the bare (un-suffixed) names for host registration, map |
| * flags and the linker entry points, plus cuTexRefSetAddress2D_v2, which is |
| * declared with an explicit _v2 suffix and takes CUdeviceptr / size_t. */ |
| CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); |
| CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); |
| CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); |
| CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, |
| unsigned int numOptions, CUjit_option *options, void **optionValues); |
| CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, |
| unsigned int numOptions, CUjit_option *options, void **optionValues); |
| CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); |
| /* Legacy device pointer type: a plain unsigned int, used by the *_v1 structs |
| * and the un-suffixed legacy prototypes in this section. */ |
| typedef unsigned int CUdeviceptr_v1; |
| /* Legacy (v1) 2D memory copy descriptor: offsets, pitches and extents are |
| * unsigned int and device addresses are CUdeviceptr_v1. It is the parameter |
| * type of the un-suffixed cuMemcpy2D, cuMemcpy2DUnaligned and cuMemcpy2DAsync |
| * prototypes in this section. */ |
| typedef struct CUDA_MEMCPY2D_v1_st |
| { |
| unsigned int srcXInBytes; /**< Source X in bytes */ |
| unsigned int srcY; /**< Source Y */ |
| CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
| const void *srcHost; /**< Source host pointer */ |
| CUdeviceptr_v1 srcDevice; /**< Source device pointer */ |
| CUarray srcArray; /**< Source array reference */ |
| unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ |
| |
| unsigned int dstXInBytes; /**< Destination X in bytes */ |
| unsigned int dstY; /**< Destination Y */ |
| CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
| void *dstHost; /**< Destination host pointer */ |
| CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ |
| CUarray dstArray; /**< Destination array reference */ |
| unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ |
| |
| unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ |
| unsigned int Height; /**< Height of 2D memory copy */ |
| } CUDA_MEMCPY2D_v1; |
| /* Legacy (v1) 3D memory copy descriptor: offsets, pitches and extents are |
| * unsigned int and device addresses are CUdeviceptr_v1. It is the parameter |
| * type of the un-suffixed cuMemcpy3D and cuMemcpy3DAsync prototypes in this |
| * section. The reserved0 / reserved1 members must be NULL. */ |
| typedef struct CUDA_MEMCPY3D_v1_st |
| { |
| unsigned int srcXInBytes; /**< Source X in bytes */ |
| unsigned int srcY; /**< Source Y */ |
| unsigned int srcZ; /**< Source Z */ |
| unsigned int srcLOD; /**< Source LOD */ |
| CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ |
| const void *srcHost; /**< Source host pointer */ |
| CUdeviceptr_v1 srcDevice; /**< Source device pointer */ |
| CUarray srcArray; /**< Source array reference */ |
| void *reserved0; /**< Must be NULL */ |
| unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ |
| unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ |
| |
| unsigned int dstXInBytes; /**< Destination X in bytes */ |
| unsigned int dstY; /**< Destination Y */ |
| unsigned int dstZ; /**< Destination Z */ |
| unsigned int dstLOD; /**< Destination LOD */ |
| CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ |
| void *dstHost; /**< Destination host pointer */ |
| CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ |
| CUarray dstArray; /**< Destination array reference */ |
| void *reserved1; /**< Must be NULL */ |
| unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ |
| unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ |
| |
| unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ |
| unsigned int Height; /**< Height of 3D memory copy */ |
| unsigned int Depth; /**< Depth of 3D memory copy */ |
| } CUDA_MEMCPY3D_v1; |
| /* Legacy (v1) array descriptor with unsigned int dimensions. It is the |
| * descriptor type of the un-suffixed cuArrayCreate, cuArrayGetDescriptor and |
| * cuTexRefSetAddress2D prototypes in this section. */ |
| typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st |
| { |
| unsigned int Width; /**< Width of array */ |
| unsigned int Height; /**< Height of array */ |
| |
| CUarray_format Format; /**< Array format */ |
| unsigned int NumChannels; /**< Channels per array element */ |
| } CUDA_ARRAY_DESCRIPTOR_v1; |
| /* Legacy (v1) 3D array descriptor with unsigned int dimensions. It is the |
| * descriptor type of the un-suffixed cuArray3DCreate and |
| * cuArray3DGetDescriptor prototypes in this section. */ |
| typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st |
| { |
| unsigned int Width; /**< Width of 3D array */ |
| unsigned int Height; /**< Height of 3D array */ |
| unsigned int Depth; /**< Depth of 3D array */ |
| |
| CUarray_format Format; /**< Array format */ |
| unsigned int NumChannels; /**< Channels per array element */ |
| unsigned int Flags; /**< Flags */ |
| } CUDA_ARRAY3D_DESCRIPTOR_v1; |
| /* Un-suffixed legacy prototypes. Where they take sizes, offsets, pitches or |
| * device addresses, those are unsigned int and CUdeviceptr_v1, and the |
| * descriptor arguments are the *_v1 structs declared above. */ |
| CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); |
| CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); |
| CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); |
| CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); |
| CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); |
| CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); |
| CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); |
| CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); |
| CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); |
| CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); |
| CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); |
| CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); |
| CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); |
| CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); |
| CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); |
| CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); |
| CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); |
| CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); |
| CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); |
| CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); |
| CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); |
| CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); |
| CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); |
| CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); |
| CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); |
| CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); |
| CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); |
| CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); |
| CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); |
| CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); |
| /* Un-suffixed prototypes for context, stream, event and primary-context |
| * management; none of them involve device pointers or byte counts. */ |
| CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); |
| CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); |
| CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); |
| CUresult CUDAAPI cuStreamDestroy(CUstream hStream); |
| CUresult CUDAAPI cuEventDestroy(CUevent hEvent); |
| CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); |
| CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); |
| CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); |
| /* Explicitly _v2-suffixed memcpy/memset prototypes using CUdeviceptr and |
| * size_t (compare the un-suffixed legacy forms above, which use |
| * CUdeviceptr_v1 and unsigned int), followed by the un-suffixed cuMemcpy, |
| * cuMemcpyAsync, cuMemcpyPeer*, and cuMemcpy3DPeer* prototypes. */ |
| CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); |
| CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); |
| CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); |
| CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); |
| CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); |
| CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); |
| CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); |
| CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); |
| CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); |
| CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); |
| CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); |
| CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); |
| CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); |
| CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); |
| CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); |
| /* Un-suffixed prototypes for the asynchronous memset calls, followed by the |
| * stream, event, launch, graphics-map, stream-memory-op, external-semaphore, |
| * capture and graph entry points. cuStreamBeginCapture is declared in three |
| * forms: bare, _ptsz, and _v2 (which adds a CUstreamCaptureMode argument). */ |
| CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); |
| CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); |
| CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); |
| CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); |
| CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); |
| CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); |
| |
| CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); |
| CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); |
| CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); |
| CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); |
| CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); |
| CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); |
| CUresult CUDAAPI cuStreamQuery(CUstream hStream); |
| CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); |
| CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); |
| CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); |
| CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); |
| CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
| CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); |
| CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
| CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); |
| CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
| CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); |
| CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); |
| CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); |
| CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); |
| CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
| CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); |
| CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); |
| CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); |
| CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); |
| CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); |
| CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); |
| CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus, cuuint64_t *id); |
| CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); |
| CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); |
| CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); |
| CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); |
| CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); |
| #endif |
| |
| #ifdef __cplusplus |
| } |
| #endif |
| |
| #if defined(__GNUC__) |
| #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) |
| #pragma GCC visibility pop |
| #endif |
| #endif |
| |
| #undef __CUDA_DEPRECATED |
| |
| #endif /* __cuda_cuda_h__ */ |