From 4d8ff417e7c977a6d27df8ae6c5ab4a0d4b27b99 Mon Sep 17 00:00:00 2001 From: Michael Fabian 'Xaymar' Dirks Date: Sat, 11 Apr 2020 03:12:39 +0200 Subject: [PATCH] nvidia-cuda: Improve usage of CUDA resources and functions Load additional functions from CUDA and add new enumerations to support them: * cuDevicePrimaryCtxSetFlags allows us to set the scheduling mode for the GPU. * cuCtxGetStreamPriorityRange allows us to check which priority levels are supported. * cuStreamCreateWithPriority allows us to create streams with non-default priority. The scheduler mode is now set to yield so that other threads can do work when we hit an eventual stalling problem. Streams can also now be created with higher priority and different flags, if necessary. In most cases this should allow CUDA resources to execute even while the GPU is under heavy load. --- source/nvidia/cuda/nvidia-cuda-context.cpp | 3 ++ source/nvidia/cuda/nvidia-cuda-gs-texture.cpp | 4 +-- source/nvidia/cuda/nvidia-cuda-stream.cpp | 11 +++++-- source/nvidia/cuda/nvidia-cuda-stream.hpp | 4 ++- source/nvidia/cuda/nvidia-cuda.cpp | 3 ++ source/nvidia/cuda/nvidia-cuda.hpp | 32 +++++++++++++++---- 6 files changed, 46 insertions(+), 11 deletions(-) diff --git a/source/nvidia/cuda/nvidia-cuda-context.cpp b/source/nvidia/cuda/nvidia-cuda-context.cpp index 2b886b37..a4621d7f 100644 --- a/source/nvidia/cuda/nvidia-cuda-context.cpp +++ b/source/nvidia/cuda/nvidia-cuda-context.cpp @@ -70,6 +70,9 @@ nvidia::cuda::context::context(std::shared_ptr<::nvidia::cuda::cuda> cuda, ID3D1 if (cu_result res = _cuda->cuDevicePrimaryCtxRetain(&_ctx, _device); res != cu_result::SUCCESS) { throw std::runtime_error("Failed to acquire primary device context."); } + + _cuda->cuDevicePrimaryCtxSetFlags(_device, cu_context_flags::SCHEDULER_YIELD); + _has_device = true; } #endif diff --git a/source/nvidia/cuda/nvidia-cuda-gs-texture.cpp b/source/nvidia/cuda/nvidia-cuda-gs-texture.cpp index a94f5e97..ceea4c32 100644 --- 
a/source/nvidia/cuda/nvidia-cuda-gs-texture.cpp +++ b/source/nvidia/cuda/nvidia-cuda-gs-texture.cpp @@ -28,8 +28,8 @@ nvidia::cuda::gstexture::gstexture(std::shared_ptr cuda, std if (!cuda) throw std::invalid_argument("cuda"); - auto gtc = gs::context{}; - int dev_type = gs_get_device_type(); + gs::context gctx; + int dev_type = gs_get_device_type(); if (dev_type == GS_DEVICE_OPENGL) { // ToDo diff --git a/source/nvidia/cuda/nvidia-cuda-stream.cpp b/source/nvidia/cuda/nvidia-cuda-stream.cpp index 8aac943a..377a1818 100644 --- a/source/nvidia/cuda/nvidia-cuda-stream.cpp +++ b/source/nvidia/cuda/nvidia-cuda-stream.cpp @@ -20,9 +20,16 @@ #include "nvidia-cuda-stream.hpp" #include -nvidia::cuda::stream::stream(std::shared_ptr<::nvidia::cuda::cuda> cuda) : _cuda(cuda) +nvidia::cuda::stream::stream(std::shared_ptr<::nvidia::cuda::cuda> cuda, ::nvidia::cuda::cu_stream_flags flags, + std::int32_t priority) + : _cuda(cuda) { - nvidia::cuda::cu_result res = _cuda->cuStreamCreate(&_stream, 0); + nvidia::cuda::cu_result res; + if (priority == 0) { + res = _cuda->cuStreamCreate(&_stream, flags); + } else { + res = _cuda->cuStreamCreateWithPriority(&_stream, flags, priority); + } switch (res) { case nvidia::cuda::cu_result::SUCCESS: break; diff --git a/source/nvidia/cuda/nvidia-cuda-stream.hpp b/source/nvidia/cuda/nvidia-cuda-stream.hpp index 5d5e2674..5dd97db9 100644 --- a/source/nvidia/cuda/nvidia-cuda-stream.hpp +++ b/source/nvidia/cuda/nvidia-cuda-stream.hpp @@ -27,7 +27,9 @@ namespace nvidia::cuda { ::nvidia::cuda::cu_stream_t _stream; public: - stream(std::shared_ptr<::nvidia::cuda::cuda> cuda); + stream(std::shared_ptr<::nvidia::cuda::cuda> cuda, + ::nvidia::cuda::cu_stream_flags flags = ::nvidia::cuda::cu_stream_flags::DEFAULT, + std::int32_t priority = 0); ~stream(); ::nvidia::cuda::cu_stream_t get(); diff --git a/source/nvidia/cuda/nvidia-cuda.cpp b/source/nvidia/cuda/nvidia-cuda.cpp index 3187a819..1b3be614 100644 --- a/source/nvidia/cuda/nvidia-cuda.cpp +++ 
b/source/nvidia/cuda/nvidia-cuda.cpp @@ -61,10 +61,12 @@ nvidia::cuda::cuda::cuda() // Primary Context Management CUDA_LOAD_SYMBOL(cuDevicePrimaryCtxRetain); CUDA_LOAD_SYMBOL_V2(cuDevicePrimaryCtxRelease); + CUDA_LOAD_SYMBOL_V2(cuDevicePrimaryCtxSetFlags); // Context Management CUDA_LOAD_SYMBOL_V2(cuCtxDestroy); CUDA_LOAD_SYMBOL(cuCtxGetCurrent); + CUDA_LOAD_SYMBOL(cuCtxGetStreamPriorityRange); CUDA_LOAD_SYMBOL_V2(cuCtxPopCurrent); CUDA_LOAD_SYMBOL_V2(cuCtxPushCurrent); CUDA_LOAD_SYMBOL(cuCtxSetCurrent); @@ -93,6 +95,7 @@ nvidia::cuda::cuda::cuda() // Stream Managment CUDA_LOAD_SYMBOL(cuStreamCreate); + CUDA_LOAD_SYMBOL(cuStreamCreateWithPriority); CUDA_LOAD_SYMBOL_V2(cuStreamDestroy); CUDA_LOAD_SYMBOL(cuStreamSynchronize); diff --git a/source/nvidia/cuda/nvidia-cuda.hpp b/source/nvidia/cuda/nvidia-cuda.hpp index 1e3b2f6e..eb17dbc9 100644 --- a/source/nvidia/cuda/nvidia-cuda.hpp +++ b/source/nvidia/cuda/nvidia-cuda.hpp @@ -21,6 +21,7 @@ #include #include #include +#include "utility.hpp" #ifdef WIN32 #pragma warning(push) @@ -75,11 +76,26 @@ namespace nvidia::cuda { FLOAT = 0b00100000, }; + enum class cu_context_flags : std::uint32_t { + SCHEDULER_AUTO = 0x0, + SCHEDULER_SPIN = 0x1, + SCHEDULER_YIELD = 0x2, + SCHEDULER_BLOCKING_SYNC = 0x4, + MAP_HOST = 0x8, + LOCAL_MEMORY_RESIZE_TO_MAXIMUM = 0x10, + }; + + enum class cu_stream_flags : std::uint32_t { + DEFAULT = 0x0, + NON_BLOCKING = 0x1, + }; + typedef void* cu_array_t; typedef void* cu_context_t; typedef std::uint64_t cu_device_ptr_t; typedef void* cu_graphics_resource_t; typedef void* cu_stream_t; + typedef std::int32_t cu_device_t; struct cu_memcpy2d_t { std::size_t src_x_in_bytes; @@ -138,10 +154,10 @@ namespace nvidia::cuda { // Primary Context Management // cuDevicePrimaryCtxGetState - CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRelease, std::int32_t device); + CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRelease, cu_device_t device); // cuDevicePrimaryCtxReset_v2 - CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRetain, 
cu_context_t* ctx, std::int32_t device); - // cuDevicePrimaryCtxSetFlags_v2 + CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRetain, cu_context_t* ctx, cu_device_t device); + CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxSetFlags, cu_device_t device, cu_context_flags flags); // Context Management // cuCtxCreate_v2 @@ -153,7 +169,7 @@ namespace nvidia::cuda { // cuCtxGetFlags // cuCtxGetLimit // cuCtxGetSharedMemConfig - // cuCtxGetStreamPriorityRange + CUDA_DEFINE_FUNCTION(cuCtxGetStreamPriorityRange, std::int32_t* lowestPriority, std::int32_t* highestPriority); CUDA_DEFINE_FUNCTION(cuCtxPopCurrent, cu_context_t* ctx); CUDA_DEFINE_FUNCTION(cuCtxPushCurrent, cu_context_t ctx); // cuCtxSetCacheConfig @@ -278,8 +294,9 @@ namespace nvidia::cuda { // cuStreamAddCallback // cuStreamAttachMemAsync // cuStreamBeginCapture_v2 - CUDA_DEFINE_FUNCTION(cuStreamCreate, cu_stream_t* stream, std::uint32_t flags); - // cuStreamCreateWithPriority + CUDA_DEFINE_FUNCTION(cuStreamCreate, cu_stream_t* stream, cu_stream_flags flags); + CUDA_DEFINE_FUNCTION(cuStreamCreateWithPriority, cu_stream_t* stream, cu_stream_flags flags, + std::int32_t priority); CUDA_DEFINE_FUNCTION(cuStreamDestroy, cu_stream_t stream); // cuStreamEndCapture // cuStreamGetCaptureInfo @@ -385,3 +402,6 @@ namespace nvidia::cuda { #endif }; } // namespace nvidia::cuda + +P_ENABLE_BITMASK_OPERATORS(::nvidia::cuda::cu_context_flags) +P_ENABLE_BITMASK_OPERATORS(::nvidia::cuda::cu_stream_flags)