Merge pull request #11544 from Kelebek1/reduce_stream_buffer_renderdoc

Allow GPUs without rebar to open multiple RenderDoc captures
This commit is contained in:
liamwhite 2023-10-07 12:49:19 -04:00 committed by GitHub
commit 15a5bdd979
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 59 additions and 14 deletions

View File

@ -24,25 +24,38 @@ using namespace Common::Literals;
// Maximum potential alignment of a Vulkan buffer // Maximum potential alignment of a Vulkan buffer
constexpr VkDeviceSize MAX_ALIGNMENT = 256; constexpr VkDeviceSize MAX_ALIGNMENT = 256;
// Maximum size to put elements in the stream buffer
constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8_MiB;
// Stream buffer size in bytes // Stream buffer size in bytes
constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128_MiB; constexpr VkDeviceSize MAX_STREAM_BUFFER_SIZE = 128_MiB;
constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS;
size_t Region(size_t iterator) noexcept { size_t GetStreamBufferSize(const Device& device) {
return iterator / REGION_SIZE; VkDeviceSize size{0};
if (device.HasDebuggingToolAttached()) {
ForEachDeviceLocalHostVisibleHeap(device, [&size](size_t index, VkMemoryHeap& heap) {
size = std::max(size, heap.size);
});
// If rebar is not supported, cut the max heap size to 40%. This will allow 2 captures to be
// loaded at the same time in RenderDoc. If rebar is supported, this shouldn't be an issue
// as the heap will be much larger.
if (size <= 256_MiB) {
size = size * 40 / 100;
}
} else {
size = MAX_STREAM_BUFFER_SIZE;
}
return std::min(Common::AlignUp(size, MAX_ALIGNMENT), MAX_STREAM_BUFFER_SIZE);
} }
} // Anonymous namespace } // Anonymous namespace
StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
Scheduler& scheduler_) Scheduler& scheduler_)
: device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
stream_buffer_size{GetStreamBufferSize(device)}, region_size{stream_buffer_size /
StagingBufferPool::NUM_SYNCS} {
VkBufferCreateInfo stream_ci = { VkBufferCreateInfo stream_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr, .pNext = nullptr,
.flags = 0, .flags = 0,
.size = STREAM_BUFFER_SIZE, .size = stream_buffer_size,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE, .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
@ -63,7 +76,7 @@ StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& mem
StagingBufferPool::~StagingBufferPool() = default; StagingBufferPool::~StagingBufferPool() = default;
StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage, bool deferred) { StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage, bool deferred) {
if (!deferred && usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) { if (!deferred && usage == MemoryUsage::Upload && size <= region_size) {
return GetStreamBuffer(size); return GetStreamBuffer(size);
} }
return GetStagingBuffer(size, usage, deferred); return GetStagingBuffer(size, usage, deferred);
@ -101,7 +114,7 @@ StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
used_iterator = iterator; used_iterator = iterator;
free_iterator = std::max(free_iterator, iterator + size); free_iterator = std::max(free_iterator, iterator + size);
if (iterator + size >= STREAM_BUFFER_SIZE) { if (iterator + size >= stream_buffer_size) {
std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS, std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS,
current_tick); current_tick);
used_iterator = 0; used_iterator = 0;

View File

@ -90,6 +90,9 @@ private:
void ReleaseCache(MemoryUsage usage); void ReleaseCache(MemoryUsage usage);
void ReleaseLevel(StagingBuffersCache& cache, size_t log2); void ReleaseLevel(StagingBuffersCache& cache, size_t log2);
size_t Region(size_t iter) const noexcept {
return iter / region_size;
}
const Device& device; const Device& device;
MemoryAllocator& memory_allocator; MemoryAllocator& memory_allocator;
@ -97,6 +100,8 @@ private:
vk::Buffer stream_buffer; vk::Buffer stream_buffer;
std::span<u8> stream_pointer; std::span<u8> stream_pointer;
VkDeviceSize stream_buffer_size;
VkDeviceSize region_size;
size_t iterator = 0; size_t iterator = 0;
size_t used_iterator = 0; size_t used_iterator = 0;

View File

@ -9,6 +9,7 @@
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/common_types.h" #include "common/common_types.h"
#include "common/literals.h"
#include "common/logging/log.h" #include "common/logging/log.h"
#include "common/polyfill_ranges.h" #include "common/polyfill_ranges.h"
#include "video_core/vulkan_common/vma.h" #include "video_core/vulkan_common/vma.h"
@ -69,8 +70,7 @@ struct Range {
case MemoryUsage::Download: case MemoryUsage::Download:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; return VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
case MemoryUsage::DeviceLocal: case MemoryUsage::DeviceLocal:
return VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | return {};
VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT;
} }
return {}; return {};
} }
@ -212,7 +212,20 @@ MemoryAllocator::MemoryAllocator(const Device& device_)
: device{device_}, allocator{device.GetAllocator()}, : device{device_}, allocator{device.GetAllocator()},
properties{device_.GetPhysical().GetMemoryProperties().memoryProperties}, properties{device_.GetPhysical().GetMemoryProperties().memoryProperties},
buffer_image_granularity{ buffer_image_granularity{
device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {} device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {
// GPUs not supporting rebar may only have a region with less than 256MB host visible/device
// local memory. In that case, opening 2 RenderDoc captures side-by-side is not possible due to
// the heap running out of memory. With RenderDoc attached and only a small host/device region,
// only allow the stream buffer in this memory heap.
if (device.HasDebuggingToolAttached()) {
using namespace Common::Literals;
ForEachDeviceLocalHostVisibleHeap(device, [this](size_t index, VkMemoryHeap& heap) {
if (heap.size <= 256_MiB) {
valid_memory_types &= ~(1u << index);
}
});
}
}
MemoryAllocator::~MemoryAllocator() = default; MemoryAllocator::~MemoryAllocator() = default;
@ -244,7 +257,7 @@ vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsa
.usage = MemoryUsageVma(usage), .usage = MemoryUsageVma(usage),
.requiredFlags = 0, .requiredFlags = 0,
.preferredFlags = MemoryUsagePreferedVmaFlags(usage), .preferredFlags = MemoryUsagePreferedVmaFlags(usage),
.memoryTypeBits = 0, .memoryTypeBits = usage == MemoryUsage::Stream ? 0u : valid_memory_types,
.pool = VK_NULL_HANDLE, .pool = VK_NULL_HANDLE,
.pUserData = nullptr, .pUserData = nullptr,
.priority = 0.f, .priority = 0.f,

View File

@ -7,6 +7,7 @@
#include <span> #include <span>
#include <vector> #include <vector>
#include "common/common_types.h" #include "common/common_types.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
VK_DEFINE_HANDLE(VmaAllocator) VK_DEFINE_HANDLE(VmaAllocator)
@ -26,6 +27,18 @@ enum class MemoryUsage {
Stream, ///< Requests device local host visible buffer, falling back host memory. Stream, ///< Requests device local host visible buffer, falling back host memory.
}; };
template <typename F>
void ForEachDeviceLocalHostVisibleHeap(const Device& device, F&& f) {
auto memory_props = device.GetPhysical().GetMemoryProperties().memoryProperties;
for (size_t i = 0; i < memory_props.memoryTypeCount; i++) {
auto& memory_type = memory_props.memoryTypes[i];
if ((memory_type.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) &&
(memory_type.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
f(memory_type.heapIndex, memory_props.memoryHeaps[memory_type.heapIndex]);
}
}
}
/// Ownership handle of a memory commitment. /// Ownership handle of a memory commitment.
/// Points to a subregion of a memory allocation. /// Points to a subregion of a memory allocation.
class MemoryCommit { class MemoryCommit {
@ -124,6 +137,7 @@ private:
std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations. std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations.
VkDeviceSize buffer_image_granularity; // The granularity for adjacent offsets between buffers VkDeviceSize buffer_image_granularity; // The granularity for adjacent offsets between buffers
// and optimal images // and optimal images
u32 valid_memory_types{~0u};
}; };
} // namespace Vulkan } // namespace Vulkan