From 47d0d292d5cc5f0404e126023279db7decd532ac Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 28 Jun 2023 06:28:13 +0200 Subject: [PATCH 1/3] MemoryTracking: Initial setup of atomic writes. --- src/core/core.cpp | 26 ++++ src/core/core.h | 11 +- src/core/gpu_dirty_memory_manager.h | 112 ++++++++++++++++++ src/core/memory.cpp | 7 +- src/video_core/buffer_cache/buffer_cache.h | 26 +++- .../buffer_cache/buffer_cache_base.h | 2 + src/video_core/gpu.cpp | 4 +- .../renderer_vulkan/vk_rasterizer.cpp | 9 +- 8 files changed, 183 insertions(+), 14 deletions(-) create mode 100644 src/core/gpu_dirty_memory_manager.h diff --git a/src/core/core.cpp b/src/core/core.cpp index b74fd0a58..deefeb301 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -27,6 +27,7 @@ #include "core/file_sys/savedata_factory.h" #include "core/file_sys/vfs_concat.h" #include "core/file_sys/vfs_real.h" +#include "core/gpu_dirty_memory_manager.h" #include "core/hid/hid_core.h" #include "core/hle/kernel/k_memory_manager.h" #include "core/hle/kernel/k_process.h" @@ -54,6 +55,7 @@ #include "video_core/renderer_base.h" #include "video_core/video_core.h" + MICROPROFILE_DEFINE(ARM_CPU0, "ARM", "CPU 0", MP_RGB(255, 64, 64)); MICROPROFILE_DEFINE(ARM_CPU1, "ARM", "CPU 1", MP_RGB(255, 64, 64)); MICROPROFILE_DEFINE(ARM_CPU2, "ARM", "CPU 2", MP_RGB(255, 64, 64)); @@ -540,6 +542,9 @@ struct System::Impl { std::array dynarmic_ticks{}; std::array microprofile_cpu{}; + + std::array + gpu_dirty_memory_write_manager{}; }; System::System() : impl{std::make_unique(*this)} {} @@ -629,10 +634,31 @@ void System::PrepareReschedule(const u32 core_index) { impl->kernel.PrepareReschedule(core_index); } +Core::GPUDirtyMemoryManager& System::CurrentGPUDirtyMemoryManager() { + const std::size_t core = impl->kernel.GetCurrentHostThreadID(); + return impl->gpu_dirty_memory_write_manager[core < Core::Hardware::NUM_CPU_CORES + ? core + : Core::Hardware::NUM_CPU_CORES - 1]; +} + +/// Provides a constant reference to the current gou dirty memory manager. +const Core::GPUDirtyMemoryManager& System::CurrentGPUDirtyMemoryManager() const { + const std::size_t core = impl->kernel.GetCurrentHostThreadID(); + return impl->gpu_dirty_memory_write_manager[core < Core::Hardware::NUM_CPU_CORES + ? core + : Core::Hardware::NUM_CPU_CORES - 1]; +} + size_t System::GetCurrentHostThreadID() const { return impl->kernel.GetCurrentHostThreadID(); } +void System::GatherGPUDirtyMemory(std::function& callback) { + for (auto& manager : impl->gpu_dirty_memory_write_manager) { + manager.Gather(callback); + } +} + PerfStatsResults System::GetAndResetPerfStats() { return impl->GetAndResetPerfStats(); } diff --git a/src/core/core.h b/src/core/core.h index 93afc9303..14b2f7785 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -108,9 +108,10 @@ class CpuManager; class Debugger; class DeviceMemory; class ExclusiveMonitor; -class SpeedLimiter; +class GPUDirtyMemoryManager; class PerfStats; class Reporter; +class SpeedLimiter; class TelemetrySession; struct PerfStatsResults; @@ -225,6 +226,14 @@ public: /// Prepare the core emulation for a reschedule void PrepareReschedule(u32 core_index); + /// Provides a reference to the gou dirty memory manager. + [[nodiscard]] Core::GPUDirtyMemoryManager& CurrentGPUDirtyMemoryManager(); + + /// Provides a constant reference to the current gou dirty memory manager. + [[nodiscard]] const Core::GPUDirtyMemoryManager& CurrentGPUDirtyMemoryManager() const; + + void GatherGPUDirtyMemory(std::function& callback); + [[nodiscard]] size_t GetCurrentHostThreadID() const; /// Gets and resets core performance statistics diff --git a/src/core/gpu_dirty_memory_manager.h b/src/core/gpu_dirty_memory_manager.h new file mode 100644 index 000000000..9c3d41d11 --- /dev/null +++ b/src/core/gpu_dirty_memory_manager.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "core/memory.h" + +namespace Core { + +class GPUDirtyMemoryManager { +public: + GPUDirtyMemoryManager() : current{default_transform} {} + + ~GPUDirtyMemoryManager() = default; + + void Collect(VAddr address, size_t size) { + TransformAddress t = BuildTransform(address, size); + TransformAddress tmp, original; + do { + tmp = current.load(std::memory_order_acquire); + original = tmp; + if (tmp.address != t.address) { + if (IsValid(tmp.address)) { + std::scoped_lock lk(guard); + back_buffer.emplace_back(tmp); + current.exchange(t, std::memory_order_relaxed); + return; + } + tmp.address = t.address; + tmp.mask = 0; + } + if ((tmp.mask | t.mask) == tmp.mask) { + return; + } + tmp.mask |= t.mask; + } while (!current.compare_exchange_weak(original, tmp, std::memory_order_release, + std::memory_order_relaxed)); + } + + void Gather(std::function& callback) { + { + std::scoped_lock lk(guard); + TransformAddress t = current.exchange(default_transform, std::memory_order_relaxed); + front_buffer.swap(back_buffer); + if (IsValid(t.address)) { + front_buffer.emplace_back(t); + } + } + for (auto& transform : front_buffer) { + size_t offset = 0; + u64 mask = transform.mask; + while (mask != 0) { + const size_t empty_bits = std::countr_zero(mask); + offset += empty_bits << align_bits; + mask = mask >> empty_bits; + + const size_t continuous_bits = std::countr_one(mask); + callback((transform.address << Memory::YUZU_PAGEBITS) + offset, + continuous_bits << align_bits); + mask = continuous_bits < align_size ? (mask >> continuous_bits) : 0; + offset += continuous_bits << align_bits; + } + } + front_buffer.clear(); + } + +private: + struct alignas(16) TransformAddress { + VAddr address; + u64 mask; + }; + + constexpr static size_t align_bits = 6U; + constexpr static size_t align_size = 1U << align_bits; + constexpr static size_t align_mask = align_size - 1; + constexpr static TransformAddress default_transform = {.address = ~0ULL, .mask = 0ULL}; + + bool IsValid(VAddr address) { + return address < (1ULL << 39); + } + + template + T CreateMask(size_t top_bit, size_t minor_bit) { + T mask = ~T(0); + mask <<= (sizeof(T) * 8 - top_bit); + mask >>= (sizeof(T) * 8 - top_bit); + mask >>= minor_bit; + mask <<= minor_bit; + return mask; + } + + TransformAddress BuildTransform(VAddr address, size_t size) { + const size_t minor_address = address & Memory::YUZU_PAGEMASK; + const size_t minor_bit = minor_address >> align_bits; + const size_t top_bit = (minor_address + size + align_mask) >> align_bits; + TransformAddress result{}; + result.address = address >> Memory::YUZU_PAGEBITS; + result.mask = CreateMask(top_bit, minor_bit); + return result; + } + + std::atomic current{}; + std::mutex guard; + std::vector back_buffer; + std::vector front_buffer; +}; + +} // namespace Core diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 514ba0d66..60b246bdd 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -13,6 +13,7 @@ #include "common/swap.h" #include "core/core.h" #include "core/device_memory.h" +#include "core/gpu_dirty_memory_manager.h" #include "core/hardware_properties.h" #include "core/hle/kernel/k_page_table.h" #include "core/hle/kernel/k_process.h" @@ -678,7 +679,7 @@ struct Memory::Impl { LOG_ERROR(HW_Memory, "Unmapped Write{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8, GetInteger(vaddr), static_cast(data)); }, - [&]() { system.GPU().InvalidateRegion(GetInteger(vaddr), sizeof(T)); }); + [&]() { system.CurrentGPUDirtyMemoryManager().Collect(GetInteger(vaddr), sizeof(T)); }); if (ptr) { std::memcpy(ptr, &data, sizeof(T)); } @@ -692,7 +693,7 @@ struct Memory::Impl { LOG_ERROR(HW_Memory, "Unmapped WriteExclusive{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8, GetInteger(vaddr), static_cast(data)); }, - [&]() { system.GPU().InvalidateRegion(GetInteger(vaddr), sizeof(T)); }); + [&]() { system.CurrentGPUDirtyMemoryManager().Collect(GetInteger(vaddr), sizeof(T)); }); if (ptr) { const auto volatile_pointer = reinterpret_cast(ptr); return Common::AtomicCompareAndSwap(volatile_pointer, data, expected); @@ -707,7 +708,7 @@ struct Memory::Impl { LOG_ERROR(HW_Memory, "Unmapped WriteExclusive128 @ 0x{:016X} = 0x{:016X}{:016X}", GetInteger(vaddr), static_cast(data[1]), static_cast(data[0])); }, - [&]() { system.GPU().InvalidateRegion(GetInteger(vaddr), sizeof(u128)); }); + [&]() { system.CurrentGPUDirtyMemoryManager().Collect(GetInteger(vaddr), sizeof(u128)); }); if (ptr) { const auto volatile_pointer = reinterpret_cast(ptr); return Common::AtomicCompareAndSwap(volatile_pointer, data, expected); diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 58a45ab67..9239ad862 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -115,7 +115,21 @@ void BufferCache

::WriteMemory(VAddr cpu_addr, u64 size) { template void BufferCache

::CachedWriteMemory(VAddr cpu_addr, u64 size) { - memory_tracker.CachedCpuWrite(cpu_addr, size); + const bool is_dirty = IsRegionRegistered(cpu_addr, size); + if (!is_dirty) { + return; + } + VAddr aligned_start = Common::AlignDown(cpu_addr, YUZU_PAGESIZE); + VAddr aligned_end = Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE); + if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) { + WriteMemory(cpu_addr, size); + return; + } + + tmp_buffer.resize_destructive(size); + cpu_memory.ReadBlockUnsafe(cpu_addr, tmp_buffer.data(), size); + + InlineMemoryImplementation(cpu_addr, size, tmp_buffer); } template @@ -1553,6 +1567,14 @@ bool BufferCache

::InlineMemory(VAddr dest_address, size_t copy_size, return false; } + InlineMemoryImplementation(dest_address, copy_size, inlined_buffer); + + return true; +} + +template +void BufferCache

::InlineMemoryImplementation(VAddr dest_address, size_t copy_size, + std::span inlined_buffer) { const IntervalType subtract_interval{dest_address, dest_address + copy_size}; ClearDownload(subtract_interval); common_ranges.subtract(subtract_interval); @@ -1574,8 +1596,6 @@ bool BufferCache

::InlineMemory(VAddr dest_address, size_t copy_size, } else { buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer.first(copy_size)); } - - return true; } template diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index fe6068cfe..4d9bab7f7 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -543,6 +543,8 @@ private: void ClearDownload(IntervalType subtract_interval); + void InlineMemoryImplementation(VAddr dest_address, size_t copy_size, std::span inlined_buffer); + VideoCore::RasterizerInterface& rasterizer; Core::Memory::Memory& cpu_memory; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index db385076d..f823a1e2b 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -95,7 +95,9 @@ struct GPU::Impl { /// Synchronizes CPU writes with Host GPU memory. void InvalidateGPUCache() { - rasterizer->InvalidateGPUCache(); + std::function callback_writes( + [this](VAddr address, size_t size) { rasterizer->OnCPUWrite(address, size); }); + system.GatherGPUDirtyMemory(callback_writes); } /// Signal the ending of command list. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index f7c0d939a..a63a29e61 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -570,7 +570,7 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - pipeline_cache.OnCPUWrite(addr, size); + { std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); @@ -579,14 +579,11 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.CachedWriteMemory(addr, size); } + pipeline_cache.InvalidateRegion(addr, size); } void RasterizerVulkan::InvalidateGPUCache() { - pipeline_cache.SyncGuestHost(); - { - std::scoped_lock lock{buffer_cache.mutex}; - buffer_cache.FlushCachedWrites(); - } + gpu.InvalidateGPUCache(); } void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { From da440da9f54cc860f3c69da685a415d5ec9d7b64 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Wed, 28 Jun 2023 19:32:50 +0200 Subject: [PATCH 2/3] Memory Tracking: Optimize tracking to only use atomic writes when contested with the host GPU --- src/core/core.cpp | 8 +++- src/core/gpu_dirty_memory_manager.h | 14 +++++-- src/core/memory.cpp | 39 ++++++++++++++++--- src/core/memory.h | 6 ++- src/video_core/buffer_cache/buffer_cache.h | 15 ++++++- .../buffer_cache/buffer_cache_base.h | 5 ++- src/video_core/fence_manager.h | 2 +- src/video_core/gpu.cpp | 10 ++++- src/video_core/gpu.h | 4 ++ src/video_core/gpu_thread.cpp | 6 +-- src/video_core/rasterizer_interface.h | 4 +- .../renderer_null/null_rasterizer.cpp | 5 ++- .../renderer_null/null_rasterizer.h | 3 +- .../renderer_opengl/gl_rasterizer.cpp | 35 ++++++++++++----- .../renderer_opengl/gl_rasterizer.h | 3 +- .../renderer_vulkan/vk_rasterizer.cpp | 25 +++++++++++- .../renderer_vulkan/vk_rasterizer.h | 3 +- src/video_core/shader_cache.cpp | 2 +- src/video_core/shader_cache.h | 2 +- 19 files changed, 153 insertions(+), 38 deletions(-) diff --git a/src/core/core.cpp b/src/core/core.cpp index deefeb301..9e3eb3795 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -55,7 +55,6 @@ #include "video_core/renderer_base.h" #include "video_core/video_core.h" - MICROPROFILE_DEFINE(ARM_CPU0, "ARM", "CPU 0", MP_RGB(255, 64, 64)); MICROPROFILE_DEFINE(ARM_CPU1, "ARM", "CPU 1", MP_RGB(255, 64, 64)); MICROPROFILE_DEFINE(ARM_CPU2, "ARM", "CPU 2", MP_RGB(255, 64, 64)); @@ -132,7 +131,10 @@ FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs, struct System::Impl { explicit Impl(System& system) : kernel{system}, fs_controller{system}, memory{system}, hid_core{}, room_network{}, - cpu_manager{system}, reporter{system}, applet_manager{system}, time_manager{system} {} + cpu_manager{system}, reporter{system}, applet_manager{system}, time_manager{system}, + gpu_dirty_memory_write_manager{} { + memory.SetGPUDirtyManagers(gpu_dirty_memory_write_manager); + } void Initialize(System& system) { device_memory = std::make_unique(); @@ -236,6 +238,8 @@ struct System::Impl { // Setting changes may require a full system reinitialization (e.g., disabling multicore). ReinitializeIfNecessary(system); + memory.SetGPUDirtyManagers(gpu_dirty_memory_write_manager); + kernel.Initialize(); cpu_manager.Initialize(); diff --git a/src/core/gpu_dirty_memory_manager.h b/src/core/gpu_dirty_memory_manager.h index 9c3d41d11..789b7530f 100644 --- a/src/core/gpu_dirty_memory_manager.h +++ b/src/core/gpu_dirty_memory_manager.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + #pragma once #include @@ -59,8 +62,7 @@ public: mask = mask >> empty_bits; const size_t continuous_bits = std::countr_one(mask); - callback((transform.address << Memory::YUZU_PAGEBITS) + offset, - continuous_bits << align_bits); + callback((transform.address << page_bits) + offset, continuous_bits << align_bits); mask = continuous_bits < align_size ? (mask >> continuous_bits) : 0; offset += continuous_bits << align_bits; } @@ -74,6 +76,10 @@ private: u64 mask; }; + constexpr static size_t page_bits = Memory::YUZU_PAGEBITS; + constexpr static size_t page_size = 1ULL << page_bits; + constexpr static size_t page_mask = page_size - 1; + constexpr static size_t align_bits = 6U; constexpr static size_t align_size = 1U << align_bits; constexpr static size_t align_mask = align_size - 1; @@ -94,11 +100,11 @@ private: } TransformAddress BuildTransform(VAddr address, size_t size) { - const size_t minor_address = address & Memory::YUZU_PAGEMASK; + const size_t minor_address = address & page_mask; const size_t minor_bit = minor_address >> align_bits; const size_t top_bit = (minor_address + size + align_mask) >> align_bits; TransformAddress result{}; - result.address = address >> Memory::YUZU_PAGEBITS; + result.address = address >> page_bits; result.mask = CreateMask(top_bit, minor_bit); return result; } diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 60b246bdd..257406f09 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -3,6 +3,7 @@ #include #include +#include #include "common/assert.h" #include "common/atomic_ops.h" @@ -679,7 +680,7 @@ struct Memory::Impl { LOG_ERROR(HW_Memory, "Unmapped Write{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8, GetInteger(vaddr), static_cast(data)); }, - [&]() { system.CurrentGPUDirtyMemoryManager().Collect(GetInteger(vaddr), sizeof(T)); }); + [&]() { HandleRasterizerWrite(GetInteger(vaddr), sizeof(T)); }); if (ptr) { std::memcpy(ptr, &data, sizeof(T)); } @@ -693,7 +694,7 @@ struct Memory::Impl { LOG_ERROR(HW_Memory, "Unmapped WriteExclusive{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8, GetInteger(vaddr), static_cast(data)); }, - [&]() { system.CurrentGPUDirtyMemoryManager().Collect(GetInteger(vaddr), sizeof(T)); }); + [&]() { HandleRasterizerWrite(GetInteger(vaddr), sizeof(T)); }); if (ptr) { const auto volatile_pointer = reinterpret_cast(ptr); return Common::AtomicCompareAndSwap(volatile_pointer, data, expected); @@ -708,7 +709,7 @@ struct Memory::Impl { LOG_ERROR(HW_Memory, "Unmapped WriteExclusive128 @ 0x{:016X} = 0x{:016X}{:016X}", GetInteger(vaddr), static_cast(data[1]), static_cast(data[0])); }, - [&]() { system.CurrentGPUDirtyMemoryManager().Collect(GetInteger(vaddr), sizeof(u128)); }); + [&]() { HandleRasterizerWrite(GetInteger(vaddr), sizeof(u128)); }); if (ptr) { const auto volatile_pointer = reinterpret_cast(ptr); return Common::AtomicCompareAndSwap(volatile_pointer, data, expected); @@ -718,7 +719,7 @@ struct Memory::Impl { void HandleRasterizerDownload(VAddr address, size_t size) { const size_t core = system.GetCurrentHostThreadID(); - auto& current_area = rasterizer_areas[core]; + auto& current_area = rasterizer_read_areas[core]; const VAddr end_address = address + size; if (current_area.start_address <= address && end_address <= current_area.end_address) [[likely]] { @@ -727,9 +728,31 @@ struct Memory::Impl { current_area = system.GPU().OnCPURead(address, size); } - Common::PageTable* current_page_table = nullptr; - std::array rasterizer_areas{}; + void HandleRasterizerWrite(VAddr address, size_t size) { + const size_t core = system.GetCurrentHostThreadID(); + auto& current_area = rasterizer_write_areas[core]; + VAddr subaddress = address >> YUZU_PAGEBITS; + bool do_collection = current_area.last_address == subaddress; + if (!do_collection) [[unlikely]] { + do_collection = system.GPU().OnCPUWrite(address, size); + if (!do_collection) { + return; + } + current_area.last_address = subaddress; + } + gpu_dirty_managers[core].Collect(address, size); + } + + struct GPUDirtyState { + VAddr last_address; + }; + Core::System& system; + Common::PageTable* current_page_table = nullptr; + std::array + rasterizer_read_areas{}; + std::array rasterizer_write_areas{}; + std::span gpu_dirty_managers; }; Memory::Memory(Core::System& system_) : system{system_} { @@ -877,6 +900,10 @@ void Memory::ZeroBlock(Common::ProcessAddress dest_addr, const std::size_t size) impl->ZeroBlock(*system.ApplicationProcess(), dest_addr, size); } +void Memory::SetGPUDirtyManagers(std::span managers) { + impl->gpu_dirty_managers = managers; +} + Result Memory::InvalidateDataCache(Common::ProcessAddress dest_addr, const std::size_t size) { return impl->InvalidateDataCache(*system.ApplicationProcess(), dest_addr, size); } diff --git a/src/core/memory.h b/src/core/memory.h index 72a0be813..ea01824f8 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -5,6 +5,7 @@ #include #include +#include #include #include "common/typed_address.h" #include "core/hle/result.h" @@ -15,7 +16,8 @@ struct PageTable; namespace Core { class System; -} +class GPUDirtyMemoryManager; +} // namespace Core namespace Kernel { class PhysicalMemory; @@ -458,6 +460,8 @@ public: */ void MarkRegionDebug(Common::ProcessAddress vaddr, u64 size, bool debug); + void SetGPUDirtyManagers(std::span managers); + private: Core::System& system; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 9239ad862..b5ed3380f 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -132,6 +132,19 @@ void BufferCache

::CachedWriteMemory(VAddr cpu_addr, u64 size) { InlineMemoryImplementation(cpu_addr, size, tmp_buffer); } +template +bool BufferCache

::OnCPUWrite(VAddr cpu_addr, u64 size) { + const bool is_dirty = IsRegionRegistered(cpu_addr, size); + if (!is_dirty) { + return false; + } + if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) { + return true; + } + WriteMemory(cpu_addr, size); + return false; +} + template std::optional BufferCache

::GetFlushArea(VAddr cpu_addr, u64 size) { @@ -1574,7 +1587,7 @@ bool BufferCache

::InlineMemory(VAddr dest_address, size_t copy_size, template void BufferCache

::InlineMemoryImplementation(VAddr dest_address, size_t copy_size, - std::span inlined_buffer) { + std::span inlined_buffer) { const IntervalType subtract_interval{dest_address, dest_address + copy_size}; ClearDownload(subtract_interval); common_ranges.subtract(subtract_interval); diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 4d9bab7f7..460fc7551 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -245,6 +245,8 @@ public: void CachedWriteMemory(VAddr cpu_addr, u64 size); + bool OnCPUWrite(VAddr cpu_addr, u64 size); + void DownloadMemory(VAddr cpu_addr, u64 size); std::optional GetFlushArea(VAddr cpu_addr, u64 size); @@ -543,7 +545,8 @@ private: void ClearDownload(IntervalType subtract_interval); - void InlineMemoryImplementation(VAddr dest_address, size_t copy_size, std::span inlined_buffer); + void InlineMemoryImplementation(VAddr dest_address, size_t copy_size, + std::span inlined_buffer); VideoCore::RasterizerInterface& rasterizer; Core::Memory::Memory& cpu_memory; diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 35d699bbf..ab20ff30f 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -69,7 +69,6 @@ public: } void SignalFence(std::function&& func) { - rasterizer.InvalidateGPUCache(); bool delay_fence = Settings::IsGPULevelHigh(); if constexpr (!can_async_check) { TryReleasePendingFences(); @@ -96,6 +95,7 @@ public: guard.unlock(); cv.notify_all(); } + rasterizer.InvalidateGPUCache(); } void SignalSyncPoint(u32 value) { diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index f823a1e2b..c192e33b2 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -96,7 +96,7 @@ struct GPU::Impl { /// Synchronizes CPU writes with Host GPU memory. void InvalidateGPUCache() { std::function callback_writes( - [this](VAddr address, size_t size) { rasterizer->OnCPUWrite(address, size); }); + [this](VAddr address, size_t size) { rasterizer->OnCacheInvalidation(address, size); }); system.GatherGPUDirtyMemory(callback_writes); } @@ -301,6 +301,10 @@ struct GPU::Impl { gpu_thread.InvalidateRegion(addr, size); } + bool OnCPUWrite(VAddr addr, u64 size) { + return rasterizer->OnCPUWrite(addr, size); + } + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size) { gpu_thread.FlushAndInvalidateRegion(addr, size); @@ -563,6 +567,10 @@ void GPU::InvalidateRegion(VAddr addr, u64 size) { impl->InvalidateRegion(addr, size); } +bool GPU::OnCPUWrite(VAddr addr, u64 size) { + return impl->OnCPUWrite(addr, size); +} + void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { impl->FlushAndInvalidateRegion(addr, size); } diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index e49c40cf2..ba2838b89 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -250,6 +250,10 @@ public: /// Notify rasterizer that any caches of the specified region should be invalidated void InvalidateRegion(VAddr addr, u64 size); + /// Notify rasterizer that CPU is trying to write this area. It returns true if the area is + /// sensible, false otherwise + bool OnCPUWrite(VAddr addr, u64 size); + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size); diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 889144f38..2f0f9f593 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -47,7 +47,7 @@ static void RunThread(std::stop_token stop_token, Core::System& system, } else if (const auto* flush = std::get_if(&next.data)) { rasterizer->FlushRegion(flush->addr, flush->size); } else if (const auto* invalidate = std::get_if(&next.data)) { - rasterizer->OnCPUWrite(invalidate->addr, invalidate->size); + rasterizer->OnCacheInvalidation(invalidate->addr, invalidate->size); } else { ASSERT(false); } @@ -102,12 +102,12 @@ void ThreadManager::TickGPU() { } void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { - rasterizer->OnCPUWrite(addr, size); + rasterizer->OnCacheInvalidation(addr, size); } void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important - rasterizer->OnCPUWrite(addr, size); + rasterizer->OnCacheInvalidation(addr, size); } u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) { diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 7566a8c4e..cb8029a4f 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -109,7 +109,9 @@ public: } /// Notify rasterizer that any caches of the specified region are desync with guest - virtual void OnCPUWrite(VAddr addr, u64 size) = 0; + virtual void OnCacheInvalidation(VAddr addr, u64 size) = 0; + + virtual bool OnCPUWrite(VAddr addr, u64 size) = 0; /// Sync memory between guest and host. virtual void InvalidateGPUCache() = 0; diff --git a/src/video_core/renderer_null/null_rasterizer.cpp b/src/video_core/renderer_null/null_rasterizer.cpp index bf2ce4c49..92ecf6682 100644 --- a/src/video_core/renderer_null/null_rasterizer.cpp +++ b/src/video_core/renderer_null/null_rasterizer.cpp @@ -47,7 +47,10 @@ bool RasterizerNull::MustFlushRegion(VAddr addr, u64 size, VideoCommon::CacheTyp return false; } void RasterizerNull::InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} -void RasterizerNull::OnCPUWrite(VAddr addr, u64 size) {} +bool RasterizerNull::OnCPUWrite(VAddr addr, u64 size) { + return false; +} +void RasterizerNull::OnCacheInvalidation(VAddr addr, u64 size) {} VideoCore::RasterizerDownloadArea RasterizerNull::GetFlushArea(VAddr addr, u64 size) { VideoCore::RasterizerDownloadArea new_area{ .start_address = Common::AlignDown(addr, Core::Memory::YUZU_PAGESIZE), diff --git a/src/video_core/renderer_null/null_rasterizer.h b/src/video_core/renderer_null/null_rasterizer.h index a8d35d2c1..93b9a6971 100644 --- a/src/video_core/renderer_null/null_rasterizer.h +++ b/src/video_core/renderer_null/null_rasterizer.h @@ -53,7 +53,8 @@ public: VideoCommon::CacheType which = VideoCommon::CacheType::All) override; void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; - void OnCPUWrite(VAddr addr, u64 size) override; + void OnCacheInvalidation(VAddr addr, u64 size) override; + bool OnCPUWrite(VAddr addr, u64 size) override; VideoCore::RasterizerDownloadArea GetFlushArea(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index edf527f2d..aadd6967c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -485,12 +485,33 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache } } -void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { +bool RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { + MICROPROFILE_SCOPE(OpenGL_CacheManagement); + if (addr == 0 || size == 0) { + return false; + } + + { + std::scoped_lock lock{buffer_cache.mutex}; + if (buffer_cache.OnCPUWrite(addr, size)) { + return true; + } + } + + { + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.WriteMemory(addr, size); + } + + shader_cache.InvalidateRegion(addr, size); + return false; +} + +void RasterizerOpenGL::OnCacheInvalidation(VAddr addr, u64 size) { MICROPROFILE_SCOPE(OpenGL_CacheManagement); if (addr == 0 || size == 0) { return; } - shader_cache.OnCPUWrite(addr, size); { std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); @@ -499,15 +520,11 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.CachedWriteMemory(addr, size); } + shader_cache.InvalidateRegion(addr, size); } void RasterizerOpenGL::InvalidateGPUCache() { - MICROPROFILE_SCOPE(OpenGL_CacheManagement); - shader_cache.SyncGuestHost(); - { - std::scoped_lock lock{buffer_cache.mutex}; - buffer_cache.FlushCachedWrites(); - } + gpu.InvalidateGPUCache(); } void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { @@ -519,7 +536,7 @@ void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { std::scoped_lock lock{buffer_cache.mutex}; buffer_cache.WriteMemory(addr, size); } - shader_cache.OnCPUWrite(addr, size); + shader_cache.OnCacheInvalidation(addr, size); } void RasterizerOpenGL::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index a73ad15c1..8eda2ddba 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -98,7 +98,8 @@ public: VideoCore::RasterizerDownloadArea GetFlushArea(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; - void OnCPUWrite(VAddr addr, u64 size) override; + void OnCacheInvalidation(VAddr addr, u64 size) override; + bool OnCPUWrite(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override; void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index a63a29e61..456bb040e 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -566,7 +566,28 @@ void RasterizerVulkan::InnerInvalidation(std::span> sequences) override; - void OnCPUWrite(VAddr addr, u64 size) override; + void OnCacheInvalidation(VAddr addr, u64 size) override; + bool OnCPUWrite(VAddr addr, u64 size) override; void InvalidateGPUCache() override; void UnmapMemory(VAddr addr, u64 size) override; void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index 4db948b6d..01701201d 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -24,7 +24,7 @@ void ShaderCache::InvalidateRegion(VAddr addr, size_t size) { RemovePendingShaders(); } -void ShaderCache::OnCPUWrite(VAddr addr, size_t size) { +void ShaderCache::OnCacheInvalidation(VAddr addr, size_t size) { std::scoped_lock lock{invalidation_mutex}; InvalidatePagesInRegion(addr, size); } diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h index f3cc4c70b..de8e08002 100644 --- a/src/video_core/shader_cache.h +++ b/src/video_core/shader_cache.h @@ -62,7 +62,7 @@ public: /// @brief Unmarks a memory region as cached and marks it for removal /// @param addr Start address of the CPU write operation /// @param size Number of bytes of the CPU write operation - void OnCPUWrite(VAddr addr, size_t size); + void OnCacheInvalidation(VAddr addr, size_t size); /// @brief Flushes delayed removal operations void SyncGuestHost(); From 0e6b559c98e3dee54c3c9eaef2d3e59f3871882d Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 29 Jun 2023 12:24:56 +0200 Subject: [PATCH 3/3] Memory Tracker: Use 64 bit atomics instead of 128 bits --- src/core/gpu_dirty_memory_manager.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/core/gpu_dirty_memory_manager.h b/src/core/gpu_dirty_memory_manager.h index 789b7530f..9687531e8 100644 --- a/src/core/gpu_dirty_memory_manager.h +++ b/src/core/gpu_dirty_memory_manager.h @@ -16,7 +16,10 @@ namespace Core { class GPUDirtyMemoryManager { public: - GPUDirtyMemoryManager() : current{default_transform} {} + GPUDirtyMemoryManager() : current{default_transform} { + back_buffer.reserve(256); + front_buffer.reserve(256); + } ~GPUDirtyMemoryManager() = default; @@ -62,7 +65,8 @@ public: mask = mask >> empty_bits; const size_t continuous_bits = std::countr_one(mask); - callback((transform.address << page_bits) + offset, continuous_bits << align_bits); + callback((static_cast(transform.address) << page_bits) + offset, + continuous_bits << align_bits); mask = continuous_bits < align_size ? (mask >> continuous_bits) : 0; offset += continuous_bits << align_bits; } @@ -71,19 +75,19 @@ public: } private: - struct alignas(16) TransformAddress { - VAddr address; - u64 mask; + struct alignas(8) TransformAddress { + u32 address; + u32 mask; }; - constexpr static size_t page_bits = Memory::YUZU_PAGEBITS; + constexpr static size_t page_bits = Memory::YUZU_PAGEBITS - 1; constexpr static size_t page_size = 1ULL << page_bits; constexpr static size_t page_mask = page_size - 1; constexpr static size_t align_bits = 6U; constexpr static size_t align_size = 1U << align_bits; constexpr static size_t align_mask = align_size - 1; - constexpr static TransformAddress default_transform = {.address = ~0ULL, .mask = 0ULL}; + constexpr static TransformAddress default_transform = {.address = ~0U, .mask = 0U}; bool IsValid(VAddr address) { return address < (1ULL << 39); @@ -104,8 +108,8 @@ private: const size_t minor_bit = minor_address >> align_bits; const size_t top_bit = (minor_address + size + align_mask) >> align_bits; TransformAddress result{}; - result.address = address >> page_bits; - result.mask = CreateMask(top_bit, minor_bit); + result.address = static_cast(address >> page_bits); + result.mask = CreateMask(top_bit, minor_bit); return result; }