From 4a571b3057aae23657a7bfe0ac8505b0b7cfae98 Mon Sep 17 00:00:00 2001 From: pineappleEA Date: Sun, 23 Apr 2023 14:05:41 +0200 Subject: [PATCH] early-access version 3524 --- README.md | 2 +- src/audio_core/renderer/adsp/adsp.cpp | 1 - .../renderer/adsp/audio_renderer.cpp | 5 +- .../renderer/adsp/command_list_processor.cpp | 1 - .../command/performance/performance.cpp | 15 +- src/audio_core/sink/sink_stream.cpp | 1 - src/common/CMakeLists.txt | 2 + src/common/steady_clock.cpp | 5 +- src/common/wall_clock.cpp | 73 +- src/common/wall_clock.h | 69 +- src/common/x64/cpu_detect.cpp | 3 + src/common/x64/cpu_wait.cpp | 20 +- src/common/x64/native_clock.cpp | 165 +-- src/common/x64/native_clock.h | 56 +- src/common/x64/rdtsc.cpp | 39 + src/common/x64/rdtsc.h | 37 + src/core/CMakeLists.txt | 1 - src/core/core_timing.cpp | 35 +- src/core/core_timing.h | 11 +- src/core/hle/kernel/k_scheduler.cpp | 5 +- src/core/hle/kernel/svc/svc_info.cpp | 4 +- src/core/hle/service/hid/hidbus.cpp | 1 - .../nvnflinger/buffer_queue_consumer.cpp | 32 +- .../service/nvnflinger/buffer_queue_core.cpp | 4 + src/core/hle/service/nvnflinger/buffer_slot.h | 1 + .../hle/service/nvnflinger/nvnflinger.cpp | 12 +- src/core/hle/service/time/clock_types.h | 13 +- .../time/standard_steady_clock_core.cpp | 2 +- .../time/tick_based_steady_clock_core.cpp | 2 +- src/core/hle/service/time/time.cpp | 4 +- .../hle/service/time/time_sharedmemory.cpp | 5 +- src/tests/CMakeLists.txt | 2 +- src/tests/video_core/memory_tracker.cpp | 547 ++++++++++ src/video_core/CMakeLists.txt | 5 + src/video_core/buffer_cache/buffer_base.h | 459 +-------- src/video_core/buffer_cache/buffer_cache.cpp | 4 +- src/video_core/buffer_cache/buffer_cache.h | 974 ++++++------------ .../buffer_cache/buffer_cache_base.h | 581 +++++++++++ .../buffer_cache/memory_tracker_base.h | 266 +++++ src/video_core/buffer_cache/word_manager.h | 476 +++++++++ src/video_core/engines/maxwell_3d.cpp | 2 +- src/video_core/gpu.cpp | 14 +- .../renderer_opengl/gl_buffer_cache.h | 4 + .../renderer_opengl/gl_buffer_cache_base.cpp | 9 + .../renderer_vulkan/vk_buffer_cache.cpp | 8 +- .../renderer_vulkan/vk_buffer_cache.h | 10 +- .../renderer_vulkan/vk_buffer_cache_base.cpp | 9 + src/video_core/texture_cache/texture_cache.h | 26 +- .../texture_cache/texture_cache_base.h | 5 - 49 files changed, 2542 insertions(+), 1485 deletions(-) create mode 100755 src/common/x64/rdtsc.cpp create mode 100755 src/common/x64/rdtsc.h create mode 100755 src/tests/video_core/memory_tracker.cpp create mode 100755 src/video_core/buffer_cache/buffer_cache_base.h create mode 100755 src/video_core/buffer_cache/memory_tracker_base.h create mode 100755 src/video_core/buffer_cache/word_manager.h create mode 100755 src/video_core/renderer_opengl/gl_buffer_cache_base.cpp create mode 100755 src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp diff --git a/README.md b/README.md index 66424126c..3d93744d7 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 3520. +This is the source code for early-access 3524. ## Legal Notice diff --git a/src/audio_core/renderer/adsp/adsp.cpp b/src/audio_core/renderer/adsp/adsp.cpp index c67e3ee06..ca845ea9e 100755 --- a/src/audio_core/renderer/adsp/adsp.cpp +++ b/src/audio_core/renderer/adsp/adsp.cpp @@ -7,7 +7,6 @@ #include "common/logging/log.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" #include "core/memory.h" namespace AudioCore::AudioRenderer::ADSP { diff --git a/src/audio_core/renderer/adsp/audio_renderer.cpp b/src/audio_core/renderer/adsp/audio_renderer.cpp index 2e5558f09..fcf49b299 100755 --- a/src/audio_core/renderer/adsp/audio_renderer.cpp +++ b/src/audio_core/renderer/adsp/audio_renderer.cpp @@ -13,7 +13,6 @@ #include "common/thread.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" MICROPROFILE_DEFINE(Audio_Renderer, "Audio", "DSP", MP_RGB(60, 19, 97)); @@ -144,6 +143,7 @@ void AudioRenderer::ThreadFunc() { mailbox->ADSPSendMessage(RenderMessage::AudioRenderer_InitializeOK); + // 0.12 seconds (2304000 / 19200000) constexpr u64 max_process_time{2'304'000ULL}; while (true) { @@ -179,8 +179,7 @@ void AudioRenderer::ThreadFunc() { u64 max_time{max_process_time}; if (index == 1 && command_buffer.applet_resource_user_id == mailbox->GetCommandBuffer(0).applet_resource_user_id) { - max_time = max_process_time - - Core::Timing::CyclesToNs(render_times_taken[0]).count(); + max_time = max_process_time - render_times_taken[0]; if (render_times_taken[0] > max_process_time) { max_time = 0; } diff --git a/src/audio_core/renderer/adsp/command_list_processor.cpp b/src/audio_core/renderer/adsp/command_list_processor.cpp index a83f05ad1..544ae4354 100755 --- a/src/audio_core/renderer/adsp/command_list_processor.cpp +++ b/src/audio_core/renderer/adsp/command_list_processor.cpp @@ -9,7 +9,6 @@ #include "common/settings.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" #include "core/memory.h" namespace AudioCore::AudioRenderer::ADSP { diff --git a/src/audio_core/renderer/command/performance/performance.cpp b/src/audio_core/renderer/command/performance/performance.cpp index 14bc43293..b31880de3 100755 --- a/src/audio_core/renderer/command/performance/performance.cpp +++ b/src/audio_core/renderer/command/performance/performance.cpp @@ -5,7 +5,6 @@ #include "audio_core/renderer/command/performance/performance.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" namespace AudioCore::AudioRenderer { @@ -18,20 +17,18 @@ void PerformanceCommand::Process(const ADSP::CommandListProcessor& processor) { auto base{entry_address.translated_address}; if (state == PerformanceState::Start) { auto start_time_ptr{reinterpret_cast(base + entry_address.entry_start_time_offset)}; - *start_time_ptr = static_cast( - Core::Timing::CyclesToUs(processor.system->CoreTiming().GetClockTicks() - - processor.start_time - processor.current_processing_time) - .count()); + *start_time_ptr = + static_cast(processor.system->CoreTiming().GetClockTicks() - processor.start_time - + processor.current_processing_time); } else if (state == PerformanceState::Stop) { auto processed_time_ptr{ reinterpret_cast(base + entry_address.entry_processed_time_offset)}; auto entry_count_ptr{ reinterpret_cast(base + entry_address.header_entry_count_offset)}; - *processed_time_ptr = static_cast( - Core::Timing::CyclesToUs(processor.system->CoreTiming().GetClockTicks() - - processor.start_time - processor.current_processing_time) - .count()); + *processed_time_ptr = + static_cast(processor.system->CoreTiming().GetClockTicks() - processor.start_time - + processor.current_processing_time); (*entry_count_ptr)++; } } diff --git a/src/audio_core/sink/sink_stream.cpp b/src/audio_core/sink/sink_stream.cpp index fd33389fe..d57be6189 100755 --- a/src/audio_core/sink/sink_stream.cpp +++ b/src/audio_core/sink/sink_stream.cpp @@ -15,7 +15,6 @@ #include "common/settings.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" namespace AudioCore::Sink { diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 3db47004b..94c983c66 100755 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -164,6 +164,8 @@ if(ARCHITECTURE_x86_64) x64/cpu_wait.h x64/native_clock.cpp x64/native_clock.h + x64/rdtsc.cpp + x64/rdtsc.h x64/xbyak_abi.h x64/xbyak_util.h ) diff --git a/src/common/steady_clock.cpp b/src/common/steady_clock.cpp index 782859196..9415eed29 100755 --- a/src/common/steady_clock.cpp +++ b/src/common/steady_clock.cpp @@ -28,13 +28,12 @@ static s64 GetSystemTimeNS() { // GetSystemTimePreciseAsFileTime returns the file time in 100ns units. static constexpr s64 Multiplier = 100; // Convert Windows epoch to Unix epoch. - static constexpr s64 WindowsEpochToUnixEpochNS = 0x19DB1DED53E8000LL; + static constexpr s64 WindowsEpochToUnixEpoch = 0x19DB1DED53E8000LL; FILETIME filetime; GetSystemTimePreciseAsFileTime(&filetime); return Multiplier * ((static_cast(filetime.dwHighDateTime) << 32) + - static_cast(filetime.dwLowDateTime)) - - WindowsEpochToUnixEpochNS; + static_cast(filetime.dwLowDateTime) - WindowsEpochToUnixEpoch); } #endif diff --git a/src/common/wall_clock.cpp b/src/common/wall_clock.cpp index 2d906e900..b7bb3e2c4 100755 --- a/src/common/wall_clock.cpp +++ b/src/common/wall_clock.cpp @@ -2,88 +2,71 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/steady_clock.h" -#include "common/uint128.h" #include "common/wall_clock.h" #ifdef ARCHITECTURE_x86_64 #include "common/x64/cpu_detect.h" #include "common/x64/native_clock.h" +#include "common/x64/rdtsc.h" #endif namespace Common { class StandardWallClock final : public WallClock { public: - explicit StandardWallClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_) - : WallClock{emulated_cpu_frequency_, emulated_clock_frequency_, false}, - start_time{SteadyClock::Now()} {} + explicit StandardWallClock() : start_time{SteadyClock::Now()} {} - std::chrono::nanoseconds GetTimeNS() override { + std::chrono::nanoseconds GetTimeNS() const override { return SteadyClock::Now() - start_time; } - std::chrono::microseconds GetTimeUS() override { - return std::chrono::duration_cast(GetTimeNS()); + std::chrono::microseconds GetTimeUS() const override { + return static_cast(GetHostTicksElapsed() / NsToUsRatio::den); } - std::chrono::milliseconds GetTimeMS() override { - return std::chrono::duration_cast(GetTimeNS()); + std::chrono::milliseconds GetTimeMS() const override { + return static_cast(GetHostTicksElapsed() / NsToMsRatio::den); } - u64 GetClockCycles() override { - const u128 temp = Common::Multiply64Into128(GetTimeNS().count(), emulated_clock_frequency); - return Common::Divide128On32(temp, NS_RATIO).first; + u64 GetCNTPCT() const override { + return GetHostTicksElapsed() * NsToCNTPCTRatio::num / NsToCNTPCTRatio::den; } - u64 GetCPUCycles() override { - const u128 temp = Common::Multiply64Into128(GetTimeNS().count(), emulated_cpu_frequency); - return Common::Divide128On32(temp, NS_RATIO).first; + u64 GetHostTicksNow() const override { + return static_cast(SteadyClock::Now().time_since_epoch().count()); } - void Pause([[maybe_unused]] bool is_paused) override { - // Do nothing in this clock type. + u64 GetHostTicksElapsed() const override { + return static_cast(GetTimeNS().count()); + } + + bool IsNative() const override { + return false; } private: SteadyClock::time_point start_time; }; +std::unique_ptr CreateOptimalClock() { #ifdef ARCHITECTURE_x86_64 - -std::unique_ptr CreateBestMatchingClock(u64 emulated_cpu_frequency, - u64 emulated_clock_frequency) { const auto& caps = GetCPUCaps(); - u64 rtsc_frequency = 0; - if (caps.invariant_tsc) { - rtsc_frequency = caps.tsc_frequency ? caps.tsc_frequency : EstimateRDTSCFrequency(); - } - // Fallback to StandardWallClock if the hardware TSC does not have the precision greater than: - // - A nanosecond - // - The emulated CPU frequency - // - The emulated clock counter frequency (CNTFRQ) - if (rtsc_frequency <= WallClock::NS_RATIO || rtsc_frequency <= emulated_cpu_frequency || - rtsc_frequency <= emulated_clock_frequency) { - return std::make_unique(emulated_cpu_frequency, - emulated_clock_frequency); + if (caps.invariant_tsc && caps.tsc_frequency >= WallClock::CNTFRQ) { + return std::make_unique(caps.tsc_frequency); } else { - return std::make_unique(emulated_cpu_frequency, emulated_clock_frequency, - rtsc_frequency); + // Fallback to StandardWallClock if the hardware TSC + // - Is not invariant + // - Is not more precise than CNTFRQ + return std::make_unique(); } -} - #else - -std::unique_ptr CreateBestMatchingClock(u64 emulated_cpu_frequency, - u64 emulated_clock_frequency) { - return std::make_unique(emulated_cpu_frequency, emulated_clock_frequency); + return std::make_unique(); +#endif } -#endif - -std::unique_ptr CreateStandardWallClock(u64 emulated_cpu_frequency, - u64 emulated_clock_frequency) { - return std::make_unique(emulated_cpu_frequency, emulated_clock_frequency); +std::unique_ptr CreateStandardWallClock() { + return std::make_unique(); } } // namespace Common diff --git a/src/common/wall_clock.h b/src/common/wall_clock.h index b796ea937..6ed085f4d 100755 --- a/src/common/wall_clock.h +++ b/src/common/wall_clock.h @@ -5,6 +5,7 @@ #include #include +#include #include "common/common_types.h" @@ -12,50 +13,60 @@ namespace Common { class WallClock { public: - static constexpr u64 NS_RATIO = 1'000'000'000; - static constexpr u64 US_RATIO = 1'000'000; - static constexpr u64 MS_RATIO = 1'000; + static constexpr u64 CNTFRQ = 19'200'000; // CNTPCT_EL0 Frequency = 19.2 MHz virtual ~WallClock() = default; - /// Returns current wall time in nanoseconds - [[nodiscard]] virtual std::chrono::nanoseconds GetTimeNS() = 0; + /// @returns The time in nanoseconds since the construction of this clock. + virtual std::chrono::nanoseconds GetTimeNS() const = 0; - /// Returns current wall time in microseconds - [[nodiscard]] virtual std::chrono::microseconds GetTimeUS() = 0; + /// @returns The time in microseconds since the construction of this clock. + virtual std::chrono::microseconds GetTimeUS() const = 0; - /// Returns current wall time in milliseconds - [[nodiscard]] virtual std::chrono::milliseconds GetTimeMS() = 0; + /// @returns The time in milliseconds since the construction of this clock. + virtual std::chrono::milliseconds GetTimeMS() const = 0; - /// Returns current wall time in emulated clock cycles - [[nodiscard]] virtual u64 GetClockCycles() = 0; + /// @returns The guest CNTPCT ticks since the construction of this clock. + virtual u64 GetCNTPCT() const = 0; - /// Returns current wall time in emulated cpu cycles - [[nodiscard]] virtual u64 GetCPUCycles() = 0; + /// @returns The raw host timer ticks since an indeterminate epoch. + virtual u64 GetHostTicksNow() const = 0; - virtual void Pause(bool is_paused) = 0; + /// @returns The raw host timer ticks since the construction of this clock. + virtual u64 GetHostTicksElapsed() const = 0; - /// Tells if the wall clock, uses the host CPU's hardware clock - [[nodiscard]] bool IsNative() const { - return is_native; + /// @returns Whether the clock directly uses the host's hardware clock. + virtual bool IsNative() const = 0; + + static inline u64 NSToCNTPCT(u64 ns) { + return ns * NsToCNTPCTRatio::num / NsToCNTPCTRatio::den; + } + + static inline u64 USToCNTPCT(u64 us) { + return us * UsToCNTPCTRatio::num / UsToCNTPCTRatio::den; + } + + static inline u64 CNTPCTToNS(u64 cntpct) { + return cntpct * NsToCNTPCTRatio::den / NsToCNTPCTRatio::num; + } + + static inline u64 CNTPCTToUS(u64 cntpct) { + return cntpct * UsToCNTPCTRatio::den / UsToCNTPCTRatio::num; } protected: - explicit WallClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_, bool is_native_) - : emulated_cpu_frequency{emulated_cpu_frequency_}, - emulated_clock_frequency{emulated_clock_frequency_}, is_native{is_native_} {} + using NsRatio = std::nano; + using UsRatio = std::micro; + using MsRatio = std::milli; - u64 emulated_cpu_frequency; - u64 emulated_clock_frequency; - -private: - bool is_native; + using NsToUsRatio = std::ratio_divide; + using NsToMsRatio = std::ratio_divide; + using NsToCNTPCTRatio = std::ratio; + using UsToCNTPCTRatio = std::ratio; }; -[[nodiscard]] std::unique_ptr CreateBestMatchingClock(u64 emulated_cpu_frequency, - u64 emulated_clock_frequency); +std::unique_ptr CreateOptimalClock(); -[[nodiscard]] std::unique_ptr CreateStandardWallClock(u64 emulated_cpu_frequency, - u64 emulated_clock_frequency); +std::unique_ptr CreateStandardWallClock(); } // namespace Common diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp index 81759cacc..3edfb4369 100755 --- a/src/common/x64/cpu_detect.cpp +++ b/src/common/x64/cpu_detect.cpp @@ -14,6 +14,7 @@ #include "common/common_types.h" #include "common/logging/log.h" #include "common/x64/cpu_detect.h" +#include "common/x64/rdtsc.h" #ifdef _WIN32 #include @@ -187,6 +188,8 @@ static CPUCaps Detect() { caps.tsc_frequency = static_cast(caps.crystal_frequency) * caps.tsc_crystal_ratio_numerator / caps.tsc_crystal_ratio_denominator; + } else { + caps.tsc_frequency = X64::EstimateRDTSCFrequency(); } } diff --git a/src/common/x64/cpu_wait.cpp b/src/common/x64/cpu_wait.cpp index cfeef6a3d..c53dd4945 100755 --- a/src/common/x64/cpu_wait.cpp +++ b/src/common/x64/cpu_wait.cpp @@ -9,19 +9,11 @@ #include "common/x64/cpu_detect.h" #include "common/x64/cpu_wait.h" +#include "common/x64/rdtsc.h" namespace Common::X64 { #ifdef _MSC_VER -__forceinline static u64 FencedRDTSC() { - _mm_lfence(); - _ReadWriteBarrier(); - const u64 result = __rdtsc(); - _mm_lfence(); - _ReadWriteBarrier(); - return result; -} - __forceinline static void TPAUSE() { // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. // For reference: @@ -32,16 +24,6 @@ __forceinline static void TPAUSE() { _tpause(0, FencedRDTSC() + PauseCycles); } #else -static u64 FencedRDTSC() { - u64 eax; - u64 edx; - asm volatile("lfence\n\t" - "rdtsc\n\t" - "lfence\n\t" - : "=a"(eax), "=d"(edx)); - return (edx << 32) | eax; -} - static void TPAUSE() { // 100,000 cycles is a reasonable amount of time to wait to save on CPU resources. // For reference: diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index c1c4e0e74..a90a1faed 100755 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -1,164 +1,45 @@ // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include -#include - -#include "common/atomic_ops.h" -#include "common/steady_clock.h" #include "common/uint128.h" #include "common/x64/native_clock.h" +#include "common/x64/rdtsc.h" -#ifdef _MSC_VER -#include -#endif +namespace Common::X64 { -namespace Common { +NativeClock::NativeClock(u64 rdtsc_frequency_) + : start_ticks{FencedRDTSC()}, rdtsc_frequency{rdtsc_frequency_}, + ns_rdtsc_factor{GetFixedPoint64Factor(NsRatio::den, rdtsc_frequency)}, + us_rdtsc_factor{GetFixedPoint64Factor(UsRatio::den, rdtsc_frequency)}, + ms_rdtsc_factor{GetFixedPoint64Factor(MsRatio::den, rdtsc_frequency)}, + cntpct_rdtsc_factor{GetFixedPoint64Factor(CNTFRQ, rdtsc_frequency)} {} -#ifdef _MSC_VER -__forceinline static u64 FencedRDTSC() { - _mm_lfence(); - _ReadWriteBarrier(); - const u64 result = __rdtsc(); - _mm_lfence(); - _ReadWriteBarrier(); - return result; -} -#else -static u64 FencedRDTSC() { - u64 eax; - u64 edx; - asm volatile("lfence\n\t" - "rdtsc\n\t" - "lfence\n\t" - : "=a"(eax), "=d"(edx)); - return (edx << 32) | eax; -} -#endif - -template -static u64 RoundToNearest(u64 value) { - const auto mod = value % Nearest; - return mod >= (Nearest / 2) ? (value - mod + Nearest) : (value - mod); +std::chrono::nanoseconds NativeClock::GetTimeNS() const { + return std::chrono::nanoseconds{MultiplyHigh(GetHostTicksElapsed(), ns_rdtsc_factor)}; } -u64 EstimateRDTSCFrequency() { - // Discard the first result measuring the rdtsc. - FencedRDTSC(); - std::this_thread::sleep_for(std::chrono::milliseconds{1}); - FencedRDTSC(); - - // Get the current time. - const auto start_time = Common::RealTimeClock::Now(); - const u64 tsc_start = FencedRDTSC(); - // Wait for 250 milliseconds. - std::this_thread::sleep_for(std::chrono::milliseconds{250}); - const auto end_time = Common::RealTimeClock::Now(); - const u64 tsc_end = FencedRDTSC(); - // Calculate differences. - const u64 timer_diff = static_cast( - std::chrono::duration_cast(end_time - start_time).count()); - const u64 tsc_diff = tsc_end - tsc_start; - const u64 tsc_freq = MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff); - return RoundToNearest<1000>(tsc_freq); +std::chrono::microseconds NativeClock::GetTimeUS() const { + return std::chrono::microseconds{MultiplyHigh(GetHostTicksElapsed(), us_rdtsc_factor)}; } -namespace X64 { -NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_, - u64 rtsc_frequency_) - : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{ - rtsc_frequency_} { - // Thread to re-adjust the RDTSC frequency after 10 seconds has elapsed. - time_sync_thread = std::jthread{[this](std::stop_token token) { - // Get the current time. - const auto start_time = Common::RealTimeClock::Now(); - const u64 tsc_start = FencedRDTSC(); - // Wait for 10 seconds. - if (!Common::StoppableTimedWait(token, std::chrono::seconds{10})) { - return; - } - const auto end_time = Common::RealTimeClock::Now(); - const u64 tsc_end = FencedRDTSC(); - // Calculate differences. - const u64 timer_diff = static_cast( - std::chrono::duration_cast(end_time - start_time).count()); - const u64 tsc_diff = tsc_end - tsc_start; - const u64 tsc_freq = MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff); - rtsc_frequency = tsc_freq; - CalculateAndSetFactors(); - }}; - - time_point.inner.last_measure = FencedRDTSC(); - time_point.inner.accumulated_ticks = 0U; - CalculateAndSetFactors(); +std::chrono::milliseconds NativeClock::GetTimeMS() const { + return std::chrono::milliseconds{MultiplyHigh(GetHostTicksElapsed(), ms_rdtsc_factor)}; } -u64 NativeClock::GetRTSC() { - TimePoint new_time_point{}; - TimePoint current_time_point{}; - - current_time_point.pack = Common::AtomicLoad128(time_point.pack.data()); - do { - const u64 current_measure = FencedRDTSC(); - u64 diff = current_measure - current_time_point.inner.last_measure; - diff = diff & ~static_cast(static_cast(diff) >> 63); // max(diff, 0) - new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure - ? current_measure - : current_time_point.inner.last_measure; - new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff; - } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, - current_time_point.pack, current_time_point.pack)); - return new_time_point.inner.accumulated_ticks; +u64 NativeClock::GetCNTPCT() const { + return MultiplyHigh(GetHostTicksElapsed(), cntpct_rdtsc_factor); } -void NativeClock::Pause(bool is_paused) { - if (!is_paused) { - TimePoint current_time_point{}; - TimePoint new_time_point{}; - - current_time_point.pack = Common::AtomicLoad128(time_point.pack.data()); - do { - new_time_point.pack = current_time_point.pack; - new_time_point.inner.last_measure = FencedRDTSC(); - } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, - current_time_point.pack, current_time_point.pack)); - } +u64 NativeClock::GetHostTicksNow() const { + return FencedRDTSC(); } -std::chrono::nanoseconds NativeClock::GetTimeNS() { - const u64 rtsc_value = GetRTSC(); - return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)}; +u64 NativeClock::GetHostTicksElapsed() const { + return FencedRDTSC() - start_ticks; } -std::chrono::microseconds NativeClock::GetTimeUS() { - const u64 rtsc_value = GetRTSC(); - return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)}; +bool NativeClock::IsNative() const { + return true; } -std::chrono::milliseconds NativeClock::GetTimeMS() { - const u64 rtsc_value = GetRTSC(); - return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)}; -} - -u64 NativeClock::GetClockCycles() { - const u64 rtsc_value = GetRTSC(); - return MultiplyHigh(rtsc_value, clock_rtsc_factor); -} - -u64 NativeClock::GetCPUCycles() { - const u64 rtsc_value = GetRTSC(); - return MultiplyHigh(rtsc_value, cpu_rtsc_factor); -} - -void NativeClock::CalculateAndSetFactors() { - ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency); - us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency); - ms_rtsc_factor = GetFixedPoint64Factor(MS_RATIO, rtsc_frequency); - clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency); - cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency); -} - -} // namespace X64 - -} // namespace Common +} // namespace Common::X64 diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h index 5504eb3c1..3b487d442 100755 --- a/src/common/x64/native_clock.h +++ b/src/common/x64/native_clock.h @@ -3,58 +3,36 @@ #pragma once -#include "common/polyfill_thread.h" #include "common/wall_clock.h" -namespace Common { +namespace Common::X64 { -namespace X64 { class NativeClock final : public WallClock { public: - explicit NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequency_, - u64 rtsc_frequency_); + explicit NativeClock(u64 rdtsc_frequency_); - std::chrono::nanoseconds GetTimeNS() override; + std::chrono::nanoseconds GetTimeNS() const override; - std::chrono::microseconds GetTimeUS() override; + std::chrono::microseconds GetTimeUS() const override; - std::chrono::milliseconds GetTimeMS() override; + std::chrono::milliseconds GetTimeMS() const override; - u64 GetClockCycles() override; + u64 GetCNTPCT() const override; - u64 GetCPUCycles() override; + u64 GetHostTicksNow() const override; - void Pause(bool is_paused) override; + u64 GetHostTicksElapsed() const override; + + bool IsNative() const override; private: - u64 GetRTSC(); + u64 start_ticks; + u64 rdtsc_frequency; - void CalculateAndSetFactors(); - - union alignas(16) TimePoint { - TimePoint() : pack{} {} - u128 pack{}; - struct Inner { - u64 last_measure{}; - u64 accumulated_ticks{}; - } inner; - }; - - TimePoint time_point; - - // factors - u64 clock_rtsc_factor{}; - u64 cpu_rtsc_factor{}; - u64 ns_rtsc_factor{}; - u64 us_rtsc_factor{}; - u64 ms_rtsc_factor{}; - - u64 rtsc_frequency; - - std::jthread time_sync_thread; + u64 ns_rdtsc_factor; + u64 us_rdtsc_factor; + u64 ms_rdtsc_factor; + u64 cntpct_rdtsc_factor; }; -} // namespace X64 -u64 EstimateRDTSCFrequency(); - -} // namespace Common +} // namespace Common::X64 diff --git a/src/common/x64/rdtsc.cpp b/src/common/x64/rdtsc.cpp new file mode 100755 index 000000000..9273274a3 --- /dev/null +++ b/src/common/x64/rdtsc.cpp @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include + +#include "common/steady_clock.h" +#include "common/uint128.h" +#include "common/x64/rdtsc.h" + +namespace Common::X64 { + +template +static u64 RoundToNearest(u64 value) { + const auto mod = value % Nearest; + return mod >= (Nearest / 2) ? (value - mod + Nearest) : (value - mod); +} + +u64 EstimateRDTSCFrequency() { + // Discard the first result measuring the rdtsc. + FencedRDTSC(); + std::this_thread::sleep_for(std::chrono::milliseconds{1}); + FencedRDTSC(); + + // Get the current time. + const auto start_time = RealTimeClock::Now(); + const u64 tsc_start = FencedRDTSC(); + // Wait for 100 milliseconds. + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + const auto end_time = RealTimeClock::Now(); + const u64 tsc_end = FencedRDTSC(); + // Calculate differences. + const u64 timer_diff = static_cast( + std::chrono::duration_cast(end_time - start_time).count()); + const u64 tsc_diff = tsc_end - tsc_start; + const u64 tsc_freq = MultiplyAndDivide64(tsc_diff, 1000000000ULL, timer_diff); + return RoundToNearest<100'000>(tsc_freq); +} + +} // namespace Common::X64 diff --git a/src/common/x64/rdtsc.h b/src/common/x64/rdtsc.h new file mode 100755 index 000000000..0ec4f52f9 --- /dev/null +++ b/src/common/x64/rdtsc.h @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#ifdef _MSC_VER +#include +#endif + +#include "common/common_types.h" + +namespace Common::X64 { + +#ifdef _MSC_VER +__forceinline static u64 FencedRDTSC() { + _mm_lfence(); + _ReadWriteBarrier(); + const u64 result = __rdtsc(); + _mm_lfence(); + _ReadWriteBarrier(); + return result; +} +#else +static inline u64 FencedRDTSC() { + u64 eax; + u64 edx; + asm volatile("lfence\n\t" + "rdtsc\n\t" + "lfence\n\t" + : "=a"(eax), "=d"(edx)); + return (edx << 32) | eax; +} +#endif + +u64 EstimateRDTSCFrequency(); + +} // namespace Common::X64 diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 21b9c1c2d..0e0df31b6 100755 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -16,7 +16,6 @@ add_library(core STATIC core.h core_timing.cpp core_timing.h - core_timing_util.h cpu_manager.cpp cpu_manager.h crypto/aes_util.cpp diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index 3e74ed6cd..b65f52f93 100755 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -16,7 +16,6 @@ #include "common/microprofile.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" #include "core/hardware_properties.h" namespace Core::Timing { @@ -45,9 +44,7 @@ struct CoreTiming::Event { } }; -CoreTiming::CoreTiming() - : cpu_clock{Common::CreateBestMatchingClock(Hardware::BASE_CLOCK_RATE, Hardware::CNTFREQ)}, - event_clock{Common::CreateStandardWallClock(Hardware::BASE_CLOCK_RATE, Hardware::CNTFREQ)} {} +CoreTiming::CoreTiming() : clock{Common::CreateOptimalClock()} {} CoreTiming::~CoreTiming() { Reset(); @@ -180,7 +177,7 @@ void CoreTiming::AddTicks(u64 ticks_to_add) { void CoreTiming::Idle() { if (!event_queue.empty()) { const u64 next_event_time = event_queue.front().time; - const u64 next_ticks = nsToCycles(std::chrono::nanoseconds(next_event_time)) + 10U; + const u64 next_ticks = Common::WallClock::NSToCNTPCT(next_event_time) + 10U; if (next_ticks > ticks) { ticks = next_ticks; } @@ -193,18 +190,11 @@ void CoreTiming::ResetTicks() { downcount = MAX_SLICE_LENGTH; } -u64 CoreTiming::GetCPUTicks() const { - if (is_multicore) [[likely]] { - return cpu_clock->GetCPUCycles(); - } - return ticks; -} - u64 CoreTiming::GetClockTicks() const { if (is_multicore) [[likely]] { - return cpu_clock->GetClockCycles(); + return clock->GetCNTPCT(); } - return CpuCyclesToClockCycles(ticks); + return ticks; } std::optional CoreTiming::Advance() { @@ -297,9 +287,7 @@ void CoreTiming::ThreadLoop() { } paused_set = true; - event_clock->Pause(true); pause_event.Wait(); - event_clock->Pause(false); } } @@ -315,25 +303,18 @@ void CoreTiming::Reset() { has_started = false; } -std::chrono::nanoseconds CoreTiming::GetCPUTimeNs() const { - if (is_multicore) [[likely]] { - return cpu_clock->GetTimeNS(); - } - return CyclesToNs(ticks); -} - std::chrono::nanoseconds CoreTiming::GetGlobalTimeNs() const { if (is_multicore) [[likely]] { - return event_clock->GetTimeNS(); + return clock->GetTimeNS(); } - return CyclesToNs(ticks); + return std::chrono::nanoseconds{Common::WallClock::CNTPCTToNS(ticks)}; } std::chrono::microseconds CoreTiming::GetGlobalTimeUs() const { if (is_multicore) [[likely]] { - return event_clock->GetTimeUS(); + return clock->GetTimeUS(); } - return CyclesToUs(ticks); + return std::chrono::microseconds{Common::WallClock::CNTPCTToUS(ticks)}; } } // namespace Core::Timing diff --git a/src/core/core_timing.h b/src/core/core_timing.h index 2601e9918..5ce1f0ff1 100755 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -116,15 +116,9 @@ public: return downcount; } - /// Returns current time in emulated CPU cycles - u64 GetCPUTicks() const; - - /// Returns current time in emulated in Clock cycles + /// Returns the current CNTPCT tick value. u64 GetClockTicks() const; - /// Returns current time in nanoseconds. - std::chrono::nanoseconds GetCPUTimeNs() const; - /// Returns current time in microseconds. std::chrono::microseconds GetGlobalTimeUs() const; @@ -142,8 +136,7 @@ private: void Reset(); - std::unique_ptr cpu_clock; - std::unique_ptr event_clock; + std::unique_ptr clock; s64 global_timer = 0; diff --git a/src/core/hle/kernel/k_scheduler.cpp b/src/core/hle/kernel/k_scheduler.cpp index 106817c8d..7379af9c0 100755 --- a/src/core/hle/kernel/k_scheduler.cpp +++ b/src/core/hle/kernel/k_scheduler.cpp @@ -184,7 +184,8 @@ u64 KScheduler::UpdateHighestPriorityThread(KThread* highest_thread) { prev_highest_thread != highest_thread) [[likely]] { if (prev_highest_thread != nullptr) [[likely]] { IncrementScheduledCount(prev_highest_thread); - prev_highest_thread->SetLastScheduledTick(m_kernel.System().CoreTiming().GetCPUTicks()); + prev_highest_thread->SetLastScheduledTick( + m_kernel.System().CoreTiming().GetClockTicks()); } if (m_state.should_count_idle) { if (highest_thread != nullptr) [[likely]] { @@ -351,7 +352,7 @@ void KScheduler::SwitchThread(KThread* next_thread) { // Update the CPU time tracking variables. const s64 prev_tick = m_last_context_switch_time; - const s64 cur_tick = m_kernel.System().CoreTiming().GetCPUTicks(); + const s64 cur_tick = m_kernel.System().CoreTiming().GetClockTicks(); const s64 tick_diff = cur_tick - prev_tick; cur_thread->AddCpuTime(m_core_id, tick_diff); if (cur_process != nullptr) { diff --git a/src/core/hle/kernel/svc/svc_info.cpp b/src/core/hle/kernel/svc/svc_info.cpp index 2b2c878b5..445cdd87b 100755 --- a/src/core/hle/kernel/svc/svc_info.cpp +++ b/src/core/hle/kernel/svc/svc_info.cpp @@ -199,9 +199,9 @@ Result GetInfo(Core::System& system, u64* result, InfoType info_id_type, Handle if (same_thread && info_sub_id == 0xFFFFFFFFFFFFFFFF) { const u64 thread_ticks = current_thread->GetCpuTime(); - out_ticks = thread_ticks + (core_timing.GetCPUTicks() - prev_ctx_ticks); + out_ticks = thread_ticks + (core_timing.GetClockTicks() - prev_ctx_ticks); } else if (same_thread && info_sub_id == system.Kernel().CurrentPhysicalCoreIndex()) { - out_ticks = core_timing.GetCPUTicks() - prev_ctx_ticks; + out_ticks = core_timing.GetClockTicks() - prev_ctx_ticks; } *result = out_ticks; diff --git a/src/core/hle/service/hid/hidbus.cpp b/src/core/hle/service/hid/hidbus.cpp index 91ab51746..9442d8d0c 100755 --- a/src/core/hle/service/hid/hidbus.cpp +++ b/src/core/hle/service/hid/hidbus.cpp @@ -5,7 +5,6 @@ #include "common/settings.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" #include "core/hid/hid_types.h" #include "core/hle/kernel/k_event.h" #include "core/hle/kernel/k_readable_event.h" diff --git a/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp b/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp index 51291539d..8b37d8663 100755 --- a/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp +++ b/src/core/hle/service/nvnflinger/buffer_queue_consumer.cpp @@ -92,6 +92,14 @@ Status BufferQueueConsumer::AcquireBuffer(BufferItem* out_buffer, LOG_DEBUG(Service_Nvnflinger, "acquiring slot={}", slot); + // If the front buffer is still being tracked, update its slot state + if (core->StillTracking(*front)) { + slots[slot].acquire_called = true; + slots[slot].needs_cleanup_on_release = false; + slots[slot].buffer_state = BufferState::Acquired; + slots[slot].fence = Fence::NoFence(); + } + // If the buffer has previously been acquired by the consumer, set graphic_buffer to nullptr to // avoid unnecessarily remapping this buffer on the consumer side. if (out_buffer->acquire_called) { @@ -134,13 +142,29 @@ Status BufferQueueConsumer::ReleaseBuffer(s32 slot, u64 frame_number, const Fenc ++current; } - slots[slot].buffer_state = BufferState::Free; + if (slots[slot].buffer_state == BufferState::Acquired) { + slots[slot].fence = release_fence; + slots[slot].buffer_state = BufferState::Free; - nvmap.FreeHandle(slots[slot].graphic_buffer->BufferId(), true); + nvmap.FreeHandle(slots[slot].graphic_buffer->BufferId(), true); - listener = core->connected_producer_listener; + listener = core->connected_producer_listener; - LOG_DEBUG(Service_Nvnflinger, "releasing slot {}", slot); + LOG_DEBUG(Service_Nvnflinger, "releasing slot {}", slot); + } else if (slots[slot].needs_cleanup_on_release) { + LOG_DEBUG(Service_Nvnflinger, "releasing a stale buffer slot {} (state = {})", slot, + slots[slot].buffer_state); + + slots[slot].needs_cleanup_on_release = false; + + return Status::StaleBufferSlot; + } else { + LOG_ERROR(Service_Nvnflinger, + "attempted to release buffer slot {} but its state was {}", slot, + slots[slot].buffer_state); + + return Status::BadValue; + } core->SignalDequeueCondition(); } diff --git a/src/core/hle/service/nvnflinger/buffer_queue_core.cpp b/src/core/hle/service/nvnflinger/buffer_queue_core.cpp index 2dbe29616..4745c4980 100755 --- a/src/core/hle/service/nvnflinger/buffer_queue_core.cpp +++ b/src/core/hle/service/nvnflinger/buffer_queue_core.cpp @@ -86,6 +86,10 @@ void BufferQueueCore::FreeBufferLocked(s32 slot) { slots[slot].graphic_buffer.reset(); + if (slots[slot].buffer_state == BufferState::Acquired) { + slots[slot].needs_cleanup_on_release = true; + } + slots[slot].buffer_state = BufferState::Free; slots[slot].frame_number = UINT32_MAX; slots[slot].acquire_called = false; diff --git a/src/core/hle/service/nvnflinger/buffer_slot.h b/src/core/hle/service/nvnflinger/buffer_slot.h index d25bca049..37daca78b 100755 --- a/src/core/hle/service/nvnflinger/buffer_slot.h +++ b/src/core/hle/service/nvnflinger/buffer_slot.h @@ -31,6 +31,7 @@ struct BufferSlot final { u64 frame_number{}; Fence fence; bool acquire_called{}; + bool needs_cleanup_on_release{}; bool attached_by_consumer{}; bool is_preallocated{}; }; diff --git a/src/core/hle/service/nvnflinger/nvnflinger.cpp b/src/core/hle/service/nvnflinger/nvnflinger.cpp index 4988e6e17..64ea0d790 100755 --- a/src/core/hle/service/nvnflinger/nvnflinger.cpp +++ b/src/core/hle/service/nvnflinger/nvnflinger.cpp @@ -46,11 +46,8 @@ void Nvnflinger::SplitVSync(std::stop_token stop_token) { vsync_signal.wait(false); vsync_signal.store(false); - guard->lock(); - + const auto lock_guard = Lock(); Compose(); - - guard->unlock(); } } @@ -70,7 +67,9 @@ Nvnflinger::Nvnflinger(Core::System& system_, HosBinderDriverServer& hos_binder_ [this](std::uintptr_t, s64 time, std::chrono::nanoseconds ns_late) -> std::optional { vsync_signal.store(true); - vsync_signal.notify_all(); + const auto lock_guard = Lock(); + vsync_signal.notify_one(); + return std::chrono::nanoseconds(GetNextTicks()); }); @@ -267,8 +266,9 @@ void Nvnflinger::Compose() { SCOPE_EXIT({ display.SignalVSyncEvent(); }); // Don't do anything for displays without layers. - if (!display.HasLayers()) + if (!display.HasLayers()) { continue; + } // TODO(Subv): Support more than 1 layer. VI::Layer& layer = display.GetLayer(0); diff --git a/src/core/hle/service/time/clock_types.h b/src/core/hle/service/time/clock_types.h index 788bbff2b..0691dba20 100755 --- a/src/core/hle/service/time/clock_types.h +++ b/src/core/hle/service/time/clock_types.h @@ -3,6 +3,8 @@ #pragma once +#include + #include "common/common_funcs.h" #include "common/common_types.h" #include "common/uuid.h" @@ -62,18 +64,19 @@ static_assert(std::is_trivially_copyable_v, /// https://switchbrew.org/wiki/Glue_services#TimeSpanType struct TimeSpanType { s64 nanoseconds{}; - static constexpr s64 ns_per_second{1000000000ULL}; s64 ToSeconds() const { - return nanoseconds / ns_per_second; + return nanoseconds / std::nano::den; } static TimeSpanType FromSeconds(s64 seconds) { - return {seconds * ns_per_second}; + return {seconds * std::nano::den}; } - static TimeSpanType FromTicks(u64 ticks, u64 frequency) { - return FromSeconds(static_cast(ticks) / static_cast(frequency)); + template + static TimeSpanType FromTicks(u64 ticks) { + using TicksToNSRatio = std::ratio; + return {static_cast(ticks * TicksToNSRatio::num / TicksToNSRatio::den)}; } }; static_assert(sizeof(TimeSpanType) == 8, "TimeSpanType is incorrect size"); diff --git a/src/core/hle/service/time/standard_steady_clock_core.cpp b/src/core/hle/service/time/standard_steady_clock_core.cpp index af1afe605..650c2497a 100755 --- a/src/core/hle/service/time/standard_steady_clock_core.cpp +++ b/src/core/hle/service/time/standard_steady_clock_core.cpp @@ -10,7 +10,7 @@ namespace Service::Time::Clock { TimeSpanType StandardSteadyClockCore::GetCurrentRawTimePoint(Core::System& system) { const TimeSpanType ticks_time_span{ - TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)}; + TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks())}; TimeSpanType raw_time_point{setup_value.nanoseconds + ticks_time_span.nanoseconds}; if (raw_time_point.nanoseconds < cached_raw_time_point.nanoseconds) { diff --git a/src/core/hle/service/time/tick_based_steady_clock_core.cpp b/src/core/hle/service/time/tick_based_steady_clock_core.cpp index bb8473cf8..b1193b072 100755 --- a/src/core/hle/service/time/tick_based_steady_clock_core.cpp +++ b/src/core/hle/service/time/tick_based_steady_clock_core.cpp @@ -10,7 +10,7 @@ namespace Service::Time::Clock { SteadyClockTimePoint TickBasedSteadyClockCore::GetTimePoint(Core::System& system) { const TimeSpanType ticks_time_span{ - TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)}; + TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks())}; return {ticks_time_span.ToSeconds(), GetClockSourceId()}; } diff --git a/src/core/hle/service/time/time.cpp b/src/core/hle/service/time/time.cpp index fce69204e..2a5161968 100755 --- a/src/core/hle/service/time/time.cpp +++ b/src/core/hle/service/time/time.cpp @@ -240,8 +240,8 @@ void Module::Interface::CalculateMonotonicSystemClockBaseTimePoint(HLERequestCon const auto current_time_point{steady_clock_core.GetCurrentTimePoint(system)}; if (current_time_point.clock_source_id == context.steady_time_point.clock_source_id) { - const auto ticks{Clock::TimeSpanType::FromTicks(system.CoreTiming().GetClockTicks(), - Core::Hardware::CNTFREQ)}; + const auto ticks{Clock::TimeSpanType::FromTicks( + system.CoreTiming().GetClockTicks())}; const s64 base_time_point{context.offset + current_time_point.time_point - ticks.ToSeconds()}; IPC::ResponseBuilder rb{ctx, (sizeof(s64) / 4) + 2}; diff --git a/src/core/hle/service/time/time_sharedmemory.cpp b/src/core/hle/service/time/time_sharedmemory.cpp index 73993b0c2..531836b11 100755 --- a/src/core/hle/service/time/time_sharedmemory.cpp +++ b/src/core/hle/service/time/time_sharedmemory.cpp @@ -21,8 +21,9 @@ SharedMemory::~SharedMemory() = default; void SharedMemory::SetupStandardSteadyClock(const Common::UUID& clock_source_id, Clock::TimeSpanType current_time_point) { - const Clock::TimeSpanType ticks_time_span{Clock::TimeSpanType::FromTicks( - system.CoreTiming().GetClockTicks(), Core::Hardware::CNTFREQ)}; + const Clock::TimeSpanType ticks_time_span{ + Clock::TimeSpanType::FromTicks( + system.CoreTiming().GetClockTicks())}; const Clock::SteadyClockContext context{ static_cast(current_time_point.nanoseconds - ticks_time_span.nanoseconds), clock_source_id}; diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 00c181a00..052ef42e6 100755 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -15,7 +15,7 @@ add_executable(tests core/core_timing.cpp core/internal_network/network.cpp precompiled_headers.h - video_core/buffer_base.cpp + video_core/memory_tracker.cpp input_common/calibration_configuration_job.cpp ) diff --git a/src/tests/video_core/memory_tracker.cpp b/src/tests/video_core/memory_tracker.cpp new file mode 100755 index 000000000..77d391f15 --- /dev/null +++ b/src/tests/video_core/memory_tracker.cpp @@ -0,0 +1,547 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include +#include +#include + +#include + +#include "common/alignment.h" +#include "common/common_types.h" +#include "video_core/buffer_cache/memory_tracker_base.h" + +namespace { +using Range = std::pair; + +constexpr u64 PAGE = 4096; +constexpr u64 WORD = 4096 * 64; +constexpr u64 HIGH_PAGE_BITS = 22; +constexpr u64 HIGH_PAGE_SIZE = 1ULL << HIGH_PAGE_BITS; + +constexpr VAddr c = 16 * HIGH_PAGE_SIZE; + +class RasterizerInterface { +public: + void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) { + const u64 page_start{addr >> Core::Memory::YUZU_PAGEBITS}; + const u64 page_end{(addr + size + Core::Memory::YUZU_PAGESIZE - 1) >> + Core::Memory::YUZU_PAGEBITS}; + for (u64 page = page_start; page < page_end; ++page) { + int& value = page_table[page]; + value += delta; + if (value < 0) { + throw std::logic_error{"negative page"}; + } + if (value == 0) { + page_table.erase(page); + } + } + } + + [[nodiscard]] int Count(VAddr addr) const noexcept { + const auto it = page_table.find(addr >> Core::Memory::YUZU_PAGEBITS); + return it == page_table.end() ? 0 : it->second; + } + + [[nodiscard]] unsigned Count() const noexcept { + unsigned count = 0; + for (const auto& [index, value] : page_table) { + count += value; + } + return count; + } + +private: + std::unordered_map page_table; +}; +} // Anonymous namespace + +using MemoryTracker = VideoCommon::MemoryTrackerBase; + +TEST_CASE("MemoryTracker: Small region", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + REQUIRE(rasterizer.Count() == 0); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == WORD / PAGE); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD) == Range{0, 0}); + + memory_track->MarkRegionAsCpuModified(c + PAGE, 1); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD) == Range{c + PAGE * 1, c + PAGE * 2}); +} + +TEST_CASE("MemoryTracker: Large region", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 32); + memory_track->MarkRegionAsCpuModified(c + 4096, WORD * 4); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD + PAGE * 2) == + Range{c + PAGE, c + WORD + PAGE * 2}); + REQUIRE(memory_track->ModifiedCpuRegion(c + PAGE * 2, PAGE * 6) == + Range{c + PAGE * 2, c + PAGE * 8}); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD * 32) == Range{c + PAGE, c + WORD * 4 + PAGE}); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 4, PAGE) == + Range{c + WORD * 4, c + WORD * 4 + PAGE}); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 3 + PAGE * 63, PAGE) == + Range{c + WORD * 3 + PAGE * 63, c + WORD * 4}); + + memory_track->MarkRegionAsCpuModified(c + WORD * 5 + PAGE * 6, PAGE); + memory_track->MarkRegionAsCpuModified(c + WORD * 5 + PAGE * 8, PAGE); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 5, WORD) == + Range{c + WORD * 5 + PAGE * 6, c + WORD * 5 + PAGE * 9}); + + memory_track->UnmarkRegionAsCpuModified(c + WORD * 5 + PAGE * 8, PAGE); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 5, WORD) == + Range{c + WORD * 5 + PAGE * 6, c + WORD * 5 + PAGE * 7}); + + memory_track->MarkRegionAsCpuModified(c + PAGE, WORD * 31 + PAGE * 63); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD * 32) == Range{c + PAGE, c + WORD * 32}); + + memory_track->UnmarkRegionAsCpuModified(c + PAGE * 4, PAGE); + memory_track->UnmarkRegionAsCpuModified(c + PAGE * 6, PAGE); + + memory_track->UnmarkRegionAsCpuModified(c, WORD * 32); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD * 32) == Range{0, 0}); +} + +TEST_CASE("MemoryTracker: Rasterizer counting", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + REQUIRE(rasterizer.Count() == 0); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + REQUIRE(rasterizer.Count() == 1); + memory_track->MarkRegionAsCpuModified(c, PAGE * 2); + REQUIRE(rasterizer.Count() == 0); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + memory_track->UnmarkRegionAsCpuModified(c + PAGE, PAGE); + REQUIRE(rasterizer.Count() == 2); + memory_track->MarkRegionAsCpuModified(c, PAGE * 2); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Basic range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->MarkRegionAsCpuModified(c, PAGE); + int num = 0; + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c); + REQUIRE(size == PAGE); + ++num; + }); + REQUIRE(num == 1U); +} + +TEST_CASE("MemoryTracker: Border upload", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c, WORD * 2, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); +} + +TEST_CASE("MemoryTracker: Border upload range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - PAGE, PAGE * 2, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - PAGE, PAGE, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE); + }); + memory_track->ForEachUploadRange(c + WORD, PAGE, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD); + REQUIRE(size == PAGE); + }); +} + +TEST_CASE("MemoryTracker: Border upload partial range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - 1, 2, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - 1, 1, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE); + }); + memory_track->ForEachUploadRange(c + WORD + 50, 1, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD); + REQUIRE(size == PAGE); + }); +} + +TEST_CASE("MemoryTracker: Partial word uploads", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + int num = 0; + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c); + REQUIRE(size == WORD); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + WORD, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD); + REQUIRE(size == WORD); + ++num; + }); + REQUIRE(num == 2); + memory_track->ForEachUploadRange(c + 0x79000, 0x24000, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD * 2); + REQUIRE(size == PAGE * 0x1d); + ++num; + }); + REQUIRE(num == 3); +} + +TEST_CASE("MemoryTracker: Partial page upload", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + int num = 0; + memory_track->MarkRegionAsCpuModified(c + PAGE * 2, PAGE); + memory_track->MarkRegionAsCpuModified(c + PAGE * 9, PAGE); + memory_track->ForEachUploadRange(c, PAGE * 3, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 2); + REQUIRE(size == PAGE); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + PAGE * 7, PAGE * 3, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 9); + REQUIRE(size == PAGE); + ++num; + }); + REQUIRE(num == 2); +} + +TEST_CASE("MemoryTracker: Partial page upload with multiple words on the right") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 9); + memory_track->MarkRegionAsCpuModified(c + PAGE * 13, WORD * 7); + int num = 0; + memory_track->ForEachUploadRange(c + PAGE * 10, WORD * 7, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 13); + REQUIRE(size == WORD * 7 - PAGE * 3); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + PAGE, WORD * 8, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD * 7 + PAGE * 10); + REQUIRE(size == PAGE * 3); + ++num; + }); + REQUIRE(num == 2); +} + +TEST_CASE("MemoryTracker: Partial page upload with multiple words on the left", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 8); + memory_track->MarkRegionAsCpuModified(c + PAGE * 13, WORD * 7); + int num = 0; + memory_track->ForEachUploadRange(c + PAGE * 16, WORD * 7, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 16); + REQUIRE(size == WORD * 7 - PAGE * 3); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + PAGE, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 13); + REQUIRE(size == PAGE * 3); + ++num; + }); + REQUIRE(num == 2); +} + +TEST_CASE("MemoryTracker: Partial page upload with multiple words in the middle", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 8); + memory_track->MarkRegionAsCpuModified(c + PAGE * 13, PAGE * 140); + int num = 0; + memory_track->ForEachUploadRange(c + PAGE * 16, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 16); + REQUIRE(size == WORD); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 13); + REQUIRE(size == PAGE * 3); + ++num; + }); + REQUIRE(num == 2); + memory_track->ForEachUploadRange(c, WORD * 8, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD + PAGE * 16); + REQUIRE(size == PAGE * 73); + ++num; + }); + REQUIRE(num == 3); +} + +TEST_CASE("MemoryTracker: Empty right bits", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2048); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c, WORD * 2048, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); +} + +TEST_CASE("MemoryTracker: Out of bound ranges 1", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c - WORD, 3 * WORD); + memory_track->MarkRegionAsCpuModified(c, PAGE); + REQUIRE(rasterizer.Count() == (3 * WORD - PAGE) / PAGE); + int num = 0; + memory_track->ForEachUploadRange(c - WORD, WORD, [&](u64 offset, u64 size) { ++num; }); + memory_track->ForEachUploadRange(c + WORD, WORD, [&](u64 offset, u64 size) { ++num; }); + memory_track->ForEachUploadRange(c - PAGE, PAGE, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + memory_track->ForEachUploadRange(c - PAGE, PAGE * 2, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 1); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 2 * WORD / PAGE); +} + +TEST_CASE("MemoryTracker: Out of bound ranges 2", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c + 0x22000, PAGE)); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c + 0x28000, PAGE)); + REQUIRE(rasterizer.Count() == 2); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c + 0x21100, PAGE - 0x100)); + REQUIRE(rasterizer.Count() == 3); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c - PAGE, PAGE * 2)); + memory_track->UnmarkRegionAsCpuModified(c - PAGE * 3, PAGE * 2); + memory_track->UnmarkRegionAsCpuModified(c - PAGE * 2, PAGE * 2); + REQUIRE(rasterizer.Count() == 7); +} + +TEST_CASE("MemoryTracker: Out of bound ranges 3", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, 0x310720); + REQUIRE(rasterizer.Count(c) == 1); + REQUIRE(rasterizer.Count(c + PAGE) == 1); + REQUIRE(rasterizer.Count(c + WORD) == 1); + REQUIRE(rasterizer.Count(c + WORD + PAGE) == 1); +} + +TEST_CASE("MemoryTracker: Sparse regions 1", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->MarkRegionAsCpuModified(c + PAGE * 1, PAGE); + memory_track->MarkRegionAsCpuModified(c + PAGE * 3, PAGE * 4); + memory_track->ForEachUploadRange(c, WORD, [i = 0](u64 offset, u64 size) mutable { + static constexpr std::array offsets{c + PAGE, c + PAGE * 3}; + static constexpr std::array sizes{PAGE, PAGE * 4}; + REQUIRE(offset == offsets.at(i)); + REQUIRE(size == sizes.at(i)); + ++i; + }); +} + +TEST_CASE("MemoryTracker: Sparse regions 2", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, PAGE * 0x23); + REQUIRE(rasterizer.Count() == 0x23); + memory_track->MarkRegionAsCpuModified(c + PAGE * 0x1B, PAGE); + memory_track->MarkRegionAsCpuModified(c + PAGE * 0x21, PAGE); + memory_track->ForEachUploadRange(c, PAGE * 0x23, [i = 0](u64 offset, u64 size) mutable { + static constexpr std::array offsets{c + PAGE * 0x1B, c + PAGE * 0x21}; + static constexpr std::array sizes{PAGE, PAGE}; + REQUIRE(offset == offsets.at(i)); + REQUIRE(size == sizes.at(i)); + ++i; + }); +} + +TEST_CASE("MemoryTracker: Single page modified range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE)); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c, PAGE)); +} + +TEST_CASE("MemoryTracker: Two page modified range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE * 2)); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c, PAGE)); +} + +TEST_CASE("MemoryTracker: Multi word modified ranges", "[video_core]") { + for (int offset = 0; offset < 4; ++offset) { + const VAddr address = c + WORD * offset; + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + REQUIRE(memory_track->IsRegionCpuModified(address, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 48, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 56, PAGE)); + + memory_track->UnmarkRegionAsCpuModified(address + PAGE * 32, PAGE); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE, WORD)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 31, PAGE)); + REQUIRE(!memory_track->IsRegionCpuModified(address + PAGE * 32, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 33, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 31, PAGE * 2)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 32, PAGE * 2)); + + memory_track->UnmarkRegionAsCpuModified(address + PAGE * 33, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(address + PAGE * 32, PAGE * 2)); + } +} + +TEST_CASE("MemoryTracker: Single page in large region", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 16); + REQUIRE(!memory_track->IsRegionCpuModified(c, WORD * 16)); + + memory_track->MarkRegionAsCpuModified(c + WORD * 12 + PAGE * 8, PAGE); + REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 16)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 10, WORD * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 11, WORD * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12, WORD * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 4, PAGE * 8)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 6, PAGE * 8)); + REQUIRE(!memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 6, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 7, PAGE * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 8, PAGE * 2)); +} + +TEST_CASE("MemoryTracker: Wrap word regions") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 32); + memory_track->MarkRegionAsCpuModified(c + PAGE * 63, PAGE * 2); + REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 2)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 62, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 63, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 64, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 63, PAGE * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 63, PAGE * 8)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 60, PAGE * 8)); + + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 127, WORD * 16)); + memory_track->MarkRegionAsCpuModified(c + PAGE * 127, PAGE); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 127, WORD * 16)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 127, PAGE)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 126, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 126, PAGE * 2)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 128, WORD * 16)); +} + +TEST_CASE("MemoryTracker: Unaligned page region query") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->MarkRegionAsCpuModified(c + 4000, 1000); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + 4000, 1000)); + REQUIRE(memory_track->IsRegionCpuModified(c + 4000, 1)); +} + +TEST_CASE("MemoryTracker: Cached write") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, c + PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Multiple cached write") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + memory_track->CachedCpuWrite(c + PAGE * 3, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 3, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 3, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Cached write unmarked") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + memory_track->UnmarkRegionAsCpuModified(c + PAGE, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Cached write iterated") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + int num = 0; + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Cached write downloads") { + RasterizerInterface rasterizer; + std::unique_ptr memory_track(std::make_unique(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 64); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + REQUIRE(rasterizer.Count() == 63); + memory_track->MarkRegionAsGpuModified(c + PAGE, PAGE); + int num = 0; + memory_track->ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!memory_track->IsRegionGpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!memory_track->IsRegionGpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} \ No newline at end of file diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 828f00911..b72e2a647 100755 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -11,8 +11,11 @@ endif() add_library(video_core STATIC buffer_cache/buffer_base.h + buffer_cache/buffer_cache_base.h buffer_cache/buffer_cache.cpp buffer_cache/buffer_cache.h + buffer_cache/memory_tracker_base.h + buffer_cache/word_manager.h cache_types.h cdma_pusher.cpp cdma_pusher.h @@ -104,6 +107,7 @@ add_library(video_core STATIC renderer_null/renderer_null.h renderer_opengl/blit_image.cpp renderer_opengl/blit_image.h + renderer_opengl/gl_buffer_cache_base.cpp renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_compute_pipeline.cpp @@ -154,6 +158,7 @@ add_library(video_core STATIC renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h + renderer_vulkan/vk_buffer_cache_base.cpp renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h renderer_vulkan/vk_command_pool.cpp diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 22eed2578..10b3ac776 100755 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once @@ -11,9 +11,7 @@ #include "common/alignment.h" #include "common/common_funcs.h" #include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/settings.h" -#include "core/memory.h" +#include "video_core/buffer_cache/word_manager.h" namespace VideoCommon { @@ -36,116 +34,14 @@ struct NullBufferParams {}; */ template class BufferBase { - static constexpr u64 PAGES_PER_WORD = 64; - static constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; - static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; - - /// Vector tracking modified pages tightly packed with small vector optimization - union WordsArray { - /// Returns the pointer to the words state - [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { - return is_short ? &stack : heap; - } - - /// Returns the pointer to the words state - [[nodiscard]] u64* Pointer(bool is_short) noexcept { - return is_short ? &stack : heap; - } - - u64 stack = 0; ///< Small buffers storage - u64* heap; ///< Not-small buffers pointer to the storage - }; - - struct Words { - explicit Words() = default; - explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { - if (IsShort()) { - cpu.stack = ~u64{0}; - gpu.stack = 0; - cached_cpu.stack = 0; - untracked.stack = ~u64{0}; - } else { - // Share allocation between CPU and GPU pages and set their default values - const size_t num_words = NumWords(); - u64* const alloc = new u64[num_words * 4]; - cpu.heap = alloc; - gpu.heap = alloc + num_words; - cached_cpu.heap = alloc + num_words * 2; - untracked.heap = alloc + num_words * 3; - std::fill_n(cpu.heap, num_words, ~u64{0}); - std::fill_n(gpu.heap, num_words, 0); - std::fill_n(cached_cpu.heap, num_words, 0); - std::fill_n(untracked.heap, num_words, ~u64{0}); - } - // Clean up tailing bits - const u64 last_word_size = size_bytes % BYTES_PER_WORD; - const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); - const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; - const u64 last_word = (~u64{0} << shift) >> shift; - cpu.Pointer(IsShort())[NumWords() - 1] = last_word; - untracked.Pointer(IsShort())[NumWords() - 1] = last_word; - } - - ~Words() { - Release(); - } - - Words& operator=(Words&& rhs) noexcept { - Release(); - size_bytes = rhs.size_bytes; - cpu = rhs.cpu; - gpu = rhs.gpu; - cached_cpu = rhs.cached_cpu; - untracked = rhs.untracked; - rhs.cpu.heap = nullptr; - return *this; - } - - Words(Words&& rhs) noexcept - : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, - cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { - rhs.cpu.heap = nullptr; - } - - Words& operator=(const Words&) = delete; - Words(const Words&) = delete; - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return size_bytes <= BYTES_PER_WORD; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return Common::DivCeil(size_bytes, BYTES_PER_WORD); - } - - /// Release buffer resources - void Release() { - if (!IsShort()) { - // CPU written words is the base for the heap allocation - delete[] cpu.heap; - } - } - - u64 size_bytes = 0; - WordsArray cpu; - WordsArray gpu; - WordsArray cached_cpu; - WordsArray untracked; - }; - - enum class Type { - CPU, - GPU, - CachedCPU, - Untracked, - }; - public: + static constexpr u64 BASE_PAGE_BITS = 16; + static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; + explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) - : rasterizer{&rasterizer_}, cpu_addr{Common::AlignDown(cpu_addr_, BYTES_PER_PAGE)}, - words(Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BYTES_PER_PAGE)) {} + : cpu_addr{Common::AlignDown(cpu_addr_, BASE_PAGE_SIZE)}, + word_manager(cpu_addr, rasterizer_, + Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE)) {} explicit BufferBase(NullBufferParams) {} @@ -159,94 +55,82 @@ public: [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return word_manager.template ModifiedRegion(offset, query_size); } /// Returns the inclusive GPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return word_manager.template ModifiedRegion(offset, query_size); } /// Returns true if a region has been modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return word_manager.template IsRegionModified(offset, query_size); } /// Returns true if a region has been modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return word_manager.template IsRegionModified(offset, query_size); } /// Mark region as CPU modified, notifying the rasterizer about this change void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as CPU modified, notifying the rasterizer about this change void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the host GPU void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as modified from the host GPU void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the CPU /// but don't mark it as modified until FlusHCachedWrites is called. void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { flags |= BufferFlagBits::CachedWrites; - ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Flushes cached CPU writes, and notify the rasterizer about the deltas void FlushCachedWrites() noexcept { flags &= ~BufferFlagBits::CachedWrites; - const u64 num_words = NumWords(); - u64* const cached_words = Array(); - u64* const untracked_words = Array(); - u64* const cpu_words = Array(); - for (u64 word_index = 0; word_index < num_words; ++word_index) { - const u64 cached_bits = cached_words[word_index]; - NotifyRasterizer(word_index, untracked_words[word_index], cached_bits); - untracked_words[word_index] |= cached_bits; - cpu_words[word_index] |= cached_bits; - if (!Settings::values.use_pessimistic_flushes) { - cached_words[word_index] = 0; - } - } + word_manager.FlushCachedWrites(); } /// Call 'func' for each CPU modified range and unmark those pages as CPU modified template void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, true, func); + word_manager.template ForEachModifiedRange(query_cpu_range, size, true, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, clear, func); + word_manager.template ForEachModifiedRange(query_cpu_range, size, clear, func); } template void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, true, func); + word_manager.template ForEachModifiedRange(query_cpu_range, size, true, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(Func&& func) { - ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); + word_manager.template ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); } /// Mark buffer as picked @@ -297,7 +181,7 @@ public: /// Returns the size in bytes of the buffer [[nodiscard]] u64 SizeBytes() const noexcept { - return words.size_bytes; + return word_manager.SizeBytes(); } size_t getLRUID() const noexcept { @@ -309,301 +193,8 @@ public: } private: - template - u64* Array() noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - template - const u64* Array() const noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - /** - * Change the state of a range of pages - * - * @param dirty_addr Base address to mark or unmark as modified - * @param size Size in bytes to mark or unmark as modified - */ - template - void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { - const s64 difference = dirty_addr - cpu_addr; - const u64 offset = std::max(difference, 0); - size += std::min(difference, 0); - if (offset >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array(); - u64* const state_words = Array(); - const u64 offset_end = std::min(offset + size, SizeBytes()); - const u64 begin_page_index = offset / BYTES_PER_PAGE; - const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; - const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); - const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); - u64 page_index = begin_page_index % PAGES_PER_WORD; - u64 word_index = begin_word_index; - while (word_index < end_word_index) { - const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; - const u64 left_offset = - std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; - const u64 right_offset = page_index; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - NotifyRasterizer(word_index, untracked_words[word_index], bits); - } - if constexpr (enable) { - state_words[word_index] |= bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] |= bits; - } - } else { - state_words[word_index] &= ~bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] &= ~bits; - } - } - page_index = 0; - ++word_index; - } - } - - /** - * Notify rasterizer about changes in the CPU tracking state of a word in the buffer - * - * @param word_index Index to the word to notify to the rasterizer - * @param current_bits Current state of the word - * @param new_bits New state of the word - * - * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages - */ - template - void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { - u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; - VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; - while (changed_bits != 0) { - const int empty_bits = std::countr_zero(changed_bits); - addr += empty_bits * BYTES_PER_PAGE; - changed_bits >>= empty_bits; - - const u32 continuous_bits = std::countr_one(changed_bits); - const u64 size = continuous_bits * BYTES_PER_PAGE; - const VAddr begin_addr = addr; - addr += size; - changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; - rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); - } - } - - /** - * Loop over each page in the given range, turn off those bits and notify the rasterizer if - * needed. Call the given function on each turned off range. - * - * @param query_cpu_range Base CPU address to loop over - * @param size Size in bytes of the CPU range to loop over - * @param func Function to call for each turned off region - */ - template - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { - static_assert(type != Type::Untracked); - - const s64 difference = query_cpu_range - cpu_addr; - const u64 query_begin = std::max(difference, 0); - size += std::min(difference, 0); - if (query_begin >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array(); - u64* const state_words = Array(); - const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); - u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; - u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); - - const auto modified = [](u64 word) { return word != 0; }; - const auto first_modified_word = std::find_if(words_begin, words_end, modified); - if (first_modified_word == words_end) { - // Exit early when the buffer is not modified - return; - } - const auto last_modified_word = std::find_if_not(first_modified_word, words_end, modified); - - const u64 word_index_begin = std::distance(state_words, first_modified_word); - const u64 word_index_end = std::distance(state_words, last_modified_word); - - const unsigned local_page_begin = std::countr_zero(*first_modified_word); - const unsigned local_page_end = - static_cast(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); - const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; - const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; - const u64 query_page_begin = query_begin / BYTES_PER_PAGE; - const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); - const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); - const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); - const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; - const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; - - u64 page_begin = first_word_page_begin; - u64 current_base = 0; - u64 current_size = 0; - bool on_going = false; - for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { - const bool is_last_word = word_index + 1 == word_index_end; - const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; - const u64 right_offset = page_begin; - const u64 left_offset = PAGES_PER_WORD - page_end; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - - const u64 current_word = state_words[word_index] & bits; - if (clear) { - state_words[word_index] &= ~bits; - } - - if constexpr (type == Type::CPU) { - const u64 current_bits = untracked_words[word_index] & bits; - untracked_words[word_index] &= ~bits; - NotifyRasterizer(word_index, current_bits, ~u64{0}); - } - // Exclude CPU modified pages when visiting GPU pages - const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); - u64 page = page_begin; - page_begin = 0; - - while (page < page_end) { - const int empty_bits = std::countr_zero(word >> page); - if (on_going && empty_bits != 0) { - InvokeModifiedRange(func, current_size, current_base); - current_size = 0; - on_going = false; - } - if (empty_bits == PAGES_PER_WORD) { - break; - } - page += empty_bits; - - const int continuous_bits = std::countr_one(word >> page); - if (!on_going && continuous_bits != 0) { - current_base = word_index * PAGES_PER_WORD + page; - on_going = true; - } - current_size += continuous_bits; - page += continuous_bits; - } - } - if (on_going && current_size > 0) { - InvokeModifiedRange(func, current_size, current_base); - } - } - - template - void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { - const u64 current_size_bytes = current_size * BYTES_PER_PAGE; - const u64 offset_begin = current_base * BYTES_PER_PAGE; - const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); - func(offset_begin, offset_end - offset_begin); - } - - /** - * Returns true when a region has been modified - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template - [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array(); - const u64* const state_words = Array(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min(word_begin + num_query_words, NumWords()); - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; - for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); - const u64 local_page_end = page_end % PAGES_PER_WORD; - const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; - if (((word >> page_index) << page_index) << page_end_shift != 0) { - return true; - } - } - return false; - } - - /** - * Returns a begin end pair with the inclusive modified region - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template - [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array(); - const u64* const state_words = Array(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min(word_begin + num_query_words, NumWords()); - const u64 page_base = offset / BYTES_PER_PAGE; - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 begin = std::numeric_limits::max(); - u64 end = 0; - for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 local_page_begin = std::countr_zero(word); - const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); - const u64 page_index = word_index * PAGES_PER_WORD; - const u64 page_begin = std::max(page_index + local_page_begin, page_base); - const u64 page_end = std::min(page_index + local_page_end, page_limit); - begin = std::min(begin, page_begin); - end = std::max(end, page_end); - } - static constexpr std::pair EMPTY{0, 0}; - return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return words.NumWords(); - } - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return words.IsShort(); - } - - RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; - Words words; + WordManager word_manager; BufferFlagBits flags{}; int stream_score = 0; size_t lru_id = SIZE_MAX; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 27e2962e2..ae236be58 100755 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #include "common/microprofile.h" diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d61869b2b..e4a4e78cf 100755 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1,486 +1,29 @@ -// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once #include -#include #include -#include #include -#include -#include -#include -#include - -#include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/literals.h" -#include "common/lru_cache.h" -#include "common/microprofile.h" -#include "common/polyfill_ranges.h" -#include "common/scratch_buffer.h" -#include "common/settings.h" -#include "core/memory.h" -#include "video_core/buffer_cache/buffer_base.h" -#include "video_core/control/channel_state_cache.h" -#include "video_core/delayed_destruction_ring.h" -#include "video_core/dirty_flags.h" -#include "video_core/engines/draw_manager.h" -#include "video_core/engines/kepler_compute.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/slot_vector.h" -#include "video_core/texture_cache/types.h" +#include "video_core/buffer_cache/buffer_cache_base.h" namespace VideoCommon { -MICROPROFILE_DECLARE(GPU_PrepareBuffers); -MICROPROFILE_DECLARE(GPU_BindUploadBuffers); -MICROPROFILE_DECLARE(GPU_DownloadMemory); - -using BufferId = SlotId; - -using VideoCore::Surface::PixelFormat; -using namespace Common::Literals; - -constexpr u32 NUM_VERTEX_BUFFERS = 32; -constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; -constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; -constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; -constexpr u32 NUM_STORAGE_BUFFERS = 16; -constexpr u32 NUM_TEXTURE_BUFFERS = 16; -constexpr u32 NUM_STAGES = 5; - -enum class ObtainBufferSynchronize : u32 { - NoSynchronize = 0, - FullSynchronize = 1, - SynchronizeNoDirty = 2, -}; - -enum class ObtainBufferOperation : u32 { - DoNothing = 0, - MarkAsWritten = 1, - DiscardWrite = 2, - MarkQuery = 3, -}; - -using UniformBufferSizes = std::array, NUM_STAGES>; -using ComputeUniformBufferSizes = std::array; - -template -class BufferCache : public VideoCommon::ChannelSetupCaches { - - // Page size for caching purposes. - // This is unrelated to the CPU page size and it can be changed as it seems optimal. - static constexpr u32 YUZU_PAGEBITS = 16; - static constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS; - - static constexpr bool IS_OPENGL = P::IS_OPENGL; - static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = - P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; - static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = - P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; - static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; - static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; - static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; - static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; - - static constexpr BufferId NULL_BUFFER_ID{0}; - - static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; - static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; - static constexpr s64 TARGET_THRESHOLD = 4_GiB; - - using Maxwell = Tegra::Engines::Maxwell3D::Regs; - - using Runtime = typename P::Runtime; - using Buffer = typename P::Buffer; - - using IntervalSet = boost::icl::interval_set; - using IntervalType = typename IntervalSet::interval_type; - - struct Empty {}; - - struct OverlapResult { - std::vector ids; - VAddr begin; - VAddr end; - bool has_stream_leap = false; - }; - - struct Binding { - VAddr cpu_addr{}; - u32 size{}; - BufferId buffer_id; - }; - - struct TextureBufferBinding : Binding { - PixelFormat format; - }; - - static constexpr Binding NULL_BINDING{ - .cpu_addr = 0, - .size = 0, - .buffer_id = NULL_BUFFER_ID, - }; - -public: - static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast(4_KiB); - - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, Runtime& runtime_); - - void TickFrame(); - - void WriteMemory(VAddr cpu_addr, u64 size); - - void CachedWriteMemory(VAddr cpu_addr, u64 size); - - void DownloadMemory(VAddr cpu_addr, u64 size); - - bool InlineMemory(VAddr dest_address, size_t copy_size, std::span inlined_buffer); - - void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); - - void DisableGraphicsUniformBuffer(size_t stage, u32 index); - - void UpdateGraphicsBuffers(bool is_indexed); - - void UpdateComputeBuffers(); - - void BindHostGeometryBuffers(bool is_indexed); - - void BindHostStageBuffers(size_t stage); - - void BindHostComputeBuffers(); - - void SetUniformBuffersState(const std::array& mask, - const UniformBufferSizes* sizes); - - void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); - - void UnbindGraphicsStorageBuffers(size_t stage); - - void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindGraphicsTextureBuffers(size_t stage); - - void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, - PixelFormat format, bool is_written, bool is_image); - - void UnbindComputeStorageBuffers(); - - void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindComputeTextureBuffers(); - - void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, - bool is_written, bool is_image); - - void FlushCachedWrites(); - - /// Return true when there are uncommitted buffers to be downloaded - [[nodiscard]] bool HasUncommittedFlushes() const noexcept; - - void AccumulateFlushes(); - - /// Return true when the caller should wait for async downloads - [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; - - /// Commit asynchronous downloads - void CommitAsyncFlushes(); - void CommitAsyncFlushesHigh(); - - /// Pop asynchronous downloads - void PopAsyncFlushes(); - - bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); - - bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); - - [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, - ObtainBufferSynchronize sync_info, - ObtainBufferOperation post_op); - - /// Return true when a CPU region is modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); - - /// Return true when a region is registered on the cache - [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); - - /// Return true when a CPU region is modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); - - void SetDrawIndirect( - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { - current_draw_indirect = current_draw_indirect_; - } - - [[nodiscard]] std::pair GetDrawIndirectCount(); - - [[nodiscard]] std::pair GetDrawIndirectBuffer(); - - std::recursive_mutex mutex; - Runtime& runtime; - -private: - template - static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { - for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { - const int disabled_bits = std::countr_zero(enabled_mask); - index += disabled_bits; - enabled_mask >>= disabled_bits; - func(index); - } - } - - template - void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { - const u64 page_end = Common::DivCeil(cpu_addr + size, YUZU_PAGESIZE); - for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId buffer_id = page_table[page]; - if (!buffer_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[buffer_id]; - func(buffer_id, buffer); - - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - } - - template - void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) { - const VAddr start_address = cpu_addr; - const VAddr end_address = start_address + size; - const VAddr search_base = - static_cast(std::min(0LL, static_cast(start_address - size))); - const IntervalType search_interval{search_base, search_base + 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - for (; it != common_ranges.end(); it++) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end); - } - } - - static bool IsRangeGranular(VAddr cpu_addr, size_t size) { - return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == - ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); - } - - void RunGarbageCollector(); - - void BindHostIndexBuffer(); - - void BindHostVertexBuffers(); - - void BindHostDrawIndirectBuffers(); - - void BindHostGraphicsUniformBuffers(size_t stage); - - void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); - - void BindHostGraphicsStorageBuffers(size_t stage); - - void BindHostGraphicsTextureBuffers(size_t stage); - - void BindHostTransformFeedbackBuffers(); - - void BindHostComputeUniformBuffers(); - - void BindHostComputeStorageBuffers(); - - void BindHostComputeTextureBuffers(); - - void DoUpdateGraphicsBuffers(bool is_indexed); - - void DoUpdateComputeBuffers(); - - void UpdateIndexBuffer(); - - void UpdateVertexBuffers(); - - void UpdateVertexBuffer(u32 index); - - void UpdateDrawIndirect(); - - void UpdateUniformBuffers(size_t stage); - - void UpdateStorageBuffers(size_t stage); - - void UpdateTextureBuffers(size_t stage); - - void UpdateTransformFeedbackBuffers(); - - void UpdateTransformFeedbackBuffer(u32 index); - - void UpdateComputeUniformBuffers(); - - void UpdateComputeStorageBuffers(); - - void UpdateComputeTextureBuffers(); - - void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); - - [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); - - [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); - - void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); - - [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); - - void Register(BufferId buffer_id); - - void Unregister(BufferId buffer_id); - - template - void ChangeRegister(BufferId buffer_id); - - void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; - - bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); - - bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); - - void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, - std::span copies); - - void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span copies); - - void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span copies); - - void DownloadBufferMemory(Buffer& buffer_id); - - void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); - - void DeleteBuffer(BufferId buffer_id); - - void NotifyBufferDeletion(); - - [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, - bool is_written = false) const; - - [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, - PixelFormat format); - - [[nodiscard]] std::span ImmediateBufferWithData(VAddr cpu_addr, size_t size); - - [[nodiscard]] std::span ImmediateBuffer(size_t wanted_capacity); - - [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; - - void ClearDownload(IntervalType subtract_interval); - - VideoCore::RasterizerInterface& rasterizer; - Core::Memory::Memory& cpu_memory; - - SlotVector slot_buffers; - DelayedDestructionRing delayed_destruction_ring; - - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; - - u32 last_index_count = 0; - - Binding index_buffer; - std::array vertex_buffers; - std::array, NUM_STAGES> uniform_buffers; - std::array, NUM_STAGES> storage_buffers; - std::array, NUM_STAGES> texture_buffers; - std::array transform_feedback_buffers; - Binding count_buffer_binding; - Binding indirect_buffer_binding; - - std::array compute_uniform_buffers; - std::array compute_storage_buffers; - std::array compute_texture_buffers; - - std::array enabled_uniform_buffer_masks{}; - u32 enabled_compute_uniform_buffer_mask = 0; - - const UniformBufferSizes* uniform_buffer_sizes{}; - const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; - - std::array enabled_storage_buffers{}; - std::array written_storage_buffers{}; - u32 enabled_compute_storage_buffers = 0; - u32 written_compute_storage_buffers = 0; - - std::array enabled_texture_buffers{}; - std::array written_texture_buffers{}; - std::array image_texture_buffers{}; - u32 enabled_compute_texture_buffers = 0; - u32 written_compute_texture_buffers = 0; - u32 image_compute_texture_buffers = 0; - - std::array uniform_cache_hits{}; - std::array uniform_cache_shots{}; - - u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; - - bool has_deleted_buffers = false; - - std::conditional_t, Empty> - dirty_uniform_buffers{}; - std::conditional_t, Empty> fast_bound_uniform_buffers{}; - std::conditional_t, NUM_STAGES>, Empty> - uniform_buffer_binding_sizes{}; - - std::vector cached_write_buffer_ids; - - IntervalSet uncommitted_ranges; - IntervalSet common_ranges; - std::deque committed_ranges; - - Common::ScratchBuffer immediate_buffer_alloc; - - struct LRUItemParams { - using ObjectType = BufferId; - using TickType = u64; - }; - Common::LeastRecentlyUsedCache lru_cache; - u64 frame_tick = 0; - u64 total_used_memory = 0; - u64 minimum_memory = 0; - u64 critical_memory = 0; - - std::array> YUZU_PAGEBITS)> page_table; -}; +using Core::Memory::YUZU_PAGESIZE; template BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, Core::Memory::Memory& cpu_memory_, Runtime& runtime_) - : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_} { + : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, memory_tracker{ + rasterizer} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); + active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + if (!runtime.CanReportMemoryUsage()) { minimum_memory = DEFAULT_EXPECTED_MEMORY; critical_memory = DEFAULT_CRITICAL_MEMORY; @@ -531,6 +74,8 @@ void BufferCache

::TickFrame() { uniform_cache_hits[0] = 0; uniform_cache_shots[0] = 0; + active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; @@ -547,31 +92,51 @@ void BufferCache

::TickFrame() { template void BufferCache

::WriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - buffer.MarkRegionAsCpuModified(cpu_addr, size); - }); + memory_tracker.MarkRegionAsCpuModified(cpu_addr, size); + if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) { + const IntervalType subtract_interval{cpu_addr, cpu_addr + size}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + } } template void BufferCache

::CachedWriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - if (!buffer.HasCachedWrites()) { - cached_write_buffer_ids.push_back(buffer_id); - } - buffer.CachedCpuWrite(cpu_addr, size); - }); + memory_tracker.CachedCpuWrite(cpu_addr, size); + const IntervalType add_interval{Common::AlignDown(cpu_addr, YUZU_PAGESIZE), + Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE)}; + cached_ranges.add(add_interval); } template void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { + WaitOnAsyncFlushes(cpu_addr, size); ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer, cpu_addr, size); }); } +template +void BufferCache

::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) { + bool must_wait = false; + ForEachInOverlapCounter(async_downloads, cpu_addr, size, + [&](VAddr, VAddr, int) { must_wait = true; }); + bool must_release = false; + ForEachInRangeSet(pending_ranges, cpu_addr, size, [&](VAddr, VAddr) { must_release = true; }); + if (must_release) { + std::function tmp([]() {}); + rasterizer.SignalFence(std::move(tmp)); + } + if (must_wait || must_release) { + rasterizer.ReleaseFences(); + } +} + template void BufferCache

::ClearDownload(IntervalType subtract_interval) { + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024); uncommitted_ranges.subtract(subtract_interval); + pending_ranges.subtract(subtract_interval); for (auto& interval_set : committed_ranges) { interval_set.subtract(subtract_interval); } @@ -591,6 +156,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; + WaitOnAsyncFlushes(*cpu_src_address, static_cast(amount)); ClearDownload(subtract_interval); BufferId buffer_a; @@ -616,10 +182,11 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; - uncommitted_ranges.add(add_interval); tmp_intervals.push_back(add_interval); + uncommitted_ranges.add(add_interval); + pending_ranges.add(add_interval); }; - ForEachWrittenRange(*cpu_src_address, amount, mirror); + ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. common_ranges.subtract(subtract_interval); const bool has_new_downloads = tmp_intervals.size() != 0; @@ -628,7 +195,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } runtime.CopyBuffer(dest_buffer, src_buffer, copies); if (has_new_downloads) { - dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); + memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } std::vector tmp_buffer(amount); cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); @@ -866,10 +433,12 @@ void BufferCache

::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add template void BufferCache

::FlushCachedWrites() { - for (const BufferId buffer_id : cached_write_buffer_ids) { - slot_buffers[buffer_id].FlushCachedWrites(); - } cached_write_buffer_ids.clear(); + memory_tracker.FlushCachedWrites(); + for (auto& interval : cached_ranges) { + ClearDownload(interval); + } + cached_ranges.clear(); } template @@ -879,10 +448,6 @@ bool BufferCache

::HasUncommittedFlushes() const noexcept { template void BufferCache

::AccumulateFlushes() { - if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { - uncommitted_ranges.clear(); - return; - } if (uncommitted_ranges.empty()) { return; } @@ -891,7 +456,11 @@ void BufferCache

::AccumulateFlushes() { template bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { - return false; + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + return (!async_buffers.empty() && async_buffers.front().has_value()); + } else { + return false; + } } template @@ -899,12 +468,14 @@ void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional{}); + } return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); - const bool is_accuracy_normal = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; + pending_ranges.clear(); auto it = committed_ranges.begin(); while (it != committed_ranges.end()) { auto& current_intervals = *it; @@ -926,11 +497,12 @@ void BufferCache

::CommitAsyncFlushesHigh() { const std::size_t size = interval.upper() - interval.lower(); const VAddr cpu_addr = interval.lower(); ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - buffer.ForEachDownloadRangeAndClear( - cpu_addr, size, [&](u64 range_offset, u64 range_size) { - if (is_accuracy_normal) { - return; - } + const VAddr buffer_start = buffer.CpuAddr(); + const VAddr buffer_end = buffer_start + buffer.SizeBytes(); + const VAddr new_start = std::max(buffer_start, cpu_addr); + const VAddr new_end = std::min(buffer_end, cpu_addr + size); + memory_tracker.ForEachDownloadRange( + new_start, new_end - new_start, false, [&](u64 cpu_addr_out, u64 range_size) { const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; @@ -944,92 +516,137 @@ void BufferCache

::CommitAsyncFlushesHigh() { buffer_id, }); // Align up to avoid cache conflicts - constexpr u64 align = 8ULL; + constexpr u64 align = 64ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); }; - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - common_ranges.subtract(subtract_interval); + ForEachInRangeSet(common_ranges, cpu_addr_out, range_size, add_download); }); }); } } committed_ranges.clear(); if (downloads.empty()) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional{}); + } return; } - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - // Have in mind the staging buffer offset for the copy - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); - } - runtime.PostCopyBarrier(); - runtime.Finish(); - for (const auto& [copy, buffer_id] : downloads) { - const Buffer& buffer = slot_buffers[buffer_id]; - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + if (active_async_buffers) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector normalized_copies; + IntervalSet new_async_range{}; + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + BufferCopy second_copy{copy}; + Buffer& buffer = slot_buffers[buffer_id]; + second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; + VAddr orig_cpu_addr = static_cast(second_copy.src_offset); + const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; + async_downloads += std::make_pair(base_interval, 1); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); + } + runtime.PostCopyBarrier(); + pending_downloads.emplace_back(std::move(normalized_copies)); + async_buffers.emplace_back(download_staging); } } else { - const std::span immediate_buffer = ImmediateBuffer(largest_copy); - for (const auto& [copy, buffer_id] : downloads) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + // Have in mind the staging buffer offset for the copy + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); + } + runtime.PostCopyBarrier(); + runtime.Finish(); + for (const auto& [copy, buffer_id] : downloads) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + } + } else { + const std::span immediate_buffer = ImmediateBuffer(largest_copy); + for (const auto& [copy, buffer_id] : downloads) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + } } } } template void BufferCache

::CommitAsyncFlushes() { - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - CommitAsyncFlushesHigh(); - } else { - uncommitted_ranges.clear(); - committed_ranges.clear(); + CommitAsyncFlushesHigh(); +} + +template +void BufferCache

::PopAsyncFlushes() { + MICROPROFILE_SCOPE(GPU_DownloadMemory); + PopAsyncBuffers(); +} + +template +void BufferCache

::PopAsyncBuffers() { + if (async_buffers.empty()) { + return; + } + if (!async_buffers.front().has_value()) { + async_buffers.pop_front(); + return; + } + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto& downloads = pending_downloads.front(); + auto& async_buffer = async_buffers.front(); + u8* base = async_buffer->mapped_span.data(); + const size_t base_offset = async_buffer->offset; + for (const auto& copy : downloads) { + const VAddr cpu_addr = static_cast(copy.src_offset); + const u64 dst_offset = copy.dst_offset - base_offset; + const u8* read_mapped_memory = base + dst_offset; + ForEachInOverlapCounter( + async_downloads, cpu_addr, copy.size, [&](VAddr start, VAddr end, int count) { + cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - cpu_addr], + end - start); + if (count == 1) { + const IntervalType base_interval{start, end}; + common_ranges.subtract(base_interval); + } + }); + const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size}; + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1); + } + runtime.FreeDeferredStagingBuffer(*async_buffer); + async_buffers.pop_front(); + pending_downloads.pop_front(); } } -template -void BufferCache

::PopAsyncFlushes() {} - template bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionGpuModified(addr, size)) { - return true; - } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - return false; + bool is_dirty = false; + ForEachInRangeSet(common_ranges, addr, size, [&](VAddr, VAddr) { is_dirty = true; }); + return is_dirty; } template bool BufferCache

::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; - const u64 page_end = Common::DivCeil(end_addr, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { + const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE); + for (u64 page = addr >> PAGE_BITS; page < page_end;) { const BufferId buffer_id = page_table[page]; if (!buffer_id) { ++page; @@ -1041,28 +658,14 @@ bool BufferCache

::IsRegionRegistered(VAddr addr, size_t size) { if (buf_start_addr < end_addr && addr < buf_end_addr) { return true; } - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); + page = Common::DivCeil(end_addr, PAGE_SIZE); } return false; } template bool BufferCache

::IsRegionCpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionCpuModified(addr, size)) { - return true; - } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - return false; + return memory_tracker.IsRegionCpuModified(addr, size); } template @@ -1155,7 +758,7 @@ void BufferCache

::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 TouchBuffer(buffer, binding.buffer_id); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && - !buffer.IsRegionGpuModified(cpu_addr, size); + !memory_tracker.IsRegionGpuModified(cpu_addr, size); if (use_fast_buffer) { if constexpr (IS_OPENGL) { if (runtime.HasFastBufferSubData()) { @@ -1378,27 +981,28 @@ void BufferCache

::UpdateIndexBuffer() { // We have to check for the dirty flags and index count // The index count is currently changed without updating the dirty flags const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - const auto& index_array = draw_state.index_buffer; + const auto& index_buffer_ref = draw_state.index_buffer; auto& flags = maxwell3d->dirty.flags; if (!flags[Dirty::IndexBuffer]) { return; } flags[Dirty::IndexBuffer] = false; - last_index_count = index_array.count; if (!draw_state.inline_index_draw_indexes.empty()) { auto inline_index_size = static_cast(draw_state.inline_index_draw_indexes.size()); index_buffer = Binding{ .cpu_addr = 0, .size = inline_index_size, - .buffer_id = CreateBuffer(0, inline_index_size), + .buffer_id = FindBuffer(0, inline_index_size), }; return; } - const GPUVAddr gpu_addr_begin = index_array.StartAddress(); - const GPUVAddr gpu_addr_end = index_array.EndAddress(); + + const GPUVAddr gpu_addr_begin = index_buffer_ref.StartAddress(); + const GPUVAddr gpu_addr_end = index_buffer_ref.EndAddress(); const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); - const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); + const u32 draw_size = + (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes(); const u32 size = std::min(address_size, draw_size); if (size == 0 || !cpu_addr) { index_buffer = NULL_BINDING; @@ -1434,17 +1038,15 @@ void BufferCache

::UpdateVertexBuffer(u32 index) { const GPUVAddr gpu_addr_begin = array.Address(); const GPUVAddr gpu_addr_end = limit.Address() + 1; const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); - u32 address_size = static_cast( - std::min(gpu_addr_end - gpu_addr_begin, static_cast(std::numeric_limits::max()))); - if (array.enable == 0 || address_size == 0 || !cpu_addr) { + const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); + u32 size = address_size; // TODO: Analyze stride and number of vertices + if (array.enable == 0 || size == 0 || !cpu_addr) { vertex_buffers[index] = NULL_BINDING; return; } if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { - address_size = - static_cast(gpu_memory->MaxContinuousRange(gpu_addr_begin, address_size)); + size = static_cast(gpu_memory->MaxContinuousRange(gpu_addr_begin, size)); } - const u32 size = address_size; // TODO: Analyze stride and number of vertices vertex_buffers[index] = Binding{ .cpu_addr = *cpu_addr, .size = size, @@ -1591,17 +1193,16 @@ void BufferCache

::UpdateComputeTextureBuffers() { template void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsGpuModified(cpu_addr, size); + memory_tracker.MarkRegionAsGpuModified(cpu_addr, size); + + if (memory_tracker.IsRegionCpuModified(cpu_addr, size)) { + SynchronizeBuffer(slot_buffers[buffer_id], cpu_addr, size); + } const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - - const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); - if (!is_async) { - return; - } uncommitted_ranges.add(base_interval); + pending_ranges.add(base_interval); } template @@ -1609,7 +1210,7 @@ BufferId BufferCache

::FindBuffer(VAddr cpu_addr, u32 size) { if (cpu_addr == 0) { return NULL_BUFFER_ID; } - const u64 page = cpu_addr >> YUZU_PAGEBITS; + const u64 page = cpu_addr >> PAGE_BITS; const BufferId buffer_id = page_table[page]; if (!buffer_id) { return CreateBuffer(cpu_addr, size); @@ -1638,9 +1239,8 @@ typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu .has_stream_leap = has_stream_leap, }; } - for (; cpu_addr >> YUZU_PAGEBITS < Common::DivCeil(end, YUZU_PAGESIZE); - cpu_addr += YUZU_PAGESIZE) { - const BufferId overlap_id = page_table[cpu_addr >> YUZU_PAGEBITS]; + for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { + const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; if (!overlap_id) { continue; } @@ -1666,11 +1266,11 @@ typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu // as a stream buffer. Increase the size to skip constantly recreating buffers. has_stream_leap = true; if (expands_right) { - begin -= YUZU_PAGESIZE * 256; + begin -= PAGE_SIZE * 256; cpu_addr = begin; } if (expands_left) { - end += YUZU_PAGESIZE * 256; + end += PAGE_SIZE * 256; } } } @@ -1690,21 +1290,15 @@ void BufferCache

::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, if (accumulate_stream_score) { new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); } - std::vector copies; + boost::container::small_vector copies; const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); - overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = begin, - .dst_offset = dst_base_offset + begin, - .size = range_size, - }); - new_buffer.UnmarkRegionAsCpuModified(begin, range_size); - new_buffer.MarkRegionAsGpuModified(begin, range_size); + copies.push_back(BufferCopy{ + .src_offset = 0, + .dst_offset = dst_base_offset, + .size = overlap.SizeBytes(), }); - if (!copies.empty()) { - runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); - } - DeleteBuffer(overlap_id); + runtime.CopyBuffer(new_buffer, overlap, copies); + DeleteBuffer(overlap_id, true); } template @@ -1718,7 +1312,7 @@ BufferId BufferCache

::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } Register(new_buffer_id); - TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id); + TouchBuffer(new_buffer, new_buffer_id); return new_buffer_id; } @@ -1746,8 +1340,8 @@ void BufferCache

::ChangeRegister(BufferId buffer_id) { } const VAddr cpu_addr_begin = buffer.CpuAddr(); const VAddr cpu_addr_end = cpu_addr_begin + size; - const u64 page_begin = cpu_addr_begin / YUZU_PAGESIZE; - const u64 page_end = Common::DivCeil(cpu_addr_end, YUZU_PAGESIZE); + const u64 page_begin = cpu_addr_begin / PAGE_SIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); for (u64 page = page_begin; page != page_end; ++page) { if constexpr (insert) { page_table[page] = buffer_id; @@ -1766,9 +1360,6 @@ void BufferCache

::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept { template bool BufferCache

::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { - if (buffer.CpuAddr() == 0) { - return true; - } return SynchronizeBufferImpl(buffer, cpu_addr, size); } @@ -1777,10 +1368,11 @@ bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s boost::container::small_vector copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + VAddr buffer_start = buffer.CpuAddr(); + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { copies.push_back(BufferCopy{ .src_offset = total_size_bytes, - .dst_offset = range_offset, + .dst_offset = cpu_addr_out - buffer_start, .size = range_size, }); total_size_bytes += range_size; @@ -1794,6 +1386,51 @@ bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s return false; } +template +bool BufferCache

::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + IntervalSet found_sets{}; + auto make_copies = [&] { + for (auto& interval : found_sets) { + const std::size_t sub_size = interval.upper() - interval.lower(); + const VAddr cpu_addr_ = interval.lower(); + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = cpu_addr_ - buffer.CpuAddr(), + .size = sub_size, + }); + total_size_bytes += sub_size; + largest_copy = std::max(largest_copy, sub_size); + } + const std::span copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); + }; + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr base_adr = cpu_addr_out; + const VAddr end_adr = base_adr + range_size; + const IntervalType add_interval{base_adr, end_adr}; + found_sets.add(add_interval); + }); + if (found_sets.empty()) { + return true; + } + const IntervalType search_interval{cpu_addr, cpu_addr + size}; + auto it = common_ranges.lower_bound(search_interval); + auto it_end = common_ranges.upper_bound(search_interval); + if (it == common_ranges.end()) { + make_copies(); + return false; + } + while (it != it_end) { + found_sets.subtract(*it); + it++; + } + make_copies(); + return false; +} + template void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies) { @@ -1805,39 +1442,45 @@ void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 larg } template -void BufferCache

::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span copies) { - std::span immediate_buffer; - for (const BufferCopy& copy : copies) { - std::span upload_span; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - if (IsRangeGranular(cpu_addr, copy.size)) { - upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); - } else { - if (immediate_buffer.empty()) { - immediate_buffer = ImmediateBuffer(largest_copy); +void BufferCache

::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 largest_copy, + [[maybe_unused]] std::span copies) { + if constexpr (!USE_MEMORY_MAPS) { + std::span immediate_buffer; + for (const BufferCopy& copy : copies) { + std::span upload_span; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + if (IsRangeGranular(cpu_addr, copy.size)) { + upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); + } else { + if (immediate_buffer.empty()) { + immediate_buffer = ImmediateBuffer(largest_copy); + } + cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + upload_span = immediate_buffer.subspan(0, copy.size); } - cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); - upload_span = immediate_buffer.subspan(0, copy.size); + buffer.ImmediateUpload(copy.dst_offset, upload_span); } - buffer.ImmediateUpload(copy.dst_offset, upload_span); } } template -void BufferCache

::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, - std::span copies) { - auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); - const std::span staging_pointer = upload_staging.mapped_span; - for (BufferCopy& copy : copies) { - u8* const src_pointer = staging_pointer.data() + copy.src_offset; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); +void BufferCache

::MappedUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 total_size_bytes, + [[maybe_unused]] std::span copies) { + if constexpr (USE_MEMORY_MAPS) { + auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); + const std::span staging_pointer = upload_staging.mapped_span; + for (BufferCopy& copy : copies) { + u8* const src_pointer = staging_pointer.data() + copy.src_offset; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); - // Apply the staging offset - copy.src_offset += upload_staging.offset; + // Apply the staging offset + copy.src_offset += upload_staging.offset; + } + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } - runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } template @@ -1847,7 +1490,9 @@ bool BufferCache

::InlineMemory(VAddr dest_address, size_t copy_size, if (!is_dirty) { return false; } - if (!IsRegionGpuModified(dest_address, copy_size)) { + VAddr aligned_start = Common::AlignDown(dest_address, YUZU_PAGESIZE); + VAddr aligned_end = Common::AlignUp(dest_address + copy_size, YUZU_PAGESIZE); + if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) { return false; } @@ -1886,30 +1531,31 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si boost::container::small_vector copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - const VAddr buffer_addr = buffer.CpuAddr(); - const auto add_download = [&](VAddr start, VAddr end) { - const u64 new_offset = start - buffer_addr; - const u64 new_size = end - start; - copies.push_back(BufferCopy{ - .src_offset = new_offset, - .dst_offset = total_size_bytes, - .size = new_size, - }); - // Align up to avoid cache conflicts - constexpr u64 align = 256ULL; - constexpr u64 mask = ~(align - 1ULL); - total_size_bytes += (new_size + align - 1) & mask; - largest_copy = std::max(largest_copy, new_size); - }; + memory_tracker.ForEachDownloadRangeAndClear( + cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr buffer_addr = buffer.CpuAddr(); + const auto add_download = [&](VAddr start, VAddr end) { + const u64 new_offset = start - buffer_addr; + const u64 new_size = end - start; + copies.push_back(BufferCopy{ + .src_offset = new_offset, + .dst_offset = total_size_bytes, + .size = new_size, + }); + // Align up to avoid cache conflicts + constexpr u64 align = 64ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; + largest_copy = std::max(largest_copy, new_size); + }; - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); - }); + const VAddr start_address = cpu_addr_out; + const VAddr end_address = start_address + range_size; + ForEachInRangeSet(common_ranges, start_address, range_size, add_download); + const IntervalType subtract_interval{start_address, end_address}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + }); if (total_size_bytes == 0) { return; } @@ -1943,7 +1589,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si } template -void BufferCache

::DeleteBuffer(BufferId buffer_id) { +void BufferCache

::DeleteBuffer(BufferId buffer_id, bool do_not_mark) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { binding.buffer_id = BufferId{}; @@ -1962,8 +1608,10 @@ void BufferCache

::DeleteBuffer(BufferId buffer_id) { std::erase(cached_write_buffer_ids, buffer_id); // Mark the whole buffer as CPU written to stop tracking CPU writes - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + if (!do_not_mark) { + Buffer& buffer = slot_buffers[buffer_id]; + memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + } Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); @@ -2011,7 +1659,7 @@ typename BufferCache

::Binding BufferCache

::StorageBufferBinding(GPUVAddr s LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); return NULL_BINDING; } - const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); + const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, YUZU_PAGESIZE); const Binding binding{ .cpu_addr = *cpu_addr, .size = is_written ? size : static_cast(cpu_end - *cpu_addr), diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h new file mode 100755 index 000000000..acff22d4f --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -0,0 +1,581 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#define BOOST_NO_MT +#include +#undef BOOST_NO_MT +#include +#include +#include +#include +#include +#include +#include + +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "common/literals.h" +#include "common/lru_cache.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "core/memory.h" +#include "video_core/buffer_cache/buffer_base.h" +#include "video_core/control/channel_state_cache.h" +#include "video_core/delayed_destruction_ring.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/draw_manager.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" + +namespace boost { +template +class fast_pool_allocator; +} + +namespace VideoCommon { + +MICROPROFILE_DECLARE(GPU_PrepareBuffers); +MICROPROFILE_DECLARE(GPU_BindUploadBuffers); +MICROPROFILE_DECLARE(GPU_DownloadMemory); + +using BufferId = SlotId; + +using VideoCore::Surface::PixelFormat; +using namespace Common::Literals; + +constexpr u32 NUM_VERTEX_BUFFERS = 32; +constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; +constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; +constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; +constexpr u32 NUM_STORAGE_BUFFERS = 16; +constexpr u32 NUM_TEXTURE_BUFFERS = 16; +constexpr u32 NUM_STAGES = 5; + +using UniformBufferSizes = std::array, NUM_STAGES>; +using ComputeUniformBufferSizes = std::array; + +enum class ObtainBufferSynchronize : u32 { + NoSynchronize = 0, + FullSynchronize = 1, + SynchronizeNoDirty = 2, +}; + +enum class ObtainBufferOperation : u32 { + DoNothing = 0, + MarkAsWritten = 1, + DiscardWrite = 2, + MarkQuery = 3, +}; + +template +class BufferCache : public VideoCommon::ChannelSetupCaches { + // Page size for caching purposes. + // This is unrelated to the CPU page size and it can be changed as it seems optimal. + static constexpr u32 PAGE_BITS = 16; + static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; + static constexpr u32 CPU_PAGE_BITS = 12; + static constexpr u64 CPU_PAGE_SIZE = u64{1} << CPU_PAGE_BITS; + + static constexpr bool IS_OPENGL = P::IS_OPENGL; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = + P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = + P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; + static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; + static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS; + + static constexpr BufferId NULL_BUFFER_ID{0}; + + static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; + static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; + static constexpr s64 TARGET_THRESHOLD = 4_GiB; + + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + using Runtime = typename P::Runtime; + using Buffer = typename P::Buffer; + using Async_Buffer = typename P::Async_Buffer; + using MemoryTracker = typename P::MemoryTracker; + + using IntervalCompare = std::less; + using IntervalInstance = boost::icl::interval_type_default; + using IntervalAllocator = boost::fast_pool_allocator; + using IntervalSet = boost::icl::interval_set; + using IntervalType = typename IntervalSet::interval_type; + + template + struct counter_add_functor : public boost::icl::identity_based_inplace_combine { + // types + typedef counter_add_functor type; + typedef boost::icl::identity_based_inplace_combine base_type; + + // public member functions + void operator()(Type& current, const Type& added) const { + current += added; + if (current < base_type::identity_element()) { + current = base_type::identity_element(); + } + } + + // public static functions + static void version(Type&){}; + }; + + using OverlapCombine = counter_add_functor; + using OverlapSection = boost::icl::inter_section; + using OverlapCounter = boost::icl::split_interval_map; + + struct Empty {}; + + struct OverlapResult { + std::vector ids; + VAddr begin; + VAddr end; + bool has_stream_leap = false; + }; + + struct Binding { + VAddr cpu_addr{}; + u32 size{}; + BufferId buffer_id; + }; + + struct TextureBufferBinding : Binding { + PixelFormat format; + }; + + static constexpr Binding NULL_BINDING{ + .cpu_addr = 0, + .size = 0, + .buffer_id = NULL_BUFFER_ID, + }; + +public: + static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast(4_KiB); + + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, Runtime& runtime_); + + void TickFrame(); + + void WriteMemory(VAddr cpu_addr, u64 size); + + void CachedWriteMemory(VAddr cpu_addr, u64 size); + + void DownloadMemory(VAddr cpu_addr, u64 size); + + bool InlineMemory(VAddr dest_address, size_t copy_size, std::span inlined_buffer); + + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); + + void DisableGraphicsUniformBuffer(size_t stage, u32 index); + + void UpdateGraphicsBuffers(bool is_indexed); + + void UpdateComputeBuffers(); + + void BindHostGeometryBuffers(bool is_indexed); + + void BindHostStageBuffers(size_t stage); + + void BindHostComputeBuffers(); + + void SetUniformBuffersState(const std::array& mask, + const UniformBufferSizes* sizes); + + void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); + + void UnbindGraphicsStorageBuffers(size_t stage); + + void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindGraphicsTextureBuffers(size_t stage); + + void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, + PixelFormat format, bool is_written, bool is_image); + + void UnbindComputeStorageBuffers(); + + void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindComputeTextureBuffers(); + + void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, + bool is_written, bool is_image); + + [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); + void FlushCachedWrites(); + + /// Return true when there are uncommitted buffers to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; + + void AccumulateFlushes(); + + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; + + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + void CommitAsyncFlushesHigh(); + + /// Pop asynchronous downloads + void PopAsyncFlushes(); + void PopAsyncBuffers(); + + bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + + bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + + /// Return true when a region is registered on the cache + [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); + + /// Return true when a CPU region is modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + + void SetDrawIndirect( + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { + current_draw_indirect = current_draw_indirect_; + } + + [[nodiscard]] std::pair GetDrawIndirectCount(); + + [[nodiscard]] std::pair GetDrawIndirectBuffer(); + + std::recursive_mutex mutex; + Runtime& runtime; + +private: + template + static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { + for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { + const int disabled_bits = std::countr_zero(enabled_mask); + index += disabled_bits; + enabled_mask >>= disabled_bits; + func(index); + } + } + + template + void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); + for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); + } + } + + template + void ForEachInRangeSet(IntervalSet& current_range, VAddr cpu_addr, u64 size, Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + + template + void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size, + Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + auto& inter = it->first; + VAddr inter_addr_end = inter.upper(); + VAddr inter_addr = inter.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end, it->second); + } + } + + void RemoveEachInOverlapCounter(OverlapCounter& current_range, + const IntervalType search_interval, int subtract_value) { + bool any_removals = false; + current_range.add(std::make_pair(search_interval, subtract_value)); + do { + any_removals = false; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + if (it->second <= 0) { + any_removals = true; + current_range.erase(it); + break; + } + } + } while (any_removals); + } + + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { + return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == + ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); + } + + void RunGarbageCollector(); + + void WaitOnAsyncFlushes(VAddr cpu_addr, u64 size); + + void BindHostIndexBuffer(); + + void BindHostVertexBuffers(); + + void BindHostDrawIndirectBuffers(); + + void BindHostGraphicsUniformBuffers(size_t stage); + + void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); + + void BindHostGraphicsStorageBuffers(size_t stage); + + void BindHostGraphicsTextureBuffers(size_t stage); + + void BindHostTransformFeedbackBuffers(); + + void BindHostComputeUniformBuffers(); + + void BindHostComputeStorageBuffers(); + + void BindHostComputeTextureBuffers(); + + void DoUpdateGraphicsBuffers(bool is_indexed); + + void DoUpdateComputeBuffers(); + + void UpdateIndexBuffer(); + + void UpdateVertexBuffers(); + + void UpdateVertexBuffer(u32 index); + + void UpdateDrawIndirect(); + + void UpdateUniformBuffers(size_t stage); + + void UpdateStorageBuffers(size_t stage); + + void UpdateTextureBuffers(size_t stage); + + void UpdateTransformFeedbackBuffers(); + + void UpdateTransformFeedbackBuffer(u32 index); + + void UpdateComputeUniformBuffers(); + + void UpdateComputeStorageBuffers(); + + void UpdateComputeTextureBuffers(); + + void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); + + [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); + + void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); + + [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template + void ChangeRegister(BufferId buffer_id); + + void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; + + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); + + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span copies); + + void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span copies); + + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span copies); + + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + + void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false); + + void NotifyBufferDeletion(); + + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, + bool is_written) const; + + [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, + PixelFormat format); + + [[nodiscard]] std::span ImmediateBufferWithData(VAddr cpu_addr, size_t size); + + [[nodiscard]] std::span ImmediateBuffer(size_t wanted_capacity); + + [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; + + void ClearDownload(IntervalType subtract_interval); + + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; + + SlotVector slot_buffers; + DelayedDestructionRing delayed_destruction_ring; + + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; + + u32 last_index_count = 0; + + Binding index_buffer; + std::array vertex_buffers; + std::array, NUM_STAGES> uniform_buffers; + std::array, NUM_STAGES> storage_buffers; + std::array, NUM_STAGES> texture_buffers; + std::array transform_feedback_buffers; + Binding count_buffer_binding; + Binding indirect_buffer_binding; + + std::array compute_uniform_buffers; + std::array compute_storage_buffers; + std::array compute_texture_buffers; + + std::array enabled_uniform_buffer_masks{}; + u32 enabled_compute_uniform_buffer_mask = 0; + + const UniformBufferSizes* uniform_buffer_sizes{}; + const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; + + std::array enabled_storage_buffers{}; + std::array written_storage_buffers{}; + u32 enabled_compute_storage_buffers = 0; + u32 written_compute_storage_buffers = 0; + + std::array enabled_texture_buffers{}; + std::array written_texture_buffers{}; + std::array image_texture_buffers{}; + u32 enabled_compute_texture_buffers = 0; + u32 written_compute_texture_buffers = 0; + u32 image_compute_texture_buffers = 0; + + std::array uniform_cache_hits{}; + std::array uniform_cache_shots{}; + + u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; + + bool has_deleted_buffers = false; + + std::conditional_t, Empty> + dirty_uniform_buffers{}; + std::conditional_t, Empty> fast_bound_uniform_buffers{}; + std::conditional_t, NUM_STAGES>, Empty> + uniform_buffer_binding_sizes{}; + + std::vector cached_write_buffer_ids; + + MemoryTracker memory_tracker; + IntervalSet uncommitted_ranges; + IntervalSet common_ranges; + IntervalSet cached_ranges; + IntervalSet pending_ranges; + std::deque committed_ranges; + + // Async Buffers + OverlapCounter async_downloads; + std::deque> async_buffers; + std::deque> pending_downloads; + std::optional current_buffer; + + // queries + boost::container::small_vector, 8> pending_queries; + std::deque> committed_queries; + boost::container::small_vector flushed_queries; + std::deque> query_async_buffers; + + size_t immediate_buffer_capacity = 0; + Common::ScratchBuffer immediate_buffer_alloc; + + struct LRUItemParams { + using ObjectType = BufferId; + using TickType = u64; + }; + Common::LeastRecentlyUsedCache lru_cache; + u64 frame_tick = 0; + u64 total_used_memory = 0; + u64 minimum_memory = 0; + u64 critical_memory = 0; + + bool active_async_buffers = false; + + std::array> PAGE_BITS)> page_table; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h new file mode 100755 index 000000000..016d8430f --- /dev/null +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -0,0 +1,266 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "common/alignment.h" +#include "common/common_types.h" +#include "video_core/buffer_cache/word_manager.h" + +namespace VideoCommon { + +template +class MemoryTrackerBase { + static constexpr size_t MAX_CPU_PAGE_BITS = 39; + static constexpr size_t HIGHER_PAGE_BITS = 22; + static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; + static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; + static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS); + static constexpr size_t MANAGER_POOL_SIZE = 32; + static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD; + using Manager = WordManager; + +public: + MemoryTrackerBase(RasterizerInterface& rasterizer_) : rasterizer{&rasterizer_} {} + ~MemoryTrackerBase() = default; + + /// Returns the inclusive CPU modified range in a begin end pair + [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion(offset, size); + }); + } + + /// Returns the inclusive GPU modified range in a begin end pair + [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion(offset, size); + }); + } + + /// Returns true if a region has been modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified(offset, size); + }); + } + + /// Returns true if a region has been modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified(offset, size); + }); + } + + /// Mark region as CPU modified, notifying the rasterizer about this change + void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as CPU modified, notifying the rasterizer about this change + void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the host GPU + void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as modified from the host GPU + void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the CPU + /// but don't mark it as modified until FlusHCachedWrites is called. + void CachedCpuWrite(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages( + dirty_cpu_addr, query_size, [this](Manager* manager, u64 offset, size_t size) { + const VAddr cpu_address = manager->GetCpuAddr() + offset; + manager->template ChangeRegionState(cpu_address, size); + cached_pages.insert(static_cast(cpu_address >> HIGHER_PAGE_BITS)); + }); + } + + /// Flushes cached CPU writes, and notify the rasterizer about the deltas + void FlushCachedWrites(VAddr query_cpu_addr, u64 query_size) noexcept { + IteratePages(query_cpu_addr, query_size, + [](Manager* manager, [[maybe_unused]] u64 offset, + [[maybe_unused]] size_t size) { manager->FlushCachedWrites(); }); + } + + void FlushCachedWrites() noexcept { + for (auto id : cached_pages) { + top_tier[id]->FlushCachedWrites(); + } + cached_pages.clear(); + } + + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified + template + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, true, func); + }); + } + + /// Call 'func' for each GPU modified range and unmark those pages as GPU modified + template + void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { + IteratePages(query_cpu_range, query_size, + [&func, clear](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, clear, func); + }); + } + + template + void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, true, func); + }); + } + +private: + template + bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result::type; + static constexpr bool BOOL_BREAK = std::is_same_v; + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + while (remaining_size > 0) { + const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + if (manager) { + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + return false; + } + + template + std::pair IteratePairs(VAddr cpu_address, size_t size, Func&& func) { + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + u64 begin = std::numeric_limits::max(); + u64 end = 0; + while (remaining_size > 0) { + const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + const auto execute = [&] { + auto [new_begin, new_end] = func(manager, page_offset, copy_amount); + if (new_begin != 0 || new_end != 0) { + const u64 base_address = page_index << HIGHER_PAGE_BITS; + begin = std::min(new_begin + base_address, begin); + end = std::max(new_end + base_address, end); + } + }; + if (manager) { + execute(); + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + execute(); + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + if (begin < end) { + return std::make_pair(begin, end); + } else { + return std::make_pair(0ULL, 0ULL); + } + } + + void CreateRegion(std::size_t page_index) { + const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS; + top_tier[page_index] = GetNewManager(base_cpu_addr); + } + + Manager* GetNewManager(VAddr base_cpu_addess) { + const auto on_return = [&] { + auto* new_manager = free_managers.front(); + new_manager->SetCpuAddress(base_cpu_addess); + free_managers.pop_front(); + return new_manager; + }; + if (!free_managers.empty()) { + return on_return(); + } + manager_pool.emplace_back(); + auto& last_pool = manager_pool.back(); + for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) { + new (&last_pool[i]) Manager(0, *rasterizer, HIGHER_PAGE_SIZE); + free_managers.push_back(&last_pool[i]); + } + return on_return(); + } + + std::deque> manager_pool; + std::deque free_managers; + + std::array top_tier{}; + + std::unordered_set cached_pages; + + RasterizerInterface* rasterizer = nullptr; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h new file mode 100755 index 000000000..21729752b --- /dev/null +++ b/src/video_core/buffer_cache/word_manager.h @@ -0,0 +1,476 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include + +#include "common/alignment.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "core/memory.h" + +namespace VideoCommon { + +constexpr u64 PAGES_PER_WORD = 64; +constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; +constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; + +/// Vector tracking modified pages tightly packed with small vector optimization +template +union WordsArray { + /// Returns the pointer to the words state + [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { + return is_short ? stack.data() : heap; + } + + /// Returns the pointer to the words state + [[nodiscard]] u64* Pointer(bool is_short) noexcept { + return is_short ? stack.data() : heap; + } + + std::array stack{}; ///< Small buffers storage + u64* heap; ///< Not-small buffers pointer to the storage +}; + +template +struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { + if (IsShort()) { + cpu.stack.fill(~u64{0}); + gpu.stack.fill(0); + cached_cpu.stack.fill(0); + untracked.stack.fill(~u64{0}); + } else { + const size_t num_words = NumWords(); + // Share allocation between CPU and GPU pages and set their default values + u64* const alloc = new u64[num_words * 4]; + cpu.heap = alloc; + gpu.heap = alloc + num_words; + cached_cpu.heap = alloc + num_words * 2; + untracked.heap = alloc + num_words * 3; + std::fill_n(cpu.heap, num_words, ~u64{0}); + std::fill_n(gpu.heap, num_words, 0); + std::fill_n(cached_cpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); + } + // Clean up tailing bits + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); + const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; + } + + ~Words() { + Release(); + } + + Words& operator=(Words&& rhs) noexcept { + Release(); + size_bytes = rhs.size_bytes; + cpu = rhs.cpu; + gpu = rhs.gpu; + cached_cpu = rhs.cached_cpu; + untracked = rhs.untracked; + rhs.cpu.heap = nullptr; + return *this; + } + + Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, + cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { + rhs.cpu.heap = nullptr; + } + + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return size_bytes <= stack_words * BYTES_PER_WORD; + } + + /// Returns the number of words of the buffer + [[nodiscard]] size_t NumWords() const noexcept { + return Common::DivCeil(size_bytes, BYTES_PER_WORD); + } + + /// Release buffer resources + void Release() { + if (!IsShort()) { + // CPU written words is the base for the heap allocation + delete[] cpu.heap; + } + } + + u64 size_bytes = 0; + WordsArray cpu; + WordsArray gpu; + WordsArray cached_cpu; + WordsArray untracked; +}; + +enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, +}; + +template +class WordManager { +public: + explicit WordManager(VAddr cpu_addr_, RasterizerInterface& rasterizer_, u64 size_bytes) + : cpu_addr{cpu_addr_}, rasterizer{&rasterizer_}, words{size_bytes} {} + + explicit WordManager() = default; + + void SetCpuAddress(VAddr new_cpu_addr) { + cpu_addr = new_cpu_addr; + } + + VAddr GetCpuAddr() const { + return cpu_addr; + } + + /** + * Change the state of a range of pages + * + * @param dirty_addr Base address to mark or unmark as modified + * @param size Size in bytes to mark or unmark as modified + */ + template + void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { + const s64 difference = dirty_addr - cpu_addr; + const u64 offset = std::max(difference, 0); + size += std::min(difference, 0); + if (offset >= SizeBytes() || size < 0) { + return; + } + u64* const untracked_words = Array(); + u64* const state_words = Array(); + const u64 offset_end = std::min(offset + size, SizeBytes()); + const u64 begin_page_index = offset / BYTES_PER_PAGE; + const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; + const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); + const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); + u64 page_index = begin_page_index % PAGES_PER_WORD; + u64 word_index = begin_word_index; + while (word_index < end_word_index) { + const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; + const u64 left_offset = + std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; + const u64 right_offset = page_index; + u64 bits = ~u64{0}; + bits = (bits >> right_offset) << right_offset; + bits = (bits << left_offset) >> left_offset; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer(word_index, untracked_words[word_index], bits); + } + if constexpr (enable) { + state_words[word_index] |= bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] |= bits; + } + } else { + state_words[word_index] &= ~bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] &= ~bits; + } + } + page_index = 0; + ++word_index; + } + } + + /** + * Loop over each page in the given range, turn off those bits and notify the rasterizer if + * needed. Call the given function on each turned off range. + * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { + static_assert(type != Type::Untracked); + + const s64 difference = query_cpu_range - cpu_addr; + const u64 query_begin = std::max(difference, 0); + size += std::min(difference, 0); + if (query_begin >= SizeBytes() || size < 0) { + return; + } + [[maybe_unused]] u64* const untracked_words = Array(); + [[maybe_unused]] u64* const cpu_words = Array(); + u64* const state_words = Array(); + const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); + u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; + u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); + u64 first_page = (query_begin / BYTES_PER_PAGE) % PAGES_PER_WORD; + + const auto modified = [](u64 word) { return word != 0; }; + const auto first_modified_word = std::find_if(words_begin, words_end, modified); + if (first_modified_word == words_end) { + // Exit early when the buffer is not modified + return; + } + if (first_modified_word != words_begin) { + first_page = 0; + } + std::reverse_iterator first_word_reverse(first_modified_word); + std::reverse_iterator last_word_iterator(words_end); + auto last_word_result = std::find_if(last_word_iterator, first_word_reverse, modified); + u64* const last_modified_word = &(*last_word_result) + 1; + + const u64 word_index_begin = std::distance(state_words, first_modified_word); + const u64 word_index_end = std::distance(state_words, last_modified_word); + const unsigned local_page_begin = std::countr_zero(*first_modified_word); + const unsigned local_page_end = + static_cast(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); + const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; + const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; + const u64 query_page_begin = query_begin / BYTES_PER_PAGE; + const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); + const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); + const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); + const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; + const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; + + u64 page_begin = std::max(first_word_page_begin, first_page); + u64 current_base = 0; + u64 current_size = 0; + bool on_going = false; + for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { + const bool is_last_word = word_index + 1 == word_index_end; + const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; + const u64 right_offset = page_begin; + const u64 left_offset = PAGES_PER_WORD - page_end; + u64 bits = ~u64{0}; + bits = (bits >> right_offset) << right_offset; + bits = (bits << left_offset) >> left_offset; + + const u64 current_word = state_words[word_index] & bits; + if (clear) { + state_words[word_index] &= ~bits; + } + + if constexpr (type == Type::CachedCPU) { + NotifyRasterizer(word_index, untracked_words[word_index], current_word); + untracked_words[word_index] |= current_word; + cpu_words[word_index] |= current_word; + } + + if constexpr (type == Type::CPU) { + const u64 current_bits = untracked_words[word_index] & bits; + untracked_words[word_index] &= ~bits; + NotifyRasterizer(word_index, current_bits, ~u64{0}); + } + const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); + u64 page = page_begin; + page_begin = 0; + + while (page < page_end) { + const int empty_bits = std::countr_zero(word >> page); + if (on_going && empty_bits != 0) { + InvokeModifiedRange(func, current_size, current_base); + current_size = 0; + on_going = false; + } + if (empty_bits == PAGES_PER_WORD) { + break; + } + page += empty_bits; + + const int continuous_bits = std::countr_one(word >> page); + if (!on_going && continuous_bits != 0) { + current_base = word_index * PAGES_PER_WORD + page; + on_going = true; + } + current_size += continuous_bits; + page += continuous_bits; + } + } + if (on_going && current_size > 0) { + InvokeModifiedRange(func, current_size, current_base); + } + } + + template + void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { + const u64 current_size_bytes = current_size * BYTES_PER_PAGE; + const u64 offset_begin = current_base * BYTES_PER_PAGE; + const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); + func(cpu_addr + offset_begin, offset_end - offset_begin); + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const u64* const untracked_words = Array(); + const u64* const state_words = Array(); + const u64 num_query_words = size / BYTES_PER_WORD + 1; + const u64 word_begin = offset / BYTES_PER_WORD; + const u64 word_end = std::min(word_begin + num_query_words, NumWords()); + const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); + u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; + for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; + if (word == 0) { + continue; + } + const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); + const u64 local_page_end = page_end % PAGES_PER_WORD; + const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; + if (((word >> page_index) << page_index) << page_end_shift != 0) { + return true; + } + } + return false; + } + + /** + * Returns a begin end pair with the inclusive modified region + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template + [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + const u64* const state_words = Array(); + const u64 num_query_words = size / BYTES_PER_WORD + 1; + const u64 word_begin = offset / BYTES_PER_WORD; + const u64 word_end = std::min(word_begin + num_query_words, NumWords()); + const u64 page_base = offset / BYTES_PER_PAGE; + u64 page_begin = page_base & (PAGES_PER_WORD - 1); + u64 page_end = + Common::DivCeil(offset + size, BYTES_PER_PAGE) - (page_base & ~(PAGES_PER_WORD - 1)); + u64 begin = std::numeric_limits::max(); + u64 end = 0; + for (u64 word_index = word_begin; word_index < word_end; ++word_index) { + const u64 base_mask = (1ULL << page_begin) - 1ULL; + const u64 end_mask = page_end >= PAGES_PER_WORD ? 0ULL : ~((1ULL << page_end) - 1ULL); + const u64 off_word = end_mask | base_mask; + const u64 word = state_words[word_index] & ~off_word; + if (word == 0) { + page_begin = 0; + page_end -= PAGES_PER_WORD; + continue; + } + const u64 local_page_begin = std::countr_zero(word); + const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); + const u64 page_index = word_index * PAGES_PER_WORD; + begin = std::min(begin, page_index + local_page_begin); + end = page_index + local_page_end; + page_begin = 0; + page_end -= PAGES_PER_WORD; + } + static constexpr std::pair EMPTY{0, 0}; + return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; + } + + /// Returns the number of words of the manager + [[nodiscard]] size_t NumWords() const noexcept { + return words.NumWords(); + } + + /// Returns the size in bytes of the manager + [[nodiscard]] u64 SizeBytes() const noexcept { + return words.size_bytes; + } + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return words.IsShort(); + } + + void FlushCachedWrites() noexcept { + const u64 num_words = NumWords(); + u64* const cached_words = Array(); + u64* const untracked_words = Array(); + u64* const cpu_words = Array(); + for (u64 word_index = 0; word_index < num_words; ++word_index) { + const u64 cached_bits = cached_words[word_index]; + NotifyRasterizer(word_index, untracked_words[word_index], cached_bits); + untracked_words[word_index] |= cached_bits; + cpu_words[word_index] |= cached_bits; + cached_words[word_index] = 0; + } + } + +private: + template + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + /** + * Notify rasterizer about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index to the word to notify to the rasterizer + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages + */ + template + void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { + u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; + VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; + while (changed_bits != 0) { + const int empty_bits = std::countr_zero(changed_bits); + addr += empty_bits * BYTES_PER_PAGE; + changed_bits >>= empty_bits; + + const u32 continuous_bits = std::countr_one(changed_bits); + const u64 size = continuous_bits * BYTES_PER_PAGE; + const VAddr begin_addr = addr; + addr += size; + changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; + rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); + } + } + + VAddr cpu_addr = 0; + RasterizerInterface* rasterizer = nullptr; + Words words; +}; + +} // namespace VideoCommon diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 0150457f7..869a2f020 100755 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -267,7 +267,7 @@ size_t Maxwell3D::EstimateIndexBufferSize() { std::numeric_limits::max(), std::numeric_limits::max()}; const size_t byte_size = regs.index_buffer.FormatSizeInBytes(); - const size_t log2_byte_size = Common::Log2Floor64(byte_size); + const size_t log2_byte_size = Common::Log2Ceil64(byte_size); return std::min( memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[log2_byte_size]) / byte_size, diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 2128f8d4b..ae268a870 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -193,17 +193,17 @@ struct GPU::Impl { [[nodiscard]] u64 GetTicks() const { // This values were reversed engineered by fincs from NVN - // The gpu clock is reported in units of 385/625 nanoseconds - constexpr u64 gpu_ticks_num = 384; - constexpr u64 gpu_ticks_den = 625; + // The GPU clock is 614.4 MHz + using NsToGPUTickRatio = std::ratio<614'400'000, std::nano::den>; + static_assert(NsToGPUTickRatio::num == 384 && NsToGPUTickRatio::den == 625); + + u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); - u64 nanoseconds = system.CoreTiming().GetCPUTimeNs().count(); if (Settings::values.use_fast_gpu_time.GetValue()) { nanoseconds /= 256; } - const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; - const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; - return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; + + return nanoseconds * NsToGPUTickRatio::num / NsToGPUTickRatio::den; } [[nodiscard]] bool IsAsync() const { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 0c6df648b..ea40a425e 100755 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -8,6 +8,7 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -200,6 +201,8 @@ private: struct BufferCacheParams { using Runtime = OpenGL::BufferCacheRuntime; using Buffer = OpenGL::Buffer; + using Async_Buffer = u32; + using MemoryTracker = VideoCommon::MemoryTrackerBase; static constexpr bool IS_OPENGL = true; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; @@ -208,6 +211,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; static constexpr bool USE_MEMORY_MAPS = false; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false; }; using BufferCache = VideoCommon::BufferCache; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp new file mode 100755 index 000000000..f15ae8e25 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache; +} diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 732624bba..c63c239f3 100755 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -314,8 +314,12 @@ StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { return staging_pool.Request(size, MemoryUsage::Upload); } -StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { - return staging_pool.Request(size, MemoryUsage::Download); +StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) { + return staging_pool.Request(size, MemoryUsage::Download, deferred); +} + +void BufferCacheRuntime::FreeDeferredStagingBuffer(StagingBufferRef& ref) { + staging_pool.FreeDeferred(ref); } u64 BufferCacheRuntime::GetDeviceLocalMemory() const { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 7e3d03bc0..ea7d4cd4c 100755 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -3,7 +3,8 @@ #pragma once -#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/buffer_cache_base.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -75,7 +76,9 @@ public: [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); - [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false); + + void FreeDeferredStagingBuffer(StagingBufferRef& ref); void PreCopyBarrier(); @@ -142,6 +145,8 @@ private: struct BufferCacheParams { using Runtime = Vulkan::BufferCacheRuntime; using Buffer = Vulkan::Buffer; + using Async_Buffer = Vulkan::StagingBufferRef; + using MemoryTracker = VideoCommon::MemoryTrackerBase; static constexpr bool IS_OPENGL = false; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; @@ -150,6 +155,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; }; using BufferCache = VideoCommon::BufferCache; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp new file mode 100755 index 000000000..f9e271507 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache; +} diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 47ffeaa7f..045cd6b10 100755 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include "common/alignment.h" #include "common/settings.h" @@ -17,15 +18,10 @@ namespace VideoCommon { -using Tegra::Texture::SwizzleSource; -using Tegra::Texture::TextureType; using Tegra::Texture::TICEntry; using Tegra::Texture::TSCEntry; using VideoCore::Surface::GetFormatType; -using VideoCore::Surface::IsCopyCompatible; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; using namespace Common::Literals; @@ -674,7 +670,8 @@ void TextureCache

::CommitAsyncFlushes() { bool any_none_dma = false; for (PendingDownload& download_info : download_ids) { if (download_info.is_swizzle) { - total_size_bytes += slot_images[download_info.object_id].unswizzled_size_bytes; + total_size_bytes += + Common::AlignUp(slot_images[download_info.object_id].unswizzled_size_bytes, 64); any_none_dma = true; download_info.async_buffer_id = last_async_buffer_id; } @@ -868,12 +865,16 @@ std::pair::Image*, BufferImageCopy> TextureCache

::Dm } template -void TextureCache

::DownloadImageIntoBuffer( - typename TextureCache

::Image* image, typename TextureCache

::BufferType buffer, - size_t buffer_offset, std::span copies, GPUVAddr address, size_t size) { +void TextureCache

::DownloadImageIntoBuffer(typename TextureCache

::Image* image, + typename TextureCache

::BufferType buffer, + size_t buffer_offset, + std::span copies, + GPUVAddr address, size_t size) { if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - auto slot = slot_buffer_downloads.insert(address, size); - uncommitted_downloads.emplace_back(false, uncommitted_async_buffers.size(), slot); + const BufferDownload new_buffer_download{address, size}; + auto slot = slot_buffer_downloads.insert(new_buffer_download); + const PendingDownload new_download{false, uncommitted_async_buffers.size(), slot}; + uncommitted_downloads.emplace_back(new_download); auto download_map = runtime.DownloadStagingBuffer(size, true); uncommitted_async_buffers.emplace_back(download_map); std::array buffers{ @@ -2269,7 +2270,8 @@ void TextureCache

::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id) if (new_id) { const ImageViewBase& old_view = slot_image_views[new_id]; if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { - uncommitted_downloads.emplace_back(true, 0, old_view.image_id); + const PendingDownload new_download{true, 0, old_view.image_id}; + uncommitted_downloads.emplace_back(new_download); } } *old_id = new_id; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index aee4d9d2c..8d619f409 100755 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -40,14 +40,9 @@ struct ChannelState; namespace VideoCommon { -using Tegra::Texture::SwizzleSource; using Tegra::Texture::TICEntry; using Tegra::Texture::TSCEntry; -using VideoCore::Surface::GetFormatType; -using VideoCore::Surface::IsCopyCompatible; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using namespace Common::Literals; struct ImageViewInOut {