From 01a0e00dbbb6eff442d5a921b249b2d38b6f2b5d Mon Sep 17 00:00:00 2001 From: Lander Gallastegi Date: Mon, 5 May 2025 22:23:29 +0200 Subject: [PATCH] Sync the whole buffer instead of only the range --- src/video_core/buffer_cache/buffer_cache.cpp | 14 +++++++------- src/video_core/buffer_cache/buffer_cache.h | 2 +- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 5 +++-- src/video_core/renderer_vulkan/vk_scheduler.cpp | 16 +++++++++++----- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 70d45ad3e..d87f8f0ab 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -909,18 +909,18 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, return true; } -void BufferCache::SynchronizeRange(VAddr device_addr, u64 size) { +void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) { if (device_addr == 0) { return; } VAddr device_addr_end = device_addr + size; ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - VAddr buffer_start = buffer.CpuAddr(); - VAddr buffer_end = buffer_start + buffer.SizeBytes(); - VAddr start = std::max(buffer_start, device_addr); - VAddr end = std::min(buffer_end, device_addr_end); - u32 size = static_cast<u32>(end - start); - SynchronizeBuffer(buffer, start, size, false); + // Note that this function synchronizes the whole buffer, not just the range. + // This is because this function is used to sync buffers before using a + // shader that uses DMA. + // The ideal solution would be to sync all the mapped regions but it is + // very slow. 
+ SynchronizeBuffer(buffer, buffer.CpuAddr(), buffer.SizeBytes(), false); }); } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 9da7bd804..335764183 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -137,7 +137,7 @@ public: void ProcessFaultBuffer(); /// Synchronizes all buffers in the specified range. - void SynchronizeRange(VAddr device_addr, u64 size); + void SynchronizeBuffersInRange(VAddr device_addr, u64 size); /// Record memory barrier. Used for buffers when accessed via BDA. void MemoryBarrier(); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d789912bb..78b88d21d 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -474,14 +474,15 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { pipeline->BindResources(set_writes, buffer_barriers, push_data); - if (uses_dma && !fault_process_pending) { + if (uses_dma) { // We only use fault buffer for DMA right now. 
{ std::shared_lock lock{dma_sync_mapped_ranges_mutex}; for (const auto& range : dma_sync_mapped_ranges) { - buffer_cache.SynchronizeRange(range.lower(), range.upper() - range.lower()); + buffer_cache.SynchronizeBuffersInRange(range.lower(), range.upper() - range.lower()); } } + buffer_cache.MemoryBarrier(); } fault_process_pending |= uses_dma; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 46bc573de..d20862a64 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -91,11 +91,17 @@ void Scheduler::Wait(u64 tick) { } master_semaphore.Wait(tick); - // Apply pending operations until the wait tick - while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) { - pending_ops.front().callback(); - pending_ops.pop(); - } + // TODO: We should be applying pending operations here because that gives us + // the ability to use mapped regions on stream buffers in deferred operations. + // We don't do that right now because it might introduce variations in the + // timing and, since we don't sync the GPU, some games might be affected by that. + // It shouldn't be an issue right now, because we only use mapped regions in + // deferred operations to download faulted addresses. That is only 8KB every tick + // and the stream buffer is 256MB. The GPU doesn't fall that far behind. + // while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) { + // pending_ops.front().callback(); + // pending_ops.pop(); + // } } void Scheduler::AllocateWorkerCommandBuffers() {