From 929853321be5e5b8cbe09199eeea05ad2f4b5ba2 Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Sun, 22 Jun 2025 12:06:02 +0300 Subject: [PATCH] liverpool: Flush buffers on Rewind packet Rewind is used when command stream is modified by compute --- src/core/memory.cpp | 2 +- src/video_core/amdgpu/liverpool.cpp | 44 ++++--- src/video_core/buffer_cache/buffer_cache.cpp | 118 +++++++++++++++--- src/video_core/buffer_cache/buffer_cache.h | 13 +- src/video_core/page_manager.cpp | 2 +- .../renderer_vulkan/vk_rasterizer.cpp | 19 ++- .../renderer_vulkan/vk_rasterizer.h | 2 + 7 files changed, 160 insertions(+), 40 deletions(-) diff --git a/src/core/memory.cpp b/src/core/memory.cpp index f70751f3a..02bef9626 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -138,7 +138,7 @@ void MemoryManager::CopySparseMemory(VAddr virtual_addr, u8* dest, u64 size) { bool MemoryManager::TryWriteBacking(void* address, const void* data, u32 num_bytes) { const VAddr virtual_addr = std::bit_cast(address); const auto& vma = FindVMA(virtual_addr)->second; - ASSERT_MSG(vma.Contains(virtual_addr, 0), + ASSERT_MSG(vma.Contains(virtual_addr, num_bytes), "Attempting to access out of bounds memory at address {:#x}", virtual_addr); if (vma.type != VMAType::Direct) { return false; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 464f02e3a..fb0b8d56b 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -638,9 +638,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spansrc_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineData(dma_data->dst_addr_lo, - dma_data->SrcAddress(), - dma_data->NumBytes(), true); + rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress(), + dma_data->NumBytes(), true, false); } else if (dma_data->src_sel == DmaDataSrc::Data && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { @@ -649,14 +648,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spansrc_sel == DmaDataSrc::Gds && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - // LOG_WARNING(Render_Vulkan, "GDS memory read"); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->src_addr_lo, + dma_data->NumBytes(), false, true); } else if ((dma_data->src_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - rasterizer->InlineData(dma_data->DstAddress(), - dma_data->SrcAddress(), - dma_data->NumBytes(), false); + rasterizer->CopyBuffer(dma_data->DstAddress(), + dma_data->SrcAddress(), dma_data->NumBytes(), + false, false); } else { UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); @@ -702,6 +702,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header)); + const u32 flush_size = dcb.size_bytes(); + rasterizer->ReadMemory(flush_addr, flush_size); + } const PM4CmdRewind* rewind = reinterpret_cast(header); while (!rewind->Valid()) { YIELD_GFX(); @@ -872,28 +877,30 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq break; } if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineData(dma_data->dst_addr_lo, &dma_data->data, sizeof(u32), true); + rasterizer->InlineData(dma_data->dst_addr_lo, &dma_data->data, sizeof(u32), + true); } else if ((dma_data->src_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress(), - dma_data->NumBytes(), true); + rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress(), + dma_data->NumBytes(), true, false); } else if (dma_data->src_sel == DmaDataSrc::Data && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - rasterizer->InlineData(dma_data->DstAddress(), &dma_data->data, sizeof(u32), - false); + rasterizer->InlineData(dma_data->DstAddress(), &dma_data->data, + sizeof(u32), false); } else if (dma_data->src_sel == DmaDataSrc::Gds && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - // LOG_WARNING(Render_Vulkan, "GDS memory read"); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->src_addr_lo, + dma_data->NumBytes(), false, true); } else if ((dma_data->src_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - rasterizer->InlineData(dma_data->DstAddress(), - dma_data->SrcAddress(), dma_data->NumBytes(), - false); + rasterizer->CopyBuffer(dma_data->DstAddress(), + dma_data->SrcAddress(), dma_data->NumBytes(), + false, false); } else { UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); @@ -904,6 +911,11 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq break; } case PM4ItOpcode::Rewind: { + if (rasterizer) { + const VAddr flush_addr = VAddr(reinterpret_cast(header)); + const u32 flush_size = acb_dwords * sizeof(u32); + rasterizer->ReadMemory(flush_addr, flush_size); + } const PM4CmdRewind* rewind = reinterpret_cast(header); while (!rewind->Valid()) { YIELD_ASC(vqid); diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 23f9dc0bc..eb672c1b1 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -1,9 +1,10 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later - +#pragma clang optimize off #include #include "common/alignment.h" #include "common/debug.h" +#include "common/div_ceil.h" #include "common/scope_exit.h" #include "common/types.h" #include "core/memory.h" @@ -27,9 +28,8 @@ static constexpr size_t DeviceBufferSize = 128_MB; static constexpr size_t MaxPageFaults = 1024; BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, - Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_, - TextureCache& texture_cache_, PageManager& tracker_) - : instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_}, + AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_, PageManager& tracker_) + : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_}, staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize}, stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, @@ -129,18 +129,19 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s BufferCache::~BufferCache() = default; -void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) { +void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { const bool is_tracked = IsRegionRegistered(device_addr, size); if (is_tracked) { // Mark the page as CPU modified to stop tracking writes. memory_tracker.MarkRegionAsCpuModified(device_addr, size); - - if (unmap) { - return; - } } } +void BufferCache::ReadMemory(VAddr device_addr, u64 size) { + Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)]; + DownloadBufferMemory(buffer, device_addr, size); +} + void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) { boost::container::small_vector copies; u64 total_size_bytes = 0; @@ -155,7 +156,10 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si .dstOffset = total_size_bytes, .size = new_size, }); - total_size_bytes += new_size; + // Align up to avoid cache conflicts + constexpr u64 align = 64ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; }; gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download); gpu_modified_ranges.Subtract(device_addr_out, range_size); @@ -176,7 +180,8 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si for (const auto& copy : copies) { const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset; const u64 dst_offset = copy.dstOffset - offset; - std::memcpy(std::bit_cast(copy_device_addr), download + dst_offset, copy.size); + ASSERT(memory->TryWriteBacking(std::bit_cast(copy_device_addr), + download + dst_offset, copy.size)); } } @@ -296,9 +301,12 @@ void BufferCache::BindIndexBuffer(u32 index_offset) { void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); - if (!is_gds && !IsRegionGpuModified(address, num_bytes)) { - memcpy(std::bit_cast(address), value, num_bytes); - return; + if (!is_gds) { + if (!IsRegionGpuModified(address, num_bytes)) { + memcpy(std::bit_cast(address), value, num_bytes); + } else { + ASSERT(memory->TryWriteBacking(std::bit_cast(address), value, num_bytes)); + } } Buffer* buffer = [&] { if (is_gds) { @@ -310,6 +318,88 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo InlineDataBuffer(*buffer, address, value, num_bytes); } +void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) { + if (!dst_gds && !IsRegionGpuModified(dst, num_bytes)) { + if (!src_gds && !IsRegionGpuModified(src, num_bytes)) { + // Both buffers were not transferred to GPU yet. Can safely copy in host memory. + memcpy(std::bit_cast(dst), std::bit_cast(src), num_bytes); + return; + } + // Without a readback there's nothing we can do with this + // Fallback to creating dst buffer on GPU to at least have this data there + } + const auto [src_buffer, src_offset] = [&] -> std::pair { + if (src_gds) { + return {&gds_buffer, src}; + } + return ObtainBuffer(src, num_bytes, false); + }(); + const auto [dst_buffer, dst_offset] = [&] -> std::pair { + if (dst_gds) { + return {&gds_buffer, dst}; + } + return ObtainBuffer(dst, num_bytes, true); + }(); + const vk::BufferCopy region{ + .srcOffset = src_offset, + .dstOffset = dst_offset, + .size = num_bytes, + }; + const vk::BufferMemoryBarrier2 buf_barriers_before[2] = { + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = dst_buffer->Handle(), + .offset = dst_offset, + .size = num_bytes, + }, + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eTransferRead, + .buffer = src_buffer->Handle(), + .offset = src_offset, + .size = num_bytes, + }, + }; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 2, + .pBufferMemoryBarriers = buf_barriers_before, + }); + cmdbuf.copyBuffer(src_buffer->Handle(), dst_buffer->Handle(), region); + const vk::BufferMemoryBarrier2 buf_barriers_after[2] = { + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, + .buffer = dst_buffer->Handle(), + .offset = dst_offset, + .size = num_bytes, + }, + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eTransferRead, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryWrite, + .buffer = src_buffer->Handle(), + .offset = src_offset, + .size = num_bytes, + }, + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 2, + .pBufferMemoryBarriers = buf_barriers_after, + }); +} + void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); if (!is_gds && !IsRegionRegistered(address, num_bytes)) { diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 651ba84dc..230dd70b0 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -5,7 +5,6 @@ #include #include -#include "common/div_ceil.h" #include "common/slot_vector.h" #include "common/types.h" #include "video_core/buffer_cache/buffer.h" @@ -71,8 +70,7 @@ public: public: explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool, - TextureCache& texture_cache, PageManager& tracker); + AmdGpu::Liverpool* liverpool, TextureCache& texture_cache, PageManager& tracker); ~BufferCache(); /// Returns a pointer to GDS device local buffer. @@ -110,7 +108,10 @@ public: } /// Invalidates any buffer in the logical page range. - void InvalidateMemory(VAddr device_addr, u64 size, bool unmap); + void InvalidateMemory(VAddr device_addr, u64 size); + + /// Waits on pending downloads in the logical page range. + void ReadMemory(VAddr device_addr, u64 size); /// Binds host vertex buffers for the current draw. void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline); @@ -121,6 +122,9 @@ public: /// Writes a value to GPU buffer. (uses command buffer to temporarily store the data) void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); + /// Performs buffer to buffer data copy on the GPU. + void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); + /// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data) void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds); @@ -193,7 +197,6 @@ private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; - Vulkan::Rasterizer& rasterizer; AmdGpu::Liverpool* liverpool; Core::MemoryManager* memory; TextureCache& texture_cache; diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp index 145779070..268aa9a13 100644 --- a/src/video_core/page_manager.cpp +++ b/src/video_core/page_manager.cpp @@ -1,6 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later - +#pragma clang optimize off #include #include "common/assert.h" #include "common/debug.h" diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 9dea5ceea..de90a4c3a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) { Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, AmdGpu::Liverpool* liverpool_) : instance{instance_}, scheduler{scheduler_}, page_manager{this}, - buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager}, + buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager}, texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} { if (!Config::nullGpu()) { @@ -947,6 +947,10 @@ void Rasterizer::InlineData(VAddr address, const void* value, u32 num_bytes, boo buffer_cache.InlineData(address, value, num_bytes, is_gds); } +void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) { + buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds); +} + u32 Rasterizer::ReadDataFromGds(u32 gds_offset) { auto* gds_buf = buffer_cache.GetGdsBuffer(); u32 value; @@ -959,11 +963,20 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) { // Not GPU mapped memory, can skip invalidation logic entirely. return false; } - buffer_cache.InvalidateMemory(addr, size, false); + buffer_cache.InvalidateMemory(addr, size); texture_cache.InvalidateMemory(addr, size); return true; } +bool Rasterizer::ReadMemory(VAddr addr, u64 size) { + if (!IsMapped(addr, size)) { + // Not GPU mapped memory, can skip invalidation logic entirely. + return false; + } + buffer_cache.ReadMemory(addr, size); + return true; +} + bool Rasterizer::IsMapped(VAddr addr, u64 size) { if (size == 0) { // There is no memory, so not mapped. @@ -984,7 +997,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) { } void Rasterizer::UnmapMemory(VAddr addr, u64 size) { - buffer_cache.InvalidateMemory(addr, size, true); + buffer_cache.ReadMemory(addr, size); texture_cache.UnmapMemory(addr, size); page_manager.OnGpuUnmap(addr, size); { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index fb9ca4bbe..c570ea368 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -56,8 +56,10 @@ public: bool from_guest = false); void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); + void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); u32 ReadDataFromGds(u32 gsd_offset); bool InvalidateMemory(VAddr addr, u64 size); + bool ReadMemory(VAddr addr, u64 size); bool IsMapped(VAddr addr, u64 size); void MapMemory(VAddr addr, u64 size); void UnmapMemory(VAddr addr, u64 size);