buffer_cache: Use separate thread for handling downloads

This commit is contained in:
IndecisiveTurtle 2025-06-24 01:30:02 +03:00
parent 0d83fbf61b
commit 1af9e17176
5 changed files with 104 additions and 22 deletions

View File

@ -166,14 +166,21 @@ void EmitGetGotoVariable(EmitContext&) {
using PointerType = EmitContext::PointerType;
// Emits SPIR-V for an IR ReadConst: a load from the flattened user-data
// ("flatbuf") constant buffer.
// NOTE(review): this span is rendered from a diff hunk with the +/- prefixes
// stripped — it merges the removed implementation (direct OpAccessChain/OpLoad
// from the flatbuf buffer) with the added one (dispatch to the read_const /
// read_const_dynamic helper functions). As shown, everything after the first
// `return` is unreachable; verify against the applied tree.
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
// Immediate dword offset baked into the instruction flags; 0 means the
// offset is dynamic (not known at compile time).
const u32 flatbuf_off_dw = inst->Flags<u32>();
const u32 flatbuf_offset = inst->Flags<u32>();
// The flatbuf buffer is expected to be the last registered buffer binding.
const auto& flatbuf_buffer{ctx.buffers.back()};
ASSERT(flatbuf_buffer.binding >= 0 && flatbuf_buffer.buffer_type == BufferType::Flatbuf);
const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
const auto ptr{ctx.OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, ctx.u32_zero_value,
ctx.ConstU32(flatbuf_offset))};
return ctx.OpLoad(ctx.U32[1], ptr);
// We can only provide a fallback for immediate offsets.
if (flatbuf_off_dw == 0) {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
} else {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
ctx.ConstU32(flatbuf_off_dw));
}
// NOTE(review): commented-out duplicate of the dispatch above — likely the
// other side of the diff; confirm which form the applied tree keeps.
// if (flatbuf_off_dw == 0) {
// return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
//} else {
// return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
// ctx.ConstU32(flatbuf_off_dw));
//}
}
template <PointerType type>

View File

@ -136,6 +136,10 @@ void CollectShaderInfoPass(IR::Program& program) {
}
}
program.info.readconst_types = Info::ReadConstType::None;
program.info.dma_types = IR::Type::Void;
return;
if (program.info.dma_types != IR::Type::Void) {
program.info.buffers.push_back({
.used_types = IR::Type::U64,

View File

@ -421,8 +421,8 @@ struct PM4CmdEventWriteEop {
PM4Type3Header header;
union {
u32 event_control;
BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
BitField<8, 4, u32> event_index; ///< Event index
BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
BitField<8, 4, u32> event_index; ///< Event index
};
u32 address_lo;
union {

View File

@ -7,6 +7,7 @@
#include "common/alignment.h"
#include "common/debug.h"
#include "common/div_ceil.h"
#include "common/thread.h"
#include "common/types.h"
#include "core/memory.h"
#include "video_core/amdgpu/liverpool.h"
@ -27,7 +28,7 @@ static constexpr size_t UboStreamBufferSize = 128_MB;
static constexpr size_t DownloadBufferSize = 128_MB;
static constexpr size_t DeviceBufferSize = 128_MB;
static constexpr size_t MaxPageFaults = 1024;
static constexpr size_t DownloadSizeThreshold = 1_MB;
static constexpr size_t DownloadSizeThreshold = 512_KB;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
@ -128,16 +129,24 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
"Fault Buffer Parser Pipeline");
instance.GetDevice().destroyShaderModule(module);
async_download_thread = std::jthread{std::bind_front(&BufferCache::DownloadThread, this)};
}
BufferCache::~BufferCache() = default;
// Invalidates a CPU-written range: stops GPU-side write tracking for it.
// NOTE(review): diff-hunk rendering — the `if (is_tracked)` branch below is
// the removed pre-change code and overlaps the added early-return form; as
// shown the braces do not balance. Review against the applied tree.
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
const bool is_tracked = IsRegionRegistered(device_addr, size);
if (is_tracked) {
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
if (!is_tracked) {
return;
}
// Wait for any pending downloads to this page.
// NOTE(review): only the first page of the region is consulted here, but
// CommitPendingDownloads tags every page in [page_begin, page_end) with a
// target tick — confirm a multi-page invalidation cannot race a pending
// download targeting a later page.
const u64 target_tick = page_table[device_addr >> CACHING_PAGEBITS].target_tick;
WaitForTargetTick(target_tick);
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
}
void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
@ -215,6 +224,11 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
pending_download_ranges.ForEach([&](VAddr interval_lower, VAddr interval_upper) {
const std::size_t size = interval_upper - interval_lower;
const VAddr device_addr = interval_lower;
const u64 page_begin = device_addr >> CACHING_PAGEBITS;
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = page_begin; page != page_end; ++page) {
page_table[page].target_tick = current_download_tick;
}
ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
const VAddr buffer_start = buffer.CpuAddr();
const VAddr buffer_end = buffer_start + buffer.SizeBytes();
@ -257,14 +271,14 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
Buffer& buffer = slot_buffers[buffer_id];
cmdbuf.copyBuffer(buffer.Handle(), download_buffer.Handle(), buffer_copies);
}
scheduler.DeferOperation([this, download, offset, copies]() {
const auto writeback_host = [this, download, offset, copies = std::move(copies)]() {
auto* memory = Core::Memory::Instance();
for (auto it = copies.begin(); it != copies.end(); ++it) {
auto& buffer_copies = it.value();
const BufferId buffer_id = it.key();
Buffer& buffer = slot_buffers[buffer_id];
const VAddr buffer_base = slot_buffers[buffer_id].CpuAddr();
for (auto& copy : buffer_copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const VAddr copy_device_addr = buffer_base + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
if (!memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr),
download + dst_offset, copy.size)) {
@ -273,12 +287,18 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
}
}
}
});
if (wait_done) {
scheduler.Finish();
} else {
scheduler.Flush();
};
{
std::scoped_lock lk{queue_mutex};
async_downloads.emplace(std::move(writeback_host), scheduler.CurrentTick(),
current_download_tick);
}
queue_cv.notify_one();
scheduler.Flush();
if (wait_done) {
WaitForTargetTick(current_download_tick);
}
++current_download_tick;
return true;
}
@ -1233,4 +1253,29 @@ void BufferCache::DeleteBuffer(BufferId buffer_id) {
buffer.is_deleted = true;
}
// Worker loop of async_download_thread. Pops queued downloads, waits for the
// GPU copy submission to retire, runs the host-writeback callback, then
// publishes the download's tick so waiters (WaitForTargetTick, and through it
// InvalidateMemory / CommitPendingDownloads with wait_done) can proceed.
// Exits when the jthread's stop token is signaled.
void BufferCache::DownloadThread(std::stop_token stoken) {
// NOTE(review): thread name appears copy-pasted from the command processor;
// consider a dedicated name such as "shadPS4:BufferCacheDownload".
Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
while (!stoken.stop_requested()) {
PendingDownload download;
{
// Sleep until work is queued or a stop is requested; the predicate
// guards against spurious wakeups.
std::unique_lock lk{queue_mutex};
Common::CondvarWait(queue_cv, lk, stoken, [this] { return !async_downloads.empty(); });
if (stoken.stop_requested()) {
break;
}
// Take ownership of the front entry while still holding the queue lock.
download = std::move(async_downloads.front());
async_downloads.pop();
}
// Wait for GPU to complete its work and writeback data to host
scheduler.Wait(download.gpu_tick);
download.callback();
// Signal completion of download
download_tick.store(download.signal_tick);
download_tick.notify_all();
}
}
} // namespace VideoCore

View File

@ -3,10 +3,15 @@
#pragma once
#include <atomic>
#include <condition_variable>
#include <shared_mutex>
#include <thread>
#include <boost/container/small_vector.hpp>
#include <queue>
#include "common/slot_vector.h"
#include "common/types.h"
#include "common/unique_function.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/buffer_cache/memory_tracker.h"
#include "video_core/buffer_cache/range_set.h"
@ -51,7 +56,7 @@ public:
// Per-page bookkeeping for the buffer cache page table.
// NOTE(review): diff hunk — `fence_tick` is the removed member and
// `target_tick` its replacement; the applied tree likely has only one of them.
struct PageData {
BufferId buffer_id{};
u64 fence_tick;
// Download tick this page must reach (via WaitForTargetTick) before CPU
// modification of the page is safe; set by CommitPendingDownloads.
u64 target_tick{};
};
struct Traits {
@ -176,6 +181,14 @@ private:
return !buffer_id || slot_buffers[buffer_id].is_deleted;
}
/// Blocks the caller until the download thread has published completion of
/// all downloads up to and including target_tick (download_tick >= target).
inline void WaitForTargetTick(u64 target_tick) {
    for (u64 observed = download_tick.load(); observed < target_tick;
         observed = download_tick.load()) {
        // Atomic wait: sleeps until download_tick changes from `observed`,
        // then the loop re-checks against the target.
        download_tick.wait(observed);
    }
}
void DownloadBufferMemory(const Buffer& buffer, VAddr device_addr, u64 size);
[[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@ -201,6 +214,8 @@ private:
void DeleteBuffer(BufferId buffer_id);
void DownloadThread(std::stop_token token);
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
AmdGpu::Liverpool* liverpool;
@ -224,6 +239,17 @@ private:
vk::UniqueDescriptorSetLayout fault_process_desc_layout;
vk::UniquePipeline fault_process_pipeline;
vk::UniquePipelineLayout fault_process_pipeline_layout;
// Thread that performs host writeback of completed GPU downloads.
std::jthread async_download_thread;
// A download batch whose GPU copies have been recorded and which is pending
// host writeback on the download thread.
struct PendingDownload {
// Writes the staging data back to guest memory (runs on the download thread).
Common::UniqueFunction<void> callback;
// Scheduler tick of the submission containing the buffer->staging copies.
u64 gpu_tick;
// Value published to download_tick once the writeback has completed.
u64 signal_tick;
};
// queue_mutex guards async_downloads; queue_cv wakes the download thread.
std::mutex queue_mutex;
std::condition_variable_any queue_cv;
std::queue<PendingDownload> async_downloads;
// Tick assigned to the next batch committed by CommitPendingDownloads.
u64 current_download_tick{0};
// Last completed download tick, advanced and notified by DownloadThread.
// NOTE(review): initialized above current_download_tick (1 vs 0), so waits on
// the first tick(s) are satisfied immediately — looks like an off-by-one;
// confirm the intended initial values.
std::atomic<u64> download_tick{1};
};
} // namespace VideoCore