Move presentation to separate thread/improve sync (#303)

* video_out: Move presentation to separate thread
* liverpool: Better sync for CPU flips
* driver: Make flip blocking
* videoout: Proper flip rate and vblank management
* config: Add vblank divider option
* clang format
* videoout: added `sceVideoOutWaitVblank`
* clang format
* vk_scheduler: Silly merge conflict
* externals: Add renderdoc API
* clang format
* reuse
* rdoc: manual capture trigger
* clang fmt

---------

Co-authored-by: psucien <168137814+psucien@users.noreply.github.com>
2025-12-10 21:58:45 +00:00 · 2024-07-28 16:54:09 +03:00
parent 361412031c
commit 0d6edaa0a0
32 changed files with 1259 additions and 224 deletions
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -5,8 +5,10 @@
 #include "common/debug.h"
 #include "common/polyfill_thread.h"
 #include "common/thread.h"
+#include "core/libraries/videoout/driver.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/amdgpu/pm4_cmds.h"
+#include "video_core/renderdoc.h"
 #include "video_core/renderer_vulkan/vk_rasterizer.h"

 namespace AmdGpu {
@@ -32,12 +34,15 @@ void Liverpool::Process(std::stop_token stoken) {
    while (!stoken.stop_requested()) {
        {
            std::unique_lock lk{submit_mutex};
-            Common::CondvarWait(submit_cv, lk, stoken, [this] { return num_submits != 0; });
+            Common::CondvarWait(submit_cv, lk, stoken,
+                                [this] { return num_submits != 0 || submit_done; });
        }
        if (stoken.stop_requested()) {
            break;
        }

+        VideoCore::StartCapture();
+
        int qid = -1;

        while (num_submits) {
@@ -48,11 +53,9 @@ void Liverpool::Process(std::stop_token stoken) {
            Task::Handle task{};
            {
                std::scoped_lock lock{queue.m_access};
-
                if (queue.submits.empty()) {
                    continue;
                }
-
                task = queue.submits.front();
            }
            task.resume();
@@ -64,9 +67,20 @@ void Liverpool::Process(std::stop_token stoken) {
                queue.submits.pop();

                --num_submits;
+                std::scoped_lock lock2{submit_mutex};
+                submit_cv.notify_all();
            }
        }

+        if (submit_done) {
+            VideoCore::EndCapture();
+
+            if (rasterizer) {
+                rasterizer->Flush();
+            }
+            submit_done = false;
+        }
+
        Platform::IrqC::Instance()->Signal(Platform::InterruptId::GpuIdle);
    }
 }
@@ -365,8 +379,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
            const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
            ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
            const u32 data_size = (header->type3.count.Value() - 2) * 4;
+            u64* address = write_data->Address<u64*>();
            if (!write_data->wr_one_addr.Value()) {
-                std::memcpy(write_data->Address<void*>(), write_data->data, data_size);
+                std::memcpy(address, write_data->data, data_size);
            } else {
                UNREACHABLE();
            }
@@ -379,6 +394,14 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
        case PM4ItOpcode::WaitRegMem: {
            const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
            ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
+            // Optimization: VO label waits are special because the emulator
+            // will write to the label when presentation is finished. So if
+            // there are no other submits to yield to we can sleep the thread
+            // instead and allow other tasks to run.
+            const u64* wait_addr = wait_reg_mem->Address<u64*>();
+            if (vo_port->IsVoLabel(wait_addr) && num_submits == 1) {
+                vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); });
+            }
            while (!wait_reg_mem->Test()) {
                TracyFiberLeave;
                co_yield {};
@@ -511,7 +534,7 @@ void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {

    auto task = ProcessGraphics(dcb, ccb);
    {
-        std::unique_lock lock{queue.m_access};
+        std::scoped_lock lock{queue.m_access};
        queue.submits.emplace(task.handle);
    }

@@ -526,7 +549,7 @@ void Liverpool::SubmitAsc(u32 vqid, std::span<const u32> acb) {

    const auto& task = ProcessCompute(acb, vqid);
    {
-        std::unique_lock lock{queue.m_access};
+        std::scoped_lock lock{queue.m_access};
        queue.submits.emplace(task.handle);
    }

--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -6,6 +6,7 @@
 #include <array>
 #include <condition_variable>
 #include <coroutine>
+#include <functional>
 #include <mutex>
 #include <span>
 #include <thread>
@@ -21,6 +22,10 @@ namespace Vulkan {
 class Rasterizer;
 }

+namespace Libraries::VideoOut {
+struct VideoOutPort;
+}
+
 namespace AmdGpu {

 #define GFX6_3D_REG_INDEX(field_name) (offsetof(AmdGpu::Liverpool::Regs, field_name) / sizeof(u32))
@@ -991,10 +996,25 @@ public:
    void SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb);
    void SubmitAsc(u32 vqid, std::span<const u32> acb);

+    void SubmitDone() noexcept {
+        std::scoped_lock lk{submit_mutex};
+        submit_done = true;
+        submit_cv.notify_one();
+    }
+
+    void WaitGpuIdle() noexcept {
+        std::unique_lock lk{submit_mutex};
+        submit_cv.wait(lk, [this] { return num_submits == 0; });
+    }
+
    bool IsGpuIdle() const {
        return num_submits == 0;
    }

+    void SetVoPort(Libraries::VideoOut::VideoOutPort* port) {
+        vo_port = port;
+    }
+
    void BindRasterizer(Vulkan::Rasterizer* rasterizer_) {
        rasterizer = rasterizer_;
    }
@@ -1059,8 +1079,10 @@ private:
    } cblock{};

    Vulkan::Rasterizer* rasterizer{};
+    Libraries::VideoOut::VideoOutPort* vo_port{};
    std::jthread process_thread{};
    std::atomic<u32> num_submits{};
+    std::atomic<bool> submit_done{};
    std::mutex submit_mutex;
    std::condition_variable_any submit_cv;
 };
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@@ -404,8 +404,9 @@ struct PM4CmdWaitRegMem {
    u32 mask;
    u32 poll_interval;

-    u32* Address() const {
-        return reinterpret_cast<u32*>((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo);
+    template <typename T = u32*>
+    T Address() const {
+        return reinterpret_cast<T>((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo);
    }

    bool Test() const {
@@ -464,8 +465,8 @@ struct PM4CmdWriteData {
    }

    template <typename T>
-    T* Address() const {
-        return reinterpret_cast<T*>(addr64);
+    T Address() const {
+        return reinterpret_cast<T>(addr64);
    }
 };

@@ -494,8 +495,9 @@ struct PM4CmdEventWriteEos {
        BitField<16, 16, u32> size; ///< Number of DWs to read from the GDS
    };

-    u32* Address() const {
-        return reinterpret_cast<u32*>(address_lo | u64(address_hi) << 32);
+    template <typename T = u32*>
+    T Address() const {
+        return reinterpret_cast<T>(address_lo | u64(address_hi) << 32);
    }

    u32 DataDWord() const {