Move presentation to separate thread/improve sync (#303)

* video_out: Move presentation to separate thread

* liverpool: Better sync for CPU flips

* driver: Make flip blocking

* videoout: Proper flip rate and vblank management

* config: Add vblank divider option

* clang format

* videoout: added `sceVideoOutWaitVblank`

* clang format

* vk_scheduler: Silly merge conflict

* externals: Add renderdoc API

* clang format

* reuse

* rdoc: manual capture trigger

* clang fmt

---------

Co-authored-by: psucien <168137814+psucien@users.noreply.github.com>
This commit is contained in:
TheTurtle
2024-07-28 16:54:09 +03:00
committed by GitHub
parent 361412031c
commit 0d6edaa0a0
32 changed files with 1259 additions and 224 deletions

View File

@@ -5,8 +5,10 @@
#include "common/debug.h"
#include "common/polyfill_thread.h"
#include "common/thread.h"
#include "core/libraries/videoout/driver.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/amdgpu/pm4_cmds.h"
#include "video_core/renderdoc.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
namespace AmdGpu {
@@ -32,12 +34,15 @@ void Liverpool::Process(std::stop_token stoken) {
while (!stoken.stop_requested()) {
{
std::unique_lock lk{submit_mutex};
Common::CondvarWait(submit_cv, lk, stoken, [this] { return num_submits != 0; });
Common::CondvarWait(submit_cv, lk, stoken,
[this] { return num_submits != 0 || submit_done; });
}
if (stoken.stop_requested()) {
break;
}
VideoCore::StartCapture();
int qid = -1;
while (num_submits) {
@@ -48,11 +53,9 @@ void Liverpool::Process(std::stop_token stoken) {
Task::Handle task{};
{
std::scoped_lock lock{queue.m_access};
if (queue.submits.empty()) {
continue;
}
task = queue.submits.front();
}
task.resume();
@@ -64,9 +67,20 @@ void Liverpool::Process(std::stop_token stoken) {
queue.submits.pop();
--num_submits;
std::scoped_lock lock2{submit_mutex};
submit_cv.notify_all();
}
}
if (submit_done) {
VideoCore::EndCapture();
if (rasterizer) {
rasterizer->Flush();
}
submit_done = false;
}
Platform::IrqC::Instance()->Signal(Platform::InterruptId::GpuIdle);
}
}
@@ -365,8 +379,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
const auto* write_data = reinterpret_cast<const PM4CmdWriteData*>(header);
ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5);
const u32 data_size = (header->type3.count.Value() - 2) * 4;
u64* address = write_data->Address<u64*>();
if (!write_data->wr_one_addr.Value()) {
std::memcpy(write_data->Address<void*>(), write_data->data, data_size);
std::memcpy(address, write_data->data, data_size);
} else {
UNREACHABLE();
}
@@ -379,6 +394,14 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
case PM4ItOpcode::WaitRegMem: {
const auto* wait_reg_mem = reinterpret_cast<const PM4CmdWaitRegMem*>(header);
ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me);
// Optimization: VO label waits are special because the emulator
// will write to the label when presentation is finished. So if
// there are no other submits to yield to we can sleep the thread
// instead and allow other tasks to run.
const u64* wait_addr = wait_reg_mem->Address<u64*>();
if (vo_port->IsVoLabel(wait_addr) && num_submits == 1) {
vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); });
}
while (!wait_reg_mem->Test()) {
TracyFiberLeave;
co_yield {};
@@ -511,7 +534,7 @@ void Liverpool::SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb) {
auto task = ProcessGraphics(dcb, ccb);
{
std::unique_lock lock{queue.m_access};
std::scoped_lock lock{queue.m_access};
queue.submits.emplace(task.handle);
}
@@ -526,7 +549,7 @@ void Liverpool::SubmitAsc(u32 vqid, std::span<const u32> acb) {
const auto& task = ProcessCompute(acb, vqid);
{
std::unique_lock lock{queue.m_access};
std::scoped_lock lock{queue.m_access};
queue.submits.emplace(task.handle);
}

View File

@@ -6,6 +6,7 @@
#include <array>
#include <condition_variable>
#include <coroutine>
#include <functional>
#include <mutex>
#include <span>
#include <thread>
@@ -21,6 +22,10 @@ namespace Vulkan {
class Rasterizer;
}
// Forward declaration only: this header stores and passes VideoOutPort by pointer,
// so the full definition is not needed here.
namespace Libraries::VideoOut {
struct VideoOutPort;
}
namespace AmdGpu {
#define GFX6_3D_REG_INDEX(field_name) (offsetof(AmdGpu::Liverpool::Regs, field_name) / sizeof(u32))
@@ -991,10 +996,25 @@ public:
void SubmitGfx(std::span<const u32> dcb, std::span<const u32> ccb);
void SubmitAsc(u32 vqid, std::span<const u32> acb);
/// Marks the current batch of submissions as done and wakes the threads blocked on submit_cv.
///
/// The flag is written while holding submit_mutex so the update cannot slip in between a
/// waiter evaluating its predicate and going to sleep.
void SubmitDone() noexcept {
    std::scoped_lock lk{submit_mutex};
    submit_done = true;
    // submit_cv is shared by waiters with different predicates: the command processing loop
    // waits for pending submits or submit_done, while WaitGpuIdle waits for num_submits == 0.
    // A single notify_one could be consumed by a waiter whose predicate is still false,
    // leaving the intended one asleep. Wake all of them and let each re-check its condition.
    submit_cv.notify_all();
}
/// Blocks the calling thread until num_submits reads zero.
void WaitGpuIdle() noexcept {
    std::unique_lock guard{submit_mutex};
    // Explicit wait loop: the counter is re-checked after every wake-up, spurious or not.
    while (num_submits != 0) {
        submit_cv.wait(guard);
    }
}
/// Returns true when no submissions are currently counted as pending.
/// This is a non-blocking snapshot of the atomic counter.
bool IsGpuIdle() const {
    const bool idle = num_submits.load() == 0;
    return idle;
}
/// Stores the video-out port pointer that the command processor consults for VO label waits.
void SetVoPort(Libraries::VideoOut::VideoOutPort* port) {
    this->vo_port = port;
}
/// Stores the rasterizer pointer; the processing loop flushes it (when non-null) after a
/// batch of submissions has been marked done.
void BindRasterizer(Vulkan::Rasterizer* rasterizer_) {
    this->rasterizer = rasterizer_;
}
@@ -1059,8 +1079,10 @@ private:
} cblock{};
Vulkan::Rasterizer* rasterizer{};
Libraries::VideoOut::VideoOutPort* vo_port{};
std::jthread process_thread{};
std::atomic<u32> num_submits{};
std::atomic<bool> submit_done{};
std::mutex submit_mutex;
std::condition_variable_any submit_cv;
};

View File

@@ -404,8 +404,9 @@ struct PM4CmdWaitRegMem {
u32 mask;
u32 poll_interval;
u32* Address() const {
return reinterpret_cast<u32*>((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo);
template <typename T = u32*>
T Address() const {
return reinterpret_cast<T>((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo);
}
bool Test() const {
@@ -464,8 +465,8 @@ struct PM4CmdWriteData {
}
template <typename T>
T* Address() const {
return reinterpret_cast<T*>(addr64);
T Address() const {
return reinterpret_cast<T>(addr64);
}
};
@@ -494,8 +495,9 @@ struct PM4CmdEventWriteEos {
BitField<16, 16, u32> size; ///< Number of DWs to read from the GDS
};
u32* Address() const {
return reinterpret_cast<u32*>(address_lo | u64(address_hi) << 32);
template <typename T = u32*>
T Address() const {
return reinterpret_cast<T>(address_lo | u64(address_hi) << 32);
}
u32 DataDWord() const {