Readbacks proof of concept rebased (#3178)

* Readbacks proof of concept

* liverpool: Use span for acb too

* config: Add readbacks config option

* config: Log readbacks
This commit is contained in:
TheTurtle
2025-07-01 23:41:00 +03:00
committed by GitHub
parent 5789fd881c
commit 0594dac405
17 changed files with 375 additions and 186 deletions

View File

@@ -72,8 +72,23 @@ Liverpool::~Liverpool() {
process_thread.join();
}
void Liverpool::ProcessCommands() {
// Process incoming commands with high priority
while (num_commands) {
Common::UniqueFunction<void> callback{};
{
std::scoped_lock lk{submit_mutex};
callback = std::move(command_queue.front());
command_queue.pop();
--num_commands;
}
callback();
}
}
void Liverpool::Process(std::stop_token stoken) {
Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
gpu_id = std::this_thread::get_id();
while (!stoken.stop_requested()) {
{
@@ -90,18 +105,7 @@ void Liverpool::Process(std::stop_token stoken) {
curr_qid = -1;
while (num_submits || num_commands) {
// Process incoming commands with high priority
while (num_commands) {
Common::UniqueFunction<void> callback{};
{
std::unique_lock lk{submit_mutex};
callback = std::move(command_queue.front());
command_queue.pop();
--num_commands;
}
callback();
}
ProcessCommands();
curr_qid = (curr_qid + 1) % num_mapped_queues;
@@ -147,6 +151,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
FIBER_ENTER(ccb_task_name);
while (!ccb.empty()) {
ProcessCommands();
const auto* header = reinterpret_cast<const PM4Header*>(ccb.data());
const u32 type = header->type;
if (type != 3) {
@@ -224,6 +230,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
const auto base_addr = reinterpret_cast<uintptr_t>(dcb.data());
while (!dcb.empty()) {
ProcessCommands();
const auto* header = reinterpret_cast<const PM4Header*>(dcb.data());
const u32 type = header->type;
@@ -638,9 +646,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo,
dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), true);
rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
dma_data->NumBytes(), true, false);
} else if (dma_data->src_sel == DmaDataSrc::Data &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@@ -649,14 +656,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
} else if (dma_data->src_sel == DmaDataSrc::Gds &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
// LOG_WARNING(Render_Vulkan, "GDS memory read");
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
dma_data->NumBytes(), false, true);
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), false);
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<VAddr>(), dma_data->NumBytes(),
false, false);
} else {
UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@@ -702,6 +710,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
break;
}
case PM4ItOpcode::Rewind: {
if (!rasterizer) {
break;
}
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
while (!rewind->Valid()) {
YIELD_GFX();
@@ -801,29 +812,32 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
template <bool is_indirect>
Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid) {
Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
FIBER_ENTER(acb_task_name[vqid]);
auto& queue = asc_queues[{vqid}];
auto base_addr = reinterpret_cast<VAddr>(acb);
while (acb_dwords > 0) {
auto* header = reinterpret_cast<const PM4Header*>(acb);
auto base_addr = reinterpret_cast<VAddr>(acb.data());
while (!acb.empty()) {
ProcessCommands();
auto* header = reinterpret_cast<const PM4Header*>(acb.data());
u32 next_dw_off = header->type3.NumWords() + 1;
// If we have a buffered packet, use it.
if (queue.tmp_dwords > 0) [[unlikely]] {
header = reinterpret_cast<const PM4Header*>(queue.tmp_packet.data());
next_dw_off = header->type3.NumWords() + 1 - queue.tmp_dwords;
std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb, next_dw_off * sizeof(u32));
std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb.data(),
next_dw_off * sizeof(u32));
queue.tmp_dwords = 0;
}
// If the packet is split across ring boundary, buffer until next submission
if (next_dw_off > acb_dwords) [[unlikely]] {
std::memcpy(queue.tmp_packet.data(), acb, acb_dwords * sizeof(u32));
queue.tmp_dwords = acb_dwords;
if (next_dw_off > acb.size()) [[unlikely]] {
std::memcpy(queue.tmp_packet.data(), acb.data(), acb.size_bytes());
queue.tmp_dwords = acb.size();
if constexpr (!is_indirect) {
*queue.read_addr += acb_dwords;
*queue.read_addr += acb.size();
*queue.read_addr %= queue.ring_size_dw;
}
break;
@@ -832,9 +846,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
if (header->type == 2) {
// Type-2 packets are used for padding purposes
next_dw_off = 1;
acb += next_dw_off;
acb_dwords -= next_dw_off;
acb = NextPacket(acb, next_dw_off);
if constexpr (!is_indirect) {
*queue.read_addr += next_dw_off;
*queue.read_addr %= queue.ring_size_dw;
@@ -856,8 +868,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
}
case PM4ItOpcode::IndirectBuffer: {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessCompute<true>(indirect_buffer->Address<const u32>(),
indirect_buffer->ib_size, vqid);
auto task = ProcessCompute<true>(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, vqid);
RESUME_ASC(task, vqid);
while (!task.handle.done()) {
@@ -876,8 +888,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
dma_data->dst_sel == DmaDataDst::Gds) {
rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress<const void*>(),
dma_data->NumBytes(), true);
rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress<VAddr>(),
dma_data->NumBytes(), true, false);
} else if (dma_data->src_sel == DmaDataSrc::Data &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
@@ -886,14 +898,14 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
} else if (dma_data->src_sel == DmaDataSrc::Gds &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
// LOG_WARNING(Render_Vulkan, "GDS memory read");
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->src_addr_lo,
dma_data->NumBytes(), false, true);
} else if ((dma_data->src_sel == DmaDataSrc::Memory ||
dma_data->src_sel == DmaDataSrc::MemoryUsingL2) &&
(dma_data->dst_sel == DmaDataDst::Memory ||
dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) {
rasterizer->InlineData(dma_data->DstAddress<VAddr>(),
dma_data->SrcAddress<const void*>(), dma_data->NumBytes(),
false);
rasterizer->CopyBuffer(dma_data->DstAddress<VAddr>(), dma_data->SrcAddress<VAddr>(),
dma_data->NumBytes(), false, false);
} else {
UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}",
u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value()));
@@ -904,6 +916,9 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
break;
}
case PM4ItOpcode::Rewind: {
if (!rasterizer) {
break;
}
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
while (!rewind->Valid()) {
YIELD_ASC(vqid);
@@ -1016,8 +1031,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
static_cast<u32>(opcode), header->type3.NumWords());
}
acb += next_dw_off;
acb_dwords -= next_dw_off;
acb = NextPacket(acb, next_dw_off);
if constexpr (!is_indirect) {
*queue.read_addr += next_dw_off;
@@ -1087,7 +1101,7 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span<const u32> acb) {
auto& queue = mapped_queues[gnm_vqid];
const auto vqid = gnm_vqid - 1;
const auto& task = ProcessCompute(acb.data(), acb.size(), vqid);
const auto& task = ProcessCompute(acb, vqid);
{
std::scoped_lock lock{queue.m_access};
queue.submits.emplace(task.handle);

View File

@@ -8,6 +8,7 @@
#include <coroutine>
#include <exception>
#include <mutex>
#include <semaphore>
#include <span>
#include <thread>
#include <vector>
@@ -1512,14 +1513,32 @@ public:
rasterizer = rasterizer_;
}
void SendCommand(Common::UniqueFunction<void>&& func) {
std::scoped_lock lk{submit_mutex};
command_queue.emplace(std::move(func));
++num_commands;
submit_cv.notify_one();
template <bool wait_done = false>
void SendCommand(auto&& func) {
if (std::this_thread::get_id() == gpu_id) {
return func();
}
if constexpr (wait_done) {
std::binary_semaphore sem{0};
{
std::scoped_lock lk{submit_mutex};
command_queue.emplace([&sem, &func] {
func();
sem.release();
});
++num_commands;
submit_cv.notify_one();
}
sem.acquire();
} else {
std::scoped_lock lk{submit_mutex};
command_queue.emplace(std::move(func));
++num_commands;
submit_cv.notify_one();
}
}
void reserveCopyBufferSpace() {
void ReserveCopyBufferSpace() {
GpuQueue& gfx_queue = mapped_queues[GfxQueueId];
std::scoped_lock<std::mutex> lk(gfx_queue.m_access);
@@ -1581,8 +1600,9 @@ private:
Task ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb);
Task ProcessCeUpdate(std::span<const u32> ccb);
template <bool is_indirect = false>
Task ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid);
Task ProcessCompute(std::span<const u32> acb, u32 vqid);
void ProcessCommands();
void Process(std::stop_token stoken);
struct GpuQueue {
@@ -1626,6 +1646,7 @@ private:
std::mutex submit_mutex;
std::condition_variable_any submit_cv;
std::queue<Common::UniqueFunction<void>> command_queue{};
std::thread::id gpu_id;
int curr_qid{-1};
};