buffer_cache: Use separate thread for handling downloads

This commit is contained in:
IndecisiveTurtle 2025-06-24 01:30:02 +03:00
parent 0d83fbf61b
commit 1af9e17176
5 changed files with 104 additions and 22 deletions

View File

@ -166,14 +166,21 @@ void EmitGetGotoVariable(EmitContext&) {
using PointerType = EmitContext::PointerType;
// Emits SPIR-V for an IR ReadConst: a load from the flattened user-data
// ("flatbuf") constant buffer.
// NOTE(review): this span is rendered from a diff hunk with the +/- prefixes
// stripped — it merges the removed implementation (direct OpAccessChain/OpLoad
// from the flatbuf buffer) with the added one (dispatch to the read_const /
// read_const_dynamic helper functions). As shown, everything after the first
// `return` is unreachable; verify against the applied tree.
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
// Immediate dword offset baked into the instruction flags; 0 means the
// offset is dynamic (not known at compile time).
const u32 flatbuf_off_dw = inst->Flags<u32>();
const u32 flatbuf_offset = inst->Flags<u32>();
// The flatbuf buffer is expected to be the last registered buffer binding.
const auto& flatbuf_buffer{ctx.buffers.back()};
ASSERT(flatbuf_buffer.binding >= 0 && flatbuf_buffer.buffer_type == BufferType::Flatbuf);
const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
const auto ptr{ctx.OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, ctx.u32_zero_value,
ctx.ConstU32(flatbuf_offset))};
return ctx.OpLoad(ctx.U32[1], ptr);
// We can only provide a fallback for immediate offsets.
if (flatbuf_off_dw == 0) {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
} else {
return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
ctx.ConstU32(flatbuf_off_dw));
}
// NOTE(review): commented-out duplicate of the dispatch above — likely the
// other side of the diff; confirm which form the applied tree keeps.
// if (flatbuf_off_dw == 0) {
// return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
//} else {
// return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
// ctx.ConstU32(flatbuf_off_dw));
//}
}
template <PointerType type>

View File

@ -136,6 +136,10 @@ void CollectShaderInfoPass(IR::Program& program) {
}
}
program.info.readconst_types = Info::ReadConstType::None;
program.info.dma_types = IR::Type::Void;
return;
if (program.info.dma_types != IR::Type::Void) {
program.info.buffers.push_back({
.used_types = IR::Type::U64,

View File

@ -421,8 +421,8 @@ struct PM4CmdEventWriteEop {
PM4Type3Header header;
union {
u32 event_control;
BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
BitField<8, 4, u32> event_index; ///< Event index
BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
BitField<8, 4, u32> event_index; ///< Event index
};
u32 address_lo;
union {

View File

@ -7,6 +7,7 @@
#include "common/alignment.h"
#include "common/debug.h"
#include "common/div_ceil.h"
#include "common/thread.h"
#include "common/types.h"
#include "core/memory.h"
#include "video_core/amdgpu/liverpool.h"
@ -27,7 +28,7 @@ static constexpr size_t UboStreamBufferSize = 128_MB;
static constexpr size_t DownloadBufferSize = 128_MB;
static constexpr size_t DeviceBufferSize = 128_MB;
static constexpr size_t MaxPageFaults = 1024;
static constexpr size_t DownloadSizeThreshold = 1_MB;
static constexpr size_t DownloadSizeThreshold = 512_KB;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
@ -128,16 +129,24 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
"Fault Buffer Parser Pipeline");
instance.GetDevice().destroyShaderModule(module);
async_download_thread = std::jthread{std::bind_front(&BufferCache::DownloadThread, this)};
}
BufferCache::~BufferCache() = default;
// Invalidates a CPU-written range: stops GPU-side write tracking for it.
// NOTE(review): diff-hunk rendering — the `if (is_tracked)` branch below is
// the removed pre-change code and overlaps the added early-return form; as
// shown the braces do not balance. Review against the applied tree.
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
const bool is_tracked = IsRegionRegistered(device_addr, size);
if (is_tracked) {
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
if (!is_tracked) {
return;
}
// Wait for any pending downloads to this page.
// NOTE(review): only the first page of the region is consulted here, but
// CommitPendingDownloads tags every page in [page_begin, page_end) with a
// target tick — confirm a multi-page invalidation cannot race a pending
// download targeting a later page.
const u64 target_tick = page_table[device_addr >> CACHING_PAGEBITS].target_tick;
WaitForTargetTick(target_tick);
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
}
void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
@ -215,6 +224,11 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
pending_download_ranges.ForEach([&](VAddr interval_lower, VAddr interval_upper) {
const std::size_t size = interval_upper - interval_lower;
const VAddr device_addr = interval_lower;
const u64 page_begin = device_addr >> CACHING_PAGEBITS;
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = page_begin; page != page_end; ++page) {
page_table[page].target_tick = current_download_tick;
}
ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
const VAddr buffer_start = buffer.CpuAddr();
const VAddr buffer_end = buffer_start + buffer.SizeBytes();
@ -257,14 +271,14 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
Buffer& buffer = slot_buffers[buffer_id];
cmdbuf.copyBuffer(buffer.Handle(), download_buffer.Handle(), buffer_copies);
}
scheduler.DeferOperation([this, download, offset, copies]() {
const auto writeback_host = [this, download, offset, copies = std::move(copies)]() {
auto* memory = Core::Memory::Instance();
for (auto it = copies.begin(); it != copies.end(); ++it) {
auto& buffer_copies = it.value();
const BufferId buffer_id = it.key();
Buffer& buffer = slot_buffers[buffer_id];
const VAddr buffer_base = slot_buffers[buffer_id].CpuAddr();
for (auto& copy : buffer_copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const VAddr copy_device_addr = buffer_base + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
if (!memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr),
download + dst_offset, copy.size)) {
@ -273,12 +287,18 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
}
}
}
});
if (wait_done) {
scheduler.Finish();
} else {
scheduler.Flush();
};
{
std::scoped_lock lk{queue_mutex};
async_downloads.emplace(std::move(writeback_host), scheduler.CurrentTick(),
current_download_tick);
}
queue_cv.notify_one();
scheduler.Flush();
if (wait_done) {
WaitForTargetTick(current_download_tick);
}
++current_download_tick;
return true;
}
@ -1233,4 +1253,29 @@ void BufferCache::DeleteBuffer(BufferId buffer_id) {
buffer.is_deleted = true;
}
// Worker loop of async_download_thread. Pops queued downloads, waits for the
// GPU copy submission to retire, runs the host-writeback callback, then
// publishes the download's tick so waiters (WaitForTargetTick, and through it
// InvalidateMemory / CommitPendingDownloads with wait_done) can proceed.
// Exits when the jthread's stop token is signaled.
void BufferCache::DownloadThread(std::stop_token stoken) {
// NOTE(review): thread name appears copy-pasted from the command processor;
// consider a dedicated name such as "shadPS4:BufferCacheDownload".
Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
while (!stoken.stop_requested()) {
PendingDownload download;
{
// Sleep until work is queued or a stop is requested; the predicate
// guards against spurious wakeups.
std::unique_lock lk{queue_mutex};
Common::CondvarWait(queue_cv, lk, stoken, [this] { return !async_downloads.empty(); });
if (stoken.stop_requested()) {
break;
}
// Take ownership of the front entry while still holding the queue lock.
download = std::move(async_downloads.front());
async_downloads.pop();
}
// Wait for GPU to complete its work and writeback data to host
scheduler.Wait(download.gpu_tick);
download.callback();
// Signal completion of download
download_tick.store(download.signal_tick);
download_tick.notify_all();
}
}
} // namespace VideoCore

View File

@ -3,10 +3,15 @@
#pragma once
#include <atomic>
#include <condition_variable>
#include <shared_mutex>
#include <thread>
#include <boost/container/small_vector.hpp>
#include <queue>
#include "common/slot_vector.h"
#include "common/types.h"
#include "common/unique_function.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/buffer_cache/memory_tracker.h"
#include "video_core/buffer_cache/range_set.h"
@ -51,7 +56,7 @@ public:
// Per-page bookkeeping for the buffer cache page table.
// NOTE(review): diff hunk — `fence_tick` is the removed member and
// `target_tick` its replacement; the applied tree likely has only one of them.
struct PageData {
BufferId buffer_id{};
u64 fence_tick;
// Download tick this page must reach (via WaitForTargetTick) before CPU
// modification of the page is safe; set by CommitPendingDownloads.
u64 target_tick{};
};
struct Traits {
@ -176,6 +181,14 @@ private:
return !buffer_id || slot_buffers[buffer_id].is_deleted;
}
/// Blocks the caller until the download thread has published completion of
/// all downloads up to and including target_tick (download_tick >= target).
inline void WaitForTargetTick(u64 target_tick) {
    for (u64 observed = download_tick.load(); observed < target_tick;
         observed = download_tick.load()) {
        // Atomic wait: sleeps until download_tick changes from `observed`,
        // then the loop re-checks against the target.
        download_tick.wait(observed);
    }
}
void DownloadBufferMemory(const Buffer& buffer, VAddr device_addr, u64 size);
[[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@ -201,6 +214,8 @@ private:
void DeleteBuffer(BufferId buffer_id);
void DownloadThread(std::stop_token token);
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
AmdGpu::Liverpool* liverpool;
@ -224,6 +239,17 @@ private:
vk::UniqueDescriptorSetLayout fault_process_desc_layout;
vk::UniquePipeline fault_process_pipeline;
vk::UniquePipelineLayout fault_process_pipeline_layout;
// Thread that performs host writeback of completed GPU downloads.
std::jthread async_download_thread;
// A download batch whose GPU copies have been recorded and which is pending
// host writeback on the download thread.
struct PendingDownload {
// Writes the staging data back to guest memory (runs on the download thread).
Common::UniqueFunction<void> callback;
// Scheduler tick of the submission containing the buffer->staging copies.
u64 gpu_tick;
// Value published to download_tick once the writeback has completed.
u64 signal_tick;
};
// queue_mutex guards async_downloads; queue_cv wakes the download thread.
std::mutex queue_mutex;
std::condition_variable_any queue_cv;
std::queue<PendingDownload> async_downloads;
// Tick assigned to the next batch committed by CommitPendingDownloads.
u64 current_download_tick{0};
// Last completed download tick, advanced and notified by DownloadThread.
// NOTE(review): initialized above current_download_tick (1 vs 0), so waits on
// the first tick(s) are satisfied immediately — looks like an off-by-one;
// confirm the intended initial values.
std::atomic<u64> download_tick{1};
};
} // namespace VideoCore