diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index ccbe54d0a..1aba14bc5 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -166,14 +166,21 @@ void EmitGetGotoVariable(EmitContext&) {
 using PointerType = EmitContext::PointerType;
 
 Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
-    const u32 flatbuf_off_dw = inst->Flags<u32>();
+    const u32 flatbuf_offset = inst->Flags<u32>();
+    const auto& flatbuf_buffer{ctx.buffers.back()};
+    ASSERT(flatbuf_buffer.binding >= 0 && flatbuf_buffer.buffer_type == BufferType::Flatbuf);
+    const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
+    const auto ptr{ctx.OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, ctx.u32_zero_value,
+                                     ctx.ConstU32(flatbuf_offset))};
+    return ctx.OpLoad(ctx.U32[1], ptr);
+
     // We can only provide a fallback for immediate offsets.
-    if (flatbuf_off_dw == 0) {
-        return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
-    } else {
-        return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
-                                  ctx.ConstU32(flatbuf_off_dw));
-    }
+    // if (flatbuf_off_dw == 0) {
+    //     return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
+    //} else {
+    //     return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
+    //                               ctx.ConstU32(flatbuf_off_dw));
+    //}
 }
 
 template <PointerType type>
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index b3b4ac36a..fa51119b2 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -136,6 +136,10 @@ void CollectShaderInfoPass(IR::Program& program) {
         }
     }
 
+    program.info.readconst_types = Info::ReadConstType::None;
+    program.info.dma_types = IR::Type::Void;
+    return;
+
     if (program.info.dma_types != IR::Type::Void) {
         program.info.buffers.push_back({
             .used_types = IR::Type::U64,
diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h
index 586980113..560689271 100644
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@@ -421,8 +421,8 @@ struct PM4CmdEventWriteEop {
     PM4Type3Header header;
     union {
         u32 event_control;
-        BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
-        BitField<8, 4, u32> event_index; ///< Event index
+        BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
+        BitField<8, 4, u32> event_index;      ///< Event index
     };
     u32 address_lo;
     union {
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 484fae386..530925606 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -7,6 +7,7 @@
 #include "common/alignment.h"
 #include "common/debug.h"
 #include "common/div_ceil.h"
+#include "common/thread.h"
 #include "common/types.h"
 #include "core/memory.h"
 #include "video_core/amdgpu/liverpool.h"
@@ -27,7 +28,7 @@ static constexpr size_t UboStreamBufferSize = 128_MB;
 static constexpr size_t DownloadBufferSize = 128_MB;
 static constexpr size_t DeviceBufferSize = 128_MB;
 static constexpr size_t MaxPageFaults = 1024;
-static constexpr size_t DownloadSizeThreshold = 1_MB;
+static constexpr size_t DownloadSizeThreshold = 512_KB;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
                          AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
@@ -128,16 +129,24 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
                          "Fault Buffer Parser Pipeline");
     instance.GetDevice().destroyShaderModule(module);
+
+    async_download_thread = std::jthread{std::bind_front(&BufferCache::DownloadThread, this)};
 }
 
 BufferCache::~BufferCache() = default;
 
 void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
     const bool is_tracked = IsRegionRegistered(device_addr, size);
-    if (is_tracked) {
-        // Mark the page as CPU modified to stop tracking writes.
-        memory_tracker.MarkRegionAsCpuModified(device_addr, size);
+    if (!is_tracked) {
+        return;
     }
+
+    // Wait for any pending downloads to this page.
+    const u64 target_tick = page_table[device_addr >> CACHING_PAGEBITS].target_tick;
+    WaitForTargetTick(target_tick);
+
+    // Mark the page as CPU modified to stop tracking writes.
+    memory_tracker.MarkRegionAsCpuModified(device_addr, size);
 }
 
 void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
@@ -215,6 +224,11 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
     pending_download_ranges.ForEach([&](VAddr interval_lower, VAddr interval_upper) {
         const std::size_t size = interval_upper - interval_lower;
         const VAddr device_addr = interval_lower;
+        const u64 page_begin = device_addr >> CACHING_PAGEBITS;
+        const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
+        for (u64 page = page_begin; page != page_end; ++page) {
+            page_table[page].target_tick = current_download_tick;
+        }
         ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
             const VAddr buffer_start = buffer.CpuAddr();
             const VAddr buffer_end = buffer_start + buffer.SizeBytes();
@@ -257,14 +271,14 @@
         Buffer& buffer = slot_buffers[buffer_id];
         cmdbuf.copyBuffer(buffer.Handle(), download_buffer.Handle(), buffer_copies);
     }
-    scheduler.DeferOperation([this, download, offset, copies]() {
+    const auto writeback_host = [this, download, offset, copies = std::move(copies)]() {
         auto* memory = Core::Memory::Instance();
         for (auto it = copies.begin(); it != copies.end(); ++it) {
            auto& buffer_copies = it.value();
            const BufferId buffer_id = it.key();
-            Buffer& buffer = slot_buffers[buffer_id];
+            const VAddr buffer_base = slot_buffers[buffer_id].CpuAddr();
             for (auto& copy : buffer_copies) {
-                const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
+                const VAddr copy_device_addr = buffer_base + copy.srcOffset;
                 const u64 dst_offset = copy.dstOffset - offset;
                 if (!memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr),
                                              download + dst_offset, copy.size)) {
@@ -273,12 +287,18 @@
                 }
             }
         }
-    });
-    if (wait_done) {
-        scheduler.Finish();
-    } else {
-        scheduler.Flush();
+    };
+    {
+        std::scoped_lock lk{queue_mutex};
+        async_downloads.emplace(std::move(writeback_host), scheduler.CurrentTick(),
+                                current_download_tick);
     }
+    queue_cv.notify_one();
+    scheduler.Flush();
+    if (wait_done) {
+        WaitForTargetTick(current_download_tick);
+    }
+    ++current_download_tick;
 
     return true;
 }
@@ -1233,4 +1253,29 @@ void BufferCache::DeleteBuffer(BufferId buffer_id) {
     buffer.is_deleted = true;
 }
 
+void BufferCache::DownloadThread(std::stop_token stoken) {
+    Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
+
+    while (!stoken.stop_requested()) {
+        PendingDownload download;
+        {
+            std::unique_lock lk{queue_mutex};
+            Common::CondvarWait(queue_cv, lk, stoken, [this] { return !async_downloads.empty(); });
+            if (stoken.stop_requested()) {
+                break;
+            }
+            download = std::move(async_downloads.front());
+            async_downloads.pop();
+        }
+
+        // Wait for the GPU to complete its work and write the data back to the host
+        scheduler.Wait(download.gpu_tick);
+        download.callback();
+
+        // Signal completion of the download
+        download_tick.store(download.signal_tick);
+        download_tick.notify_all();
+    }
+}
+
 } // namespace VideoCore
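Note on the buffer_cache.cpp changes above: the old synchronous DeferOperation/Finish path is replaced by a producer/consumer handshake. CommitPendingDownloads stamps each touched page with current_download_tick and enqueues a writeback callback tagged with both the GPU tick and that download tick; DownloadThread drains the queue in order, waits for the GPU, runs the callback, and publishes the tick through C++20 atomic wait/notify so WaitForTargetTick can block without taking the queue mutex. The standalone sketch below distills that pattern. It is an illustration only, not code from this patch: the names TickQueue, Push, and WaitForTick are invented here, std::function stands in for Common::UniqueFunction, and the GPU-fence wait is omitted.

// Sketch of the tick-based worker-thread handshake used by the patch.
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>

class TickQueue {
public:
    TickQueue() : worker{std::bind_front(&TickQueue::Run, this)} {}

    // Producer side: enqueue a job, return the tick that marks its completion.
    std::uint64_t Push(std::function<void()> job) {
        std::uint64_t tick;
        {
            std::scoped_lock lk{mutex};
            tick = ++last_pushed_tick;
            jobs.push(Job{std::move(job), tick});
        }
        cv.notify_one();
        return tick;
    }

    // Block until the job tagged with `tick` has run; an atomic wait, no mutex.
    void WaitForTick(std::uint64_t tick) {
        std::uint64_t done = done_tick.load();
        while (done < tick) {
            done_tick.wait(done); // C++20: sleep until done_tick changes
            done = done_tick.load();
        }
    }

private:
    struct Job {
        std::function<void()> work;
        std::uint64_t signal_tick{};
    };

    void Run(std::stop_token stoken) {
        while (!stoken.stop_requested()) {
            Job job{};
            {
                std::unique_lock lk{mutex};
                // wait() returns false if a stop was requested while the queue
                // is empty, so shutdown cannot hang here.
                if (!cv.wait(lk, stoken, [this] { return !jobs.empty(); })) {
                    break;
                }
                job = std::move(jobs.front());
                jobs.pop();
            }
            job.work();                       // run outside the lock
            done_tick.store(job.signal_tick); // publish completion...
            done_tick.notify_all();           // ...and wake all WaitForTick callers
        }
    }

    std::mutex mutex;
    std::condition_variable_any cv;
    std::queue<Job> jobs;
    std::uint64_t last_pushed_tick{0};
    std::atomic<std::uint64_t> done_tick{0};
    std::jthread worker; // declared last: stops and joins before the members above die
};

Push and WaitForTick correspond loosely to CommitPendingDownloads stamping page ticks and InvalidateMemory calling WaitForTargetTick; the real patch additionally waits on the GPU fence (scheduler.Wait(download.gpu_tick)) before running the callback.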
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 48e27ba81..674f288b5 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -3,10 +3,15 @@
 
 #pragma once
 
+#include <atomic>
+#include <condition_variable>
 #include <mutex>
+#include <queue>
 #include <shared_mutex>
+#include <thread>
 #include "common/slot_vector.h"
 #include "common/types.h"
+#include "common/unique_function.h"
 #include "video_core/buffer_cache/buffer.h"
 #include "video_core/buffer_cache/memory_tracker.h"
 #include "video_core/buffer_cache/range_set.h"
@@ -51,7 +56,7 @@ public:
 
     struct PageData {
         BufferId buffer_id{};
-        u64 fence_tick;
+        u64 target_tick{};
     };
 
     struct Traits {
@@ -176,6 +181,14 @@
         return !buffer_id || slot_buffers[buffer_id].is_deleted;
     }
 
+    inline void WaitForTargetTick(u64 target_tick) {
+        u64 tick = download_tick.load();
+        while (tick < target_tick) {
+            download_tick.wait(tick);
+            tick = download_tick.load();
+        }
+    }
+
     void DownloadBufferMemory(const Buffer& buffer, VAddr device_addr, u64 size);
 
     [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@@ -201,6 +214,8 @@
 
     void DeleteBuffer(BufferId buffer_id);
 
+    void DownloadThread(std::stop_token token);
+
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
     AmdGpu::Liverpool* liverpool;
@@ -224,6 +239,17 @@
     vk::UniqueDescriptorSetLayout fault_process_desc_layout;
     vk::UniquePipeline fault_process_pipeline;
     vk::UniquePipelineLayout fault_process_pipeline_layout;
+    std::jthread async_download_thread;
+    struct PendingDownload {
+        Common::UniqueFunction<void> callback;
+        u64 gpu_tick;
+        u64 signal_tick;
+    };
+    std::mutex queue_mutex;
+    std::condition_variable_any queue_cv;
+    std::queue<PendingDownload> async_downloads;
+    u64 current_download_tick{0};
+    std::atomic<u64> download_tick{1};
 };
 
 } // namespace VideoCore
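One more note on a detail the header diff depends on: writeback_host captures the copies map by move, and std::function requires a copy-constructible target, which is why the pending-download queue stores Common::UniqueFunction<void>. Below is a minimal sketch of the type-erasure idea behind such a move-only wrapper. The MoveOnlyFn name and shape are invented here; shadPS4's Common::UniqueFunction is its own implementation of this concept, which C++23 later standardized as std::move_only_function.

// Hypothetical move-only callable wrapper, for illustration only.
#include <memory>
#include <utility>

template <typename R, typename... Args>
class MoveOnlyFn {
public:
    MoveOnlyFn() = default;

    // Accept the callable by value so move-only lambdas can be moved in.
    template <typename F>
    MoveOnlyFn(F f) : impl{std::make_unique<Model<F>>(std::move(f))} {}

    R operator()(Args... args) {
        return impl->Call(std::forward<Args>(args)...);
    }

    explicit operator bool() const noexcept {
        return impl != nullptr;
    }

private:
    // Classic type erasure: a virtual interface plus templated storage.
    struct Concept {
        virtual ~Concept() = default;
        virtual R Call(Args... args) = 0;
    };
    template <typename F>
    struct Model final : Concept {
        explicit Model(F f) : fn{std::move(f)} {}
        R Call(Args... args) override {
            return fn(std::forward<Args>(args)...);
        }
        F fn;
    };

    // unique_ptr makes the wrapper movable but not copyable, unlike
    // std::function, which must be able to copy its target.
    std::unique_ptr<Concept> impl;
};

// Usage: a lambda capturing a move-only value cannot go into
// std::function<void()>, but works here:
//   auto data = std::make_unique<int>(42);
//   MoveOnlyFn<void> cb{[d = std::move(data)] { /* use *d */ }};
//   cb();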