From 3f86c2e94ad645d8a26b676b026b39fb1f2b8071 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Tue, 18 Nov 2025 08:46:51 +0200 Subject: [PATCH] buffer_cache: Split DMA fault handling code from buffer cache (#3809) It's better not to have that raw code there --- CMakeLists.txt | 2 + .../ir/passes/shader_info_collection_pass.cpp | 2 +- src/video_core/buffer_cache/buffer_cache.cpp | 238 +----------------- src/video_core/buffer_cache/buffer_cache.h | 21 +- src/video_core/buffer_cache/fault_manager.cpp | 177 +++++++++++++ src/video_core/buffer_cache/fault_manager.h | 42 ++++ .../host_shaders/fault_buffer_process.comp | 27 +- .../renderer_vulkan/vk_rasterizer.cpp | 13 +- .../renderer_vulkan/vk_scheduler.cpp | 17 +- 9 files changed, 249 insertions(+), 290 deletions(-) create mode 100644 src/video_core/buffer_cache/fault_manager.cpp create mode 100644 src/video_core/buffer_cache/fault_manager.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1491a3e1e..ddaf2422c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -960,6 +960,8 @@ set(VIDEO_CORE src/video_core/amdgpu/cb_db_extent.h src/video_core/buffer_cache/buffer.h src/video_core/buffer_cache/buffer_cache.cpp src/video_core/buffer_cache/buffer_cache.h + src/video_core/buffer_cache/fault_manager.cpp + src/video_core/buffer_cache/fault_manager.h src/video_core/buffer_cache/memory_tracker.h src/video_core/buffer_cache/range_set.h src/video_core/buffer_cache/region_definitions.h diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index 3df4f8b86..c298a1092 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -190,7 +190,7 @@ void CollectShaderInfoPass(IR::Program& program, const Profile& profile) { }); info.buffers.push_back({ .used_types = IR::Type::U32, - .inline_cbuf = 
AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE), + .inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits::max()), .buffer_type = BufferType::FaultBuffer, .is_written = true, }); diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index cb18bc190..ac3fac5b1 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -2,49 +2,42 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include #include "common/alignment.h" #include "common/debug.h" #include "common/scope_exit.h" -#include "common/types.h" #include "core/memory.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/buffer_cache/memory_tracker.h" -#include "video_core/host_shaders/fault_buffer_process_comp.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_instance.h" -#include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/texture_cache/texture_cache.h" namespace VideoCore { static constexpr size_t DataShareBufferSize = 64_KB; static constexpr size_t StagingBufferSize = 512_MB; +static constexpr size_t DownloadBufferSize = 32_MB; static constexpr size_t UboStreamBufferSize = 64_MB; -static constexpr size_t DownloadBufferSize = 128_MB; static constexpr size_t DeviceBufferSize = 128_MB; -static constexpr size_t MaxPageFaults = 1024; BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_, PageManager& tracker) : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, + fault_manager{instance, scheduler, *this, CACHING_PAGEBITS, CACHING_NUMPAGES}, 
staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize}, stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize}, device_buffer{instance, scheduler, MemoryUsage::DeviceLocal, DeviceBufferSize}, gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize}, bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, - 0, AllFlags, BDA_PAGETABLE_SIZE}, - fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE) { + 0, AllFlags, BDA_PAGETABLE_SIZE} { Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer"); Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(), "BDA Page Table Buffer"); - Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer"); memory_tracker = std::make_unique(tracker); @@ -57,80 +50,6 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s const vk::Buffer& null_buffer = slot_buffers[null_id].buffer; Vulkan::SetObjectName(instance.GetDevice(), null_buffer, "Null Buffer"); - // Prepare the fault buffer parsing pipeline - boost::container::static_vector bindings{ - { - .binding = 0, - .descriptorType = vk::DescriptorType::eStorageBuffer, - .descriptorCount = 1, - .stageFlags = vk::ShaderStageFlagBits::eCompute, - }, - { - .binding = 1, - .descriptorType = vk::DescriptorType::eStorageBuffer, - .descriptorCount = 1, - .stageFlags = vk::ShaderStageFlagBits::eCompute, - }, - }; - - const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { - .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, - .bindingCount = static_cast(bindings.size()), - .pBindings = bindings.data(), - }; - auto [desc_layout_result, desc_layout] = - instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); - ASSERT_MSG(desc_layout_result == vk::Result::eSuccess, - "Failed to 
create descriptor set layout: {}", vk::to_string(desc_layout_result)); - fault_process_desc_layout = std::move(desc_layout); - - const auto& module = Vulkan::Compile(HostShaders::FAULT_BUFFER_PROCESS_COMP, - vk::ShaderStageFlagBits::eCompute, instance.GetDevice()); - Vulkan::SetObjectName(instance.GetDevice(), module, "Fault Buffer Parser"); - - const vk::SpecializationMapEntry specialization_map_entry = { - .constantID = 0, - .offset = 0, - .size = sizeof(u32), - }; - - const vk::SpecializationInfo specialization_info = { - .mapEntryCount = 1, - .pMapEntries = &specialization_map_entry, - .dataSize = sizeof(u32), - .pData = &CACHING_PAGEBITS, - }; - - const vk::PipelineShaderStageCreateInfo shader_ci = { - .stage = vk::ShaderStageFlagBits::eCompute, - .module = module, - .pName = "main", - .pSpecializationInfo = &specialization_info, - }; - - const vk::PipelineLayoutCreateInfo layout_info = { - .setLayoutCount = 1U, - .pSetLayouts = &(*fault_process_desc_layout), - }; - auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info); - ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}", - vk::to_string(layout_result)); - fault_process_pipeline_layout = std::move(layout); - - const vk::ComputePipelineCreateInfo pipeline_info = { - .stage = shader_ci, - .layout = *fault_process_pipeline_layout, - }; - auto [pipeline_result, pipeline] = - instance.GetDevice().createComputePipelineUnique({}, pipeline_info); - ASSERT_MSG(pipeline_result == vk::Result::eSuccess, "Failed to create compute pipeline: {}", - vk::to_string(pipeline_result)); - fault_process_pipeline = std::move(pipeline); - Vulkan::SetObjectName(instance.GetDevice(), *fault_process_pipeline, - "Fault Buffer Parser Pipeline"); - - instance.GetDevice().destroyShaderModule(module); - // Set up garbage collection parameters if (!instance.CanReportMemoryUsage()) { trigger_gc_memory = DEFAULT_TRIGGER_GC_MEMORY; @@ -656,14 +575,10 @@ BufferId 
BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) { wanted_size = static_cast(device_addr_end - device_addr); const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size); const u32 size = static_cast(overlap.end - overlap.begin); - const BufferId new_buffer_id = [&] { - std::scoped_lock lk{slot_buffers_mutex}; - return slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin, - AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size); - }(); + const BufferId new_buffer_id = + slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin, + AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size); auto& new_buffer = slot_buffers[new_buffer_id]; - const size_t size_bytes = new_buffer.SizeBytes(); - const auto cmdbuf = scheduler.CommandBuffer(); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } @@ -672,126 +587,7 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) { } void BufferCache::ProcessFaultBuffer() { - // Run fault processing shader - const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64)); - vk::BufferMemoryBarrier2 fault_buffer_barrier{ - .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .srcAccessMask = vk::AccessFlagBits2::eShaderWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader, - .dstAccessMask = vk::AccessFlagBits2::eShaderRead, - .buffer = fault_buffer.Handle(), - .offset = 0, - .size = FAULT_BUFFER_SIZE, - }; - vk::BufferMemoryBarrier2 download_barrier{ - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader, - .dstAccessMask = vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eShaderWrite, - .buffer = download_buffer.Handle(), - .offset = offset, - .size = MaxPageFaults * sizeof(u64), - }; - std::array 
barriers{fault_buffer_barrier, download_barrier}; - vk::DescriptorBufferInfo fault_buffer_info{ - .buffer = fault_buffer.Handle(), - .offset = 0, - .range = FAULT_BUFFER_SIZE, - }; - vk::DescriptorBufferInfo download_info{ - .buffer = download_buffer.Handle(), - .offset = offset, - .range = MaxPageFaults * sizeof(u64), - }; - boost::container::small_vector writes{ - { - .dstSet = VK_NULL_HANDLE, - .dstBinding = 0, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eStorageBuffer, - .pBufferInfo = &fault_buffer_info, - }, - { - .dstSet = VK_NULL_HANDLE, - .dstBinding = 1, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eStorageBuffer, - .pBufferInfo = &download_info, - }, - }; - download_buffer.Commit(); - scheduler.EndRendering(); - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 2, - .pBufferMemoryBarriers = barriers.data(), - }); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline); - cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0, - writes); - constexpr u32 num_threads = CACHING_NUMPAGES / 32; // 1 bit per page, 32 pages per workgroup - constexpr u32 num_workgroups = Common::DivCeil(num_threads, 64u); - cmdbuf.dispatch(num_workgroups, 1, 1); - - // Reset fault buffer - const vk::BufferMemoryBarrier2 reset_pre_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader, - .srcAccessMask = vk::AccessFlagBits2::eShaderRead, - .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, - .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, - .buffer = fault_buffer.Handle(), - .offset = 0, - .size = FAULT_BUFFER_SIZE, - }; - const vk::BufferMemoryBarrier2 reset_post_barrier = { - .srcStageMask = 
vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite, - .buffer = fault_buffer.Handle(), - .offset = 0, - .size = FAULT_BUFFER_SIZE, - }; - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &reset_pre_barrier, - }); - cmdbuf.fillBuffer(fault_buffer.buffer, 0, FAULT_BUFFER_SIZE, 0); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &reset_post_barrier, - }); - - // Defer creating buffers - scheduler.DeferOperation([this, mapped]() { - // Create the fault buffers batched - boost::icl::interval_set fault_ranges; - const u64* fault_ptr = std::bit_cast(mapped); - const u32 fault_count = static_cast(*(fault_ptr++)); - for (u32 i = 0; i < fault_count; ++i) { - const VAddr fault = *(fault_ptr++); - const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted - fault_ranges += - boost::icl::interval_set::interval_type::right_open(fault, fault_end); - LOG_INFO(Render_Vulkan, "Accessed non-GPU cached memory at {:#x}", fault); - } - for (const auto& range : fault_ranges) { - const VAddr start = range.lower(); - const VAddr end = range.upper(); - const u64 page_start = start >> CACHING_PAGEBITS; - const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE); - // Buffer size is in 32 bits - ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits::max(), - "Buffer size is too large"); - CreateBuffer(start, static_cast(end - start)); - } - }); + fault_manager.ProcessFaultBuffer(); } void BufferCache::Register(BufferId buffer_id) { @@ -972,10 +768,7 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, } void 
BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) { - if (device_addr == 0) { - return; - } - VAddr device_addr_end = device_addr + size; + const VAddr device_addr_end = device_addr + size; ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) { RENDERER_TRACE; VAddr start = std::max(buffer.CpuAddr(), device_addr); @@ -985,21 +778,6 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) { }); } -void BufferCache::MemoryBarrier() { - scheduler.EndRendering(); - const auto cmdbuf = scheduler.CommandBuffer(); - vk::MemoryBarrier2 barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, - }; - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .memoryBarrierCount = 1, - .pMemoryBarriers = &barrier, - }); -} - void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes) { scheduler.EndRendering(); diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index ccf77b4f5..6954f979e 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -3,12 +3,12 @@ #pragma once -#include #include #include "common/lru_cache.h" #include "common/slot_vector.h" #include "common/types.h" #include "video_core/buffer_cache/buffer.h" +#include "video_core/buffer_cache/fault_manager.h" #include "video_core/buffer_cache/range_set.h" #include "video_core/multi_level_page_table.h" @@ -40,9 +40,7 @@ public: static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS; static constexpr u64 DEVICE_PAGESIZE = 16_KB; static constexpr u64 CACHING_NUMPAGES = u64{1} << (40 - CACHING_PAGEBITS); - static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress); - static constexpr u64 FAULT_BUFFER_SIZE = 
CACHING_NUMPAGES / 8; // Bit per page // Default values for garbage collection static constexpr s64 DEFAULT_TRIGGER_GC_MEMORY = 1_GB; @@ -68,12 +66,6 @@ public: bool has_stream_leap = false; }; - using IntervalSet = - boost::icl::interval_set; - using IntervalType = typename IntervalSet::interval_type; - public: explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, AmdGpu::Liverpool* liverpool, TextureCache& texture_cache, @@ -92,7 +84,7 @@ public: /// Retrieves the fault buffer. [[nodiscard]] Buffer* GetFaultBuffer() noexcept { - return &fault_buffer; + return fault_manager.GetFaultBuffer(); } /// Retrieves the buffer with the specified id. @@ -160,9 +152,6 @@ public: /// Synchronizes all buffers neede for DMA. void SynchronizeDmaBuffers(); - /// Record memory barrier. Used for buffers when accessed via BDA. - void MemoryBarrier(); - /// Runs the garbage collector. void RunGarbageCollector(); @@ -217,6 +206,7 @@ private: AmdGpu::Liverpool* liverpool; Core::MemoryManager* memory; TextureCache& texture_cache; + FaultManager fault_manager; std::unique_ptr memory_tracker; StreamBuffer staging_buffer; StreamBuffer stream_buffer; @@ -224,8 +214,6 @@ private: StreamBuffer device_buffer; Buffer gds_buffer; Buffer bda_pagetable_buffer; - Buffer fault_buffer; - std::shared_mutex slot_buffers_mutex; Common::SlotVector slot_buffers; u64 total_used_memory = 0; u64 trigger_gc_memory = 0; @@ -235,9 +223,6 @@ private: RangeSet gpu_modified_ranges; SplitRangeMap buffer_ranges; PageTable page_table; - vk::UniqueDescriptorSetLayout fault_process_desc_layout; - vk::UniquePipeline fault_process_pipeline; - vk::UniquePipelineLayout fault_process_pipeline_layout; }; } // namespace VideoCore diff --git a/src/video_core/buffer_cache/fault_manager.cpp b/src/video_core/buffer_cache/fault_manager.cpp new file mode 100644 index 000000000..e967ffd0e --- /dev/null +++ b/src/video_core/buffer_cache/fault_manager.cpp @@ -0,0 +1,177 @@ +// SPDX-FileCopyrightText: 
Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/div_ceil.h" +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/fault_manager.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_platform.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" + +#include "video_core/host_shaders/fault_buffer_process_comp.h" + +namespace VideoCore { + +static constexpr size_t MaxPageFaults = 1024; +static constexpr size_t PageFaultAreaSize = MaxPageFaults * sizeof(u64); + +FaultManager::FaultManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_, + BufferCache& buffer_cache_, u32 caching_pagebits, u64 caching_num_pages_) + : scheduler{scheduler_}, buffer_cache{buffer_cache_}, + caching_pagesize{1ULL << caching_pagebits}, caching_num_pages{caching_num_pages_}, + fault_buffer_size{caching_num_pages_ / 8}, + fault_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, fault_buffer_size}, + download_buffer{instance, scheduler, MemoryUsage::Download, + 0, AllFlags, MaxPendingFaults * PageFaultAreaSize} { + const auto device = instance.GetDevice(); + Vulkan::SetObjectName(device, fault_buffer.Handle(), "Fault Buffer"); + + const std::array bindings = {{ + { + .binding = 0, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + { + .binding = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + }}; + const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { + .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, + .bindingCount = 2, + .pBindings = bindings.data(), + }; + fault_process_desc_layout = + Vulkan::Check(device.createDescriptorSetLayoutUnique(desc_layout_ci)); + + std::vector 
defines{{fmt::format("CACHING_PAGEBITS={}", caching_pagebits), + fmt::format("MAX_PAGE_FAULTS={}", MaxPageFaults)}}; + const auto module = Vulkan::Compile(HostShaders::FAULT_BUFFER_PROCESS_COMP, + vk::ShaderStageFlagBits::eCompute, device, defines); + Vulkan::SetObjectName(device, module, "Fault Buffer Parser"); + + const vk::PipelineShaderStageCreateInfo shader_ci = { + .stage = vk::ShaderStageFlagBits::eCompute, + .module = module, + .pName = "main", + }; + + const vk::PipelineLayoutCreateInfo layout_info = { + .setLayoutCount = 1U, + .pSetLayouts = &(*fault_process_desc_layout), + }; + fault_process_pipeline_layout = Vulkan::Check(device.createPipelineLayoutUnique(layout_info)); + + const vk::ComputePipelineCreateInfo pipeline_info = { + .stage = shader_ci, + .layout = *fault_process_pipeline_layout, + }; + fault_process_pipeline = Vulkan::Check(device.createComputePipelineUnique({}, pipeline_info)); + Vulkan::SetObjectName(device, *fault_process_pipeline, "Fault Buffer Parser Pipeline"); + + device.destroyShaderModule(module); +} + +void FaultManager::ProcessFaultBuffer() { + if (u64 wait_tick = fault_areas[current_area]) { + scheduler.Wait(wait_tick); + scheduler.PopPendingOperations(); + } + + const u32 offset = current_area * PageFaultAreaSize; + u8* mapped = download_buffer.mapped_data.data() + offset; + std::memset(mapped, 0, PageFaultAreaSize); + + const vk::BufferMemoryBarrier2 pre_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eShaderWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader, + .dstAccessMask = vk::AccessFlagBits2::eShaderRead, + .buffer = fault_buffer.Handle(), + .offset = 0, + .size = fault_buffer_size, + }; + const vk::BufferMemoryBarrier2 post_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eComputeShader, + .srcAccessMask = vk::AccessFlagBits2::eShaderWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = 
vk::AccessFlagBits2::eShaderWrite, + .buffer = fault_buffer.Handle(), + .offset = 0, + .size = fault_buffer_size, + }; + const vk::DescriptorBufferInfo fault_buffer_info = { + .buffer = fault_buffer.Handle(), + .offset = 0, + .range = fault_buffer_size, + }; + const vk::DescriptorBufferInfo download_info = { + .buffer = download_buffer.Handle(), + .offset = offset, + .range = PageFaultAreaSize, + }; + const std::array writes = {{ + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &fault_buffer_info, + }, + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &download_info, + }, + }}; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_barrier, + }); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline); + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0, + writes); + // 1 bit per page, 32 pages per workgroup + const u32 num_threads = caching_num_pages / 32; + const u32 num_workgroups = Common::DivCeil(num_threads, 64u); + cmdbuf.dispatch(num_workgroups, 1, 1); + + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_barrier, + }); + + scheduler.DeferOperation([this, mapped, area = current_area] { + fault_ranges.Clear(); + const u64* fault_buf = std::bit_cast(mapped); + const u32 fault_count = fault_buf[0]; + for (u32 i = 1; i <= fault_count; ++i) { + fault_ranges.Add(fault_buf[i], caching_pagesize); + LOG_INFO(Render_Vulkan, "Accessed non-GPU cached memory at 
{:#x}", fault_buf[i]); + } + fault_ranges.ForEach([&](VAddr start, VAddr end) { + ASSERT_MSG((end - start) <= std::numeric_limits::max(), + "Buffer size is too large"); + buffer_cache.FindBuffer(start, static_cast(end - start)); + }); + fault_areas[area] = 0; + }); + + fault_areas[current_area++] = scheduler.CurrentTick(); + current_area %= MaxPendingFaults; +} + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/fault_manager.h b/src/video_core/buffer_cache/fault_manager.h new file mode 100644 index 000000000..4fd545433 --- /dev/null +++ b/src/video_core/buffer_cache/fault_manager.h @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "video_core/buffer_cache/buffer.h" +#include "video_core/buffer_cache/range_set.h" + +namespace VideoCore { + +class BufferCache; + +class FaultManager { + static constexpr size_t MaxPendingFaults = 8; + +public: + explicit FaultManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, + BufferCache& buffer_cache, u32 caching_pagebits, u64 caching_num_pages); + + [[nodiscard]] Buffer* GetFaultBuffer() noexcept { + return &fault_buffer; + } + + void ProcessFaultBuffer(); + +private: + Vulkan::Scheduler& scheduler; + BufferCache& buffer_cache; + RangeSet fault_ranges; + u64 caching_pagesize; + u64 caching_num_pages; + u64 fault_buffer_size; + Buffer fault_buffer; + Buffer download_buffer; + std::array fault_areas{}; + u32 current_area{}; + vk::UniqueDescriptorSetLayout fault_process_desc_layout; + vk::UniquePipeline fault_process_pipeline; + vk::UniquePipelineLayout fault_process_pipeline_layout; +}; + +} // namespace VideoCore diff --git a/src/video_core/host_shaders/fault_buffer_process.comp b/src/video_core/host_shaders/fault_buffer_process.comp index a712cf441..04a86bad3 100644 --- a/src/video_core/host_shaders/fault_buffer_process.comp +++ b/src/video_core/host_shaders/fault_buffer_process.comp @@ 
-13,30 +13,23 @@ layout(std430, binding = 0) buffer input_buf { layout(std430, binding = 1) buffer output_buf { uint64_t download_buffer[]; }; - -// Overlap for 32 bit atomics layout(std430, binding = 1) buffer output_buf32 { uint download_buffer32[]; }; -layout(constant_id = 0) const uint CACHING_PAGEBITS = 0; - void main() { - uint id = gl_GlobalInvocationID.x; + const uint id = gl_GlobalInvocationID.x; uint word = fault_buffer[id]; - if (word == 0u) { - return; - } - // 1 page per bit - uint base_bit = id * 32u; + fault_buffer[id] = 0u; + const uint base_bit = id * 32u; while (word != 0u) { - uint bit = findLSB(word); - word &= word - 1; - uint page = base_bit + bit; - uint store_index = atomicAdd(download_buffer32[0], 1u) + 1u; - // It is very unlikely, but should we check for overflow? - if (store_index < 1024u) { // only support 1024 page faults - download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS; + const uint store_index = atomicAdd(download_buffer32[0], 1u) + 1u; + if (store_index >= MAX_PAGE_FAULTS) { + return; } + const uint bit = findLSB(word); + word &= word - 1; + const uint page = base_bit + bit; + download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS; } } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 37b8051e8..8d00ff2d0 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -407,18 +407,13 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { if (uses_dma) { // We only use fault buffer for DMA right now. 
- { - Common::RecursiveSharedLock lock{mapped_ranges_mutex}; - for (auto& range : mapped_ranges) { - buffer_cache.SynchronizeBuffersInRange(range.lower(), - range.upper() - range.lower()); - } + Common::RecursiveSharedLock lock{mapped_ranges_mutex}; + for (auto& range : mapped_ranges) { + buffer_cache.SynchronizeBuffersInRange(range.lower(), range.upper() - range.lower()); } - buffer_cache.MemoryBarrier(); + fault_process_pending = true; } - fault_process_pending |= uses_dma; - return true; } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 78286957f..da7467dfb 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -84,15 +84,6 @@ void Scheduler::Wait(u64 tick) { Flush(info); } master_semaphore.Wait(tick); - - // CAUTION: This can introduce unexpected variation in the wait time. - // We don't currently sync the GPU, and some games are very sensitive to this. - // If this becomes a problem, it can be commented out. - // Idealy we would implement proper gpu sync. - while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) { - pending_ops.front().callback(); - pending_ops.pop(); - } } void Scheduler::PopPendingOperations() { @@ -109,9 +100,7 @@ void Scheduler::AllocateWorkerCommandBuffers() { }; current_cmdbuf = command_pool.Commit(); - auto begin_result = current_cmdbuf.begin(begin_info); - ASSERT_MSG(begin_result == vk::Result::eSuccess, "Failed to begin command buffer: {}", - vk::to_string(begin_result)); + Check(current_cmdbuf.begin(begin_info)); // Invalidate dynamic state so it gets applied to the new command buffer. 
dynamic_state.Invalidate(); @@ -139,9 +128,7 @@ void Scheduler::SubmitExecution(SubmitInfo& info) { #endif EndRendering(); - auto end_result = current_cmdbuf.end(); - ASSERT_MSG(end_result == vk::Result::eSuccess, "Failed to end command buffer: {}", - vk::to_string(end_result)); + Check(current_cmdbuf.end()); const vk::Semaphore timeline = master_semaphore.Handle(); info.AddSignal(timeline, signal_value);