From 69a39a3afe549d1f6253af8e914cb0b604b3ecac Mon Sep 17 00:00:00 2001
From: Lander Gallastegi
Date: Mon, 28 Apr 2025 01:01:56 +0200
Subject: [PATCH] Use compute to parse fault buffer

---
 src/video_core/buffer_cache/buffer_cache.cpp  | 178 ++++++++++++++----
 src/video_core/buffer_cache/buffer_cache.h    |   7 +-
 src/video_core/host_shaders/CMakeLists.txt    |   1 +
 .../host_shaders/fault_buffer_parser.comp     |  40 ++++
 4 files changed, 187 insertions(+), 39 deletions(-)
 create mode 100644 src/video_core/host_shaders/fault_buffer_parser.comp

diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 8bc2e2ec7..06d384c6a 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -7,9 +7,11 @@
 #include "common/types.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/host_shaders/fault_buffer_parser_comp.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_shader_util.h"
 #include "video_core/texture_cache/texture_cache.h"
 
 namespace VideoCore {
@@ -17,6 +19,8 @@ namespace VideoCore {
 static constexpr size_t DataShareBufferSize = 64_KB;
 static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 128_MB;
+static constexpr size_t DownloadBufferSize = 128_MB;
+static constexpr size_t MaxPageFaults = 1024;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
                          Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
@@ -25,6 +29,7 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       texture_cache{texture_cache_}, tracker{tracker_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
+      download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize),
       gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
       bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags,
                            BDA_PAGETABLE_SIZE},
@@ -43,6 +48,78 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
     ASSERT(null_id.index == 0);
     const vk::Buffer& null_buffer = slot_buffers[null_id].buffer;
     Vulkan::SetObjectName(instance.GetDevice(), null_buffer, "Null Buffer");
+
+    // Prepare the fault buffer parsing pipeline
+    boost::container::static_vector<vk::DescriptorSetLayoutBinding, 2> bindings{
+        {
+            .binding = 0,
+            .descriptorType = vk::DescriptorType::eStorageBuffer,
+            .descriptorCount = 1,
+            .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        },
+        {
+            .binding = 1,
+            .descriptorType = vk::DescriptorType::eStorageBuffer,
+            .descriptorCount = 1,
+            .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        },
+    };
+
+    const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
+        .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
+        .bindingCount = static_cast<u32>(bindings.size()),
+        .pBindings = bindings.data(),
+    };
+    auto [desc_layout_result, desc_layout] =
+        instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci);
+    ASSERT_MSG(desc_layout_result == vk::Result::eSuccess,
+               "Failed to create descriptor set layout: {}", vk::to_string(desc_layout_result));
+    fault_parse_desc_layout = std::move(desc_layout);
+
+    const auto& module = Vulkan::Compile(
+        HostShaders::FAULT_BUFFER_PARSER_COMP, vk::ShaderStageFlagBits::eCompute,
+        instance.GetDevice());
+    Vulkan::SetObjectName(instance.GetDevice(), module, "Fault Buffer Parser");
+
+    const vk::PipelineShaderStageCreateInfo shader_ci = {
+        .stage = vk::ShaderStageFlagBits::eCompute,
+        .module = module,
+        .pName = "main",
+    };
+
+    const vk::PushConstantRange push_constants = {
+        .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        .offset = 0,
+        .size = sizeof(u32),
+    };
+
+    const vk::PipelineLayoutCreateInfo layout_info = {
+        .setLayoutCount = 1U,
+        .pSetLayouts = &(*fault_parse_desc_layout),
+        .pushConstantRangeCount = 1,
+        .pPushConstantRanges = &push_constants,
+    };
+    auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info);
+    ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}",
+               vk::to_string(layout_result));
+    fault_parse_pipeline_layout = std::move(layout);
+
+    const vk::ComputePipelineCreateInfo pipeline_info = {
+        .stage = shader_ci,
+        .layout = *fault_parse_pipeline_layout,
+    };
+    auto [pipeline_result, pipeline] =
+        instance.GetDevice().createComputePipelineUnique({}, pipeline_info);
+    ASSERT_MSG(pipeline_result == vk::Result::eSuccess, "Failed to create compute pipeline: {}",
+               vk::to_string(pipeline_result));
+    fault_parse_pipeline = std::move(pipeline);
+    Vulkan::SetObjectName(instance.GetDevice(), *fault_parse_pipeline,
+                          "Fault Buffer Parser Pipeline");
+
+    instance.GetDevice().destroyShaderModule(module);
 }
 
 BufferCache::~BufferCache() = default;
@@ -77,20 +154,20 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
     if (total_size_bytes == 0) {
         return;
     }
-    const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
+    const auto [download, offset] = download_buffer.Map(total_size_bytes);
     for (auto& copy : copies) {
         // Modify copies to have the staging offset in mind
         copy.dstOffset += offset;
     }
-    staging_buffer.Commit();
+    download_buffer.Commit();
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
-    cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies);
+    cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
     scheduler.Finish();
     for (const auto& copy : copies) {
         const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
         const u64 dst_offset = copy.dstOffset - offset;
-        std::memcpy(std::bit_cast<u8*>(copy_device_addr), staging + dst_offset, copy.size);
+        std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
     }
 }
 
@@ -244,6 +321,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
                                                   bool is_texel_buffer, BufferId buffer_id) {
     // For small uniform buffers that have not been modified by gpu
     // use device local stream buffer to reduce renderpass breaks.
+    // Maybe we want to modify the threshold now that the page size is 16KB?
     static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
     const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
     if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
@@ -537,51 +615,80 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
 }
 
 void BufferCache::CreateFaultBuffers() {
-    // Download the fault readback buffer
-    const auto [mapped, offset] = staging_buffer.Map(FAULT_READBACK_SIZE);
-    vk::BufferCopy copy = {
-        .srcOffset = 0,
-        .dstOffset = offset,
-        .size = FAULT_READBACK_SIZE,
-    };
-    vk::BufferMemoryBarrier2 barrier{
+    const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64));
+    vk::BufferMemoryBarrier2 fault_readback_barrier{
         .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
         .srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
-        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
-        .dstAccessMask = vk::AccessFlagBits2::eTransferRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
+        .dstAccessMask = vk::AccessFlagBits2::eShaderRead,
         .buffer = fault_readback_buffer.Handle(),
         .offset = 0,
         .size = FAULT_READBACK_SIZE,
     };
+    vk::BufferMemoryBarrier2 download_barrier{
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
+        .dstAccessMask = vk::AccessFlagBits2::eShaderRead,
+        .buffer = download_buffer.Handle(),
+        .offset = offset,
+        .size = MaxPageFaults * sizeof(u64),
+    };
+    std::array barriers{fault_readback_barrier, download_barrier};
+    vk::DescriptorBufferInfo fault_readback_info{
+        .buffer = fault_readback_buffer.Handle(),
+        .offset = 0,
+        .range = FAULT_READBACK_SIZE,
+    };
+    vk::DescriptorBufferInfo download_info{
+        .buffer = download_buffer.Handle(),
+        .offset = offset,
+        .range = MaxPageFaults * sizeof(u64),
+    };
+    boost::container::small_vector<vk::WriteDescriptorSet, 2> writes{
+        {
+            .dstSet = VK_NULL_HANDLE,
+            .dstBinding = 0,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = vk::DescriptorType::eStorageBuffer,
+            .pBufferInfo = &fault_readback_info,
+        },
+        {
+            .dstSet = VK_NULL_HANDLE,
+            .dstBinding = 1,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = vk::DescriptorType::eStorageBuffer,
+            .pBufferInfo = &download_info,
+        },
+    };
+    download_buffer.Commit();
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0);
     cmdbuf.pipelineBarrier2(vk::DependencyInfo{
         .dependencyFlags = vk::DependencyFlagBits::eByRegion,
-        .bufferMemoryBarrierCount = 1,
-        .pBufferMemoryBarriers = &barrier,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = barriers.data(),
     });
-    cmdbuf.copyBuffer(fault_readback_buffer.buffer, staging_buffer.Handle(), copy);
-    staging_buffer.Commit();
+    cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_parse_pipeline);
+    cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_parse_pipeline_layout, 0,
+                                writes);
+    cmdbuf.pushConstants(*fault_parse_pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0,
+                         sizeof(u32), &CACHING_PAGEBITS);
+    constexpr u32 num_threads = CACHING_NUMPAGES / 32; // 1 bit per page, 32 pages per thread
+    constexpr u32 num_workgroups = Common::DivCeil(num_threads, 64u);
+    cmdbuf.dispatch(num_workgroups, 1, 1);
     scheduler.DeferOperation([this, mapped]() {
-        std::memcpy(fault_readback_cpu.data(), mapped, FAULT_READBACK_SIZE);
         // Create the fault buffers batched
         boost::icl::interval_set<VAddr> fault_ranges;
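+        // Layout written by fault_buffer_parser.comp: the first u64 of the mapped
+        // region is the fault count, followed by the faulting page base addresses.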
-        for (u64 i = 0; i < FAULT_READBACK_SIZE; ++i) {
-            if (fault_readback_cpu[i] == 0) {
-                continue;
-            }
-            // Each bit is a page
-            const u64 page = i * 8;
-            for (u8 j = 0; j < 8; ++j) {
-                if ((fault_readback_cpu[i] & (1 << j)) == 0) {
-                    continue;
-                }
-                const VAddr start = (page + j) << CACHING_PAGEBITS;
-                const VAddr end = start + CACHING_PAGESIZE;
-                fault_ranges +=
-                    boost::icl::interval_set<VAddr>::interval_type::right_open(start, end);
-                LOG_WARNING(Render_Vulkan, "Accessed non GPU-local memory at {:#x}", start);
-            }
+        const u64* fault_ptr = std::bit_cast<const u64*>(mapped);
+        u64 fault_count = *(fault_ptr++);
+        for (u64 i = 0; i < fault_count; ++i) {
+            const VAddr fault = *(fault_ptr++);
+            const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
+            fault_ranges +=
+                boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
+            LOG_WARNING(Render_Vulkan, "Accessed non GPU-local memory at page {:#x}", fault);
         }
         for (const auto& range : fault_ranges) {
             const VAddr start = range.lower();
@@ -593,7 +700,6 @@ void BufferCache::CreateFaultBuffers() {
             }
         }
     });
-    scheduler.Flush();
 }
 
 void BufferCache::ResetFaultReadbackBuffer() {
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 315246ea2..038128745 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include <array>
 #include <shared_mutex>
 #include <boost/icl/interval_set.hpp>
 #include "common/div_ceil.h"
@@ -200,11 +199,10 @@ private:
     PageManager& tracker;
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
+    StreamBuffer download_buffer;
     Buffer gds_buffer;
     Buffer bda_pagetable_buffer;
     Buffer fault_readback_buffer;
-    // We need to define here to avoid stack underflow
-    std::array<u8, FAULT_READBACK_SIZE> fault_readback_cpu;
     boost::icl::interval_set<VAddr> queued_converages;
     boost::icl::interval_set<VAddr> convered_regions;
     std::shared_mutex covered_regions_mutex;
@@ -213,6 +211,9 @@ private:
     RangeSet gpu_modified_ranges;
     MemoryTracker memory_tracker;
     PageTable page_table;
+    vk::UniqueDescriptorSetLayout fault_parse_desc_layout;
+    vk::UniquePipeline fault_parse_pipeline;
+    vk::UniquePipelineLayout fault_parse_pipeline_layout;
 };
 
 } // namespace VideoCore
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 3001bf773..c439c860c 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -11,6 +11,7 @@ set(SHADER_FILES
     detilers/micro_32bpp.comp
     detilers/micro_64bpp.comp
     detilers/micro_8bpp.comp
+    fault_buffer_parser.comp
     fs_tri.vert
     fsr.comp
     post_process.frag
diff --git a/src/video_core/host_shaders/fault_buffer_parser.comp b/src/video_core/host_shaders/fault_buffer_parser.comp
new file mode 100644
index 000000000..b381a1299
--- /dev/null
+++ b/src/video_core/host_shaders/fault_buffer_parser.comp
@@ -0,0 +1,40 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#version 450
+#extension GL_ARB_gpu_shader_int64 : enable
+#extension GL_EXT_shader_atomic_int64 : enable
+
+layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
+
+layout(std430, binding = 0) buffer input_buf {
+    uint fault_buffer[];
+};
+
+layout(std430, binding = 1) buffer output_buf {
+    uint64_t parsed_buffer[];
+};
+
+layout(push_constant) uniform parsing_info {
+    uint caching_pagebits;
+};
+
+void main() {
+    uint id = gl_GlobalInvocationID.x;
+    uint word = fault_buffer[id];
+    if (word == 0u) {
+        return;
+    }
+    // 1 page per bit
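+    // Each invocation consumes one 32-bit word of the fault bitmap, so invocation
+    // id covers pages [id * 32, id * 32 + 31].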
+    uint base_bit = id * 32u;
+    while (word != 0u) {
+        uint bit = findLSB(word);
+        word &= word - 1; // clear the lowest set bit
+        uint page = base_bit + bit;
+        // parsed_buffer[0] is the fault counter; entries start at index 1
+        uint store_index = uint(atomicAdd(parsed_buffer[0], 1u)) + 1u;
+        // It is very unlikely, but should we check for overflow?
+        if (store_index < 1024u) { // only support 1024 page faults
+            parsed_buffer[store_index] = uint64_t(page) << caching_pagebits;
+        }
+    }
+}
\ No newline at end of file
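
A note on the readback contract above (illustrative, not part of the diff): the compute pass
and the deferred lambda agree on a simple layout in the download buffer. The sketch below
restates it as a standalone function; ReadParsedFaults and kMaxPageFaults are hypothetical
names used only for illustration, since the patch reads the mapped pointer inline.

    #include <cstdint>
    #include <vector>

    constexpr std::uint64_t kMaxPageFaults = 1024; // mirrors MaxPageFaults in the patch

    // Slot 0 of the parsed buffer holds the fault count; slots 1..kMaxPageFaults-1 hold
    // the faulting page base addresses (page index << CACHING_PAGEBITS), unordered.
    std::vector<std::uint64_t> ReadParsedFaults(const std::uint64_t* mapped) {
        std::uint64_t count = mapped[0];
        // The shader stops storing past slot kMaxPageFaults - 1 but keeps incrementing
        // the counter, so clamp before trusting it.
        count = count < kMaxPageFaults ? count : kMaxPageFaults - 1;
        return std::vector<std::uint64_t>(mapped + 1, mapped + 1 + count);
    }

Because the dispatch is preceded by a fillBuffer of zeroes over the mapped range, any slot
the shader never wrote reads back as 0 rather than stale data from a previous frame.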