Use compute to parse fault buffer

Lander Gallastegi 2025-04-28 01:01:56 +02:00
parent 629dc6132e
commit 69a39a3afe
4 changed files with 187 additions and 39 deletions

View File

@@ -7,9 +7,11 @@
#include "common/types.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/host_shaders/fault_buffer_parser_comp.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/texture_cache/texture_cache.h"
namespace VideoCore {
@@ -17,6 +19,8 @@ namespace VideoCore {
static constexpr size_t DataShareBufferSize = 64_KB;
static constexpr size_t StagingBufferSize = 512_MB;
static constexpr size_t UboStreamBufferSize = 128_MB;
static constexpr size_t DownloadBufferSize = 128_MB;
static constexpr size_t MaxPageFaults = 1024;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
@@ -25,6 +29,7 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
texture_cache{texture_cache_}, tracker{tracker_},
staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize),
gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal,
0, AllFlags, BDA_PAGETABLE_SIZE},
@@ -43,6 +48,78 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
ASSERT(null_id.index == 0);
const vk::Buffer& null_buffer = slot_buffers[null_id].buffer;
Vulkan::SetObjectName(instance.GetDevice(), null_buffer, "Null Buffer");
// Prepare the fault buffer parsing pipeline
boost::container::static_vector<vk::DescriptorSetLayoutBinding, 2> bindings{
{
.binding = 0,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
{
.binding = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
};
const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
.flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
.bindingCount = static_cast<u32>(bindings.size()),
.pBindings = bindings.data(),
};
auto [desc_layout_result, desc_layout] =
instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci);
ASSERT_MSG(desc_layout_result == vk::Result::eSuccess,
"Failed to create descriptor set layout: {}",
vk::to_string(desc_layout_result));
fault_parse_desc_layout = std::move(desc_layout);
const auto& module = Vulkan::Compile(
HostShaders::FAULT_BUFFER_PARSER_COMP, vk::ShaderStageFlagBits::eCompute,
instance.GetDevice());
Vulkan::SetObjectName(instance.GetDevice(), module, "Fault Buffer Parser");
const vk::PipelineShaderStageCreateInfo shader_ci = {
.stage = vk::ShaderStageFlagBits::eCompute,
.module = module,
.pName = "main",
};
const vk::PushConstantRange push_constants = {
.stageFlags = vk::ShaderStageFlagBits::eCompute,
.offset = 0,
.size = sizeof(u32),
};
const vk::PipelineLayoutCreateInfo layout_info = {
.setLayoutCount = 1U,
.pSetLayouts = &(*fault_parse_desc_layout),
.pushConstantRangeCount = 1,
.pPushConstantRanges = &push_constants,
};
auto [layout_result, layout] =
instance.GetDevice().createPipelineLayoutUnique(layout_info);
ASSERT_MSG(layout_result == vk::Result::eSuccess,
"Failed to create pipeline layout: {}",
vk::to_string(layout_result));
fault_parse_pipeline_layout = std::move(layout);
const vk::ComputePipelineCreateInfo pipeline_info = {
.stage = shader_ci,
.layout = *fault_parse_pipeline_layout,
};
auto [pipeline_result, pipeline] =
instance.GetDevice().createComputePipelineUnique({}, pipeline_info);
ASSERT_MSG(pipeline_result == vk::Result::eSuccess,
"Failed to create compute pipeline: {}",
vk::to_string(pipeline_result));
fault_parse_pipeline = std::move(pipeline);
Vulkan::SetObjectName(instance.GetDevice(), *fault_parse_pipeline, "Fault Buffer Parser Pipeline");
instance.GetDevice().destroyShaderModule(module);
}
BufferCache::~BufferCache() = default;
@@ -77,20 +154,20 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
if (total_size_bytes == 0) {
return;
}
const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
const auto [download, offset] = download_buffer.Map(total_size_bytes);
for (auto& copy : copies) {
// Adjust the copies to account for the download buffer offset
copy.dstOffset += offset;
}
staging_buffer.Commit();
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(buffer.buffer, staging_buffer.Handle(), copies);
cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
scheduler.Finish();
for (const auto& copy : copies) {
const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
const u64 dst_offset = copy.dstOffset - offset;
std::memcpy(std::bit_cast<u8*>(copy_device_addr), staging + dst_offset, copy.size);
std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
}
}
@@ -244,6 +321,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
bool is_texel_buffer, BufferId buffer_id) {
// For small uniform buffers that have not been modified by gpu
// use device local stream buffer to reduce renderpass breaks.
// Maybe we want to modify the threshold now that the page size is 16KB?
static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
@@ -537,51 +615,80 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
}
void BufferCache::CreateFaultBuffers() {
// Parse the fault readback buffer on the GPU and download the results
const auto [mapped, offset] = staging_buffer.Map(FAULT_READBACK_SIZE);
vk::BufferCopy copy = {
.srcOffset = 0,
.dstOffset = offset,
.size = FAULT_READBACK_SIZE,
};
vk::BufferMemoryBarrier2 barrier{
const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64));
vk::BufferMemoryBarrier2 fault_readback_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferRead,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead,
.buffer = fault_readback_buffer.Handle(),
.offset = 0,
.size = FAULT_READBACK_SIZE,
};
vk::BufferMemoryBarrier2 download_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead,
.buffer = download_buffer.Handle(),
.offset = offset,
.size = MaxPageFaults * sizeof(u64),
};
std::array<vk::BufferMemoryBarrier2, 2> barriers{fault_readback_barrier, download_barrier};
vk::DescriptorBufferInfo fault_readback_info{
.buffer = fault_readback_buffer.Handle(),
.offset = 0,
.range = FAULT_READBACK_SIZE,
};
vk::DescriptorBufferInfo download_info{
.buffer = download_buffer.Handle(),
.offset = offset,
.range = MaxPageFaults * sizeof(u64),
};
boost::container::small_vector<vk::WriteDescriptorSet, 2> writes{
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &fault_readback_info,
},
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 1,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &download_info,
},
};
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &barrier,
.bufferMemoryBarrierCount = 2,
.pBufferMemoryBarriers = barriers.data(),
});
cmdbuf.copyBuffer(fault_readback_buffer.buffer, staging_buffer.Handle(), copy);
staging_buffer.Commit();
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_parse_pipeline);
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_parse_pipeline_layout, 0, writes);
cmdbuf.pushConstants(*fault_parse_pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(u32), &CACHING_PAGEBITS);
constexpr u32 num_threads = CACHING_NUMPAGES / 32; // 1 bit per page, 32 pages per thread
constexpr u32 num_workgroups = Common::DivCeil(num_threads, 64u);
cmdbuf.dispatch(num_workgroups, 1, 1);
scheduler.DeferOperation([this, mapped]() {
std::memcpy(fault_readback_cpu.data(), mapped, FAULT_READBACK_SIZE);
// Create the fault buffers in batches
boost::icl::interval_set<VAddr> fault_ranges;
for (u64 i = 0; i < FAULT_READBACK_SIZE; ++i) {
if (fault_readback_cpu[i] == 0) {
continue;
}
// Each bit is a page
const u64 page = i * 8;
for (u8 j = 0; j < 8; ++j) {
if ((fault_readback_cpu[i] & (1 << j)) == 0) {
continue;
}
const VAddr start = (page + j) << CACHING_PAGEBITS;
const VAddr end = start + CACHING_PAGESIZE;
fault_ranges +=
boost::icl::interval_set<VAddr>::interval_type::right_open(start, end);
LOG_WARNING(Render_Vulkan, "Accessed non GPU-local memory at {:#x}", start);
}
const u64* fault_ptr = std::bit_cast<const u64*>(mapped);
u64 fault_count = *(fault_ptr++);
for (u64 i = 0; i < fault_count; ++i) {
const VAddr fault = *(fault_ptr++);
const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
fault_ranges +=
boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
LOG_WARNING(Render_Vulkan, "Accessed non GPU-local memory at page {:#x}", fault);
}
for (const auto& range : fault_ranges) {
const VAddr start = range.lower();
@@ -593,7 +700,6 @@ void BufferCache::CreateFaultBuffers() {
}
}
});
scheduler.Flush();
}
void BufferCache::ResetFaultReadbackBuffer() {

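For reference, the compute pass and the deferred CPU read in CreateFaultBuffers agree on a simple layout for the region mapped from download_buffer: element 0 holds the number of faulting pages the shader counted, and the following elements hold each page's base address (page index shifted left by the caching page bits). Below is a minimal host-side sketch of that contract; it is illustrative only (the helper name and parameters are not part of this commit) and assumes the pointer returned by download_buffer.Map().

// Illustrative sketch, not part of this commit: reads the buffer written by
// fault_buffer_parser.comp the same way the deferred lambda above does.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<std::uint64_t> ParseDownloadedFaults(const std::uint8_t* mapped,
                                                 std::uint64_t max_page_faults) {
    std::uint64_t count = 0;
    std::memcpy(&count, mapped, sizeof(count)); // parsed_buffer[0]: atomic counter
    // The shader keeps counting past the buffer capacity but only stores the
    // first max_page_faults - 1 addresses, so clamp before reading.
    count = std::min<std::uint64_t>(count, max_page_faults - 1);
    std::vector<std::uint64_t> page_addresses(count);
    std::memcpy(page_addresses.data(), mapped + sizeof(std::uint64_t),
                count * sizeof(std::uint64_t)); // parsed_buffer[1..]
    return page_addresses;
}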
View File

@@ -3,7 +3,6 @@
#pragma once
#include <bitset>
#include <shared_mutex>
#include <boost/container/small_vector.hpp>
#include "common/div_ceil.h"
@@ -200,11 +199,10 @@ private:
PageManager& tracker;
StreamBuffer staging_buffer;
StreamBuffer stream_buffer;
StreamBuffer download_buffer;
Buffer gds_buffer;
Buffer bda_pagetable_buffer;
Buffer fault_readback_buffer;
// We need to define this here to avoid a stack overflow
std::array<u8, FAULT_READBACK_SIZE> fault_readback_cpu;
boost::icl::interval_set<VAddr> queued_converages;
boost::icl::interval_set<u64> convered_regions;
std::shared_mutex covered_regions_mutex;
@@ -213,6 +211,9 @@ private:
RangeSet gpu_modified_ranges;
MemoryTracker memory_tracker;
PageTable page_table;
vk::UniqueDescriptorSetLayout fault_parse_desc_layout;
vk::UniquePipeline fault_parse_pipeline;
vk::UniquePipelineLayout fault_parse_pipeline_layout;
};
} // namespace VideoCore

View File

@@ -11,6 +11,7 @@ set(SHADER_FILES
detilers/micro_32bpp.comp
detilers/micro_64bpp.comp
detilers/micro_8bpp.comp
fault_buffer_parser.comp
fs_tri.vert
fsr.comp
post_process.frag

View File

@@ -0,0 +1,40 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450
#extension GL_ARB_gpu_shader_int64 : enable
#extension GL_EXT_shader_atomic_int64 : enable
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint fault_buffer[];
};
layout(std430, binding = 1) buffer output_buf {
uint64_t parsed_buffer[];
};
layout(push_constant) uniform parsing_info {
uint caching_pagebits;
};
void main() {
uint id = gl_GlobalInvocationID.x;
uint word = fault_buffer[id];
if (word == 0u) {
return;
}
// 1 page per bit
uint base_bit = id * 32u;
while (word != 0u) {
uint bit = findLSB(word);
word &= word - 1;
uint page = base_bit + bit;
uint store_index = uint(atomicAdd(parsed_buffer[0], 1u)) + 1u;
// It is very unlikely, but should we check for overflow?
if (store_index < 1024u) { // only support 1024 page faults
parsed_buffer[store_index] = uint64_t(page) << caching_pagebits;
}
}
}
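As a closing note on how the dispatch above is sized: each invocation consumes one 32-bit word of the fault bitmap (32 pages, one bit per page) and each workgroup runs 64 invocations (local_size_x = 64). The sketch below works through that arithmetic with an assumed 4 GiB tracked range and the 16 KiB caching page size mentioned in the ObtainBuffer comment; the real CACHING_NUMPAGES and CACHING_PAGEBITS values are defined elsewhere in the buffer cache and may differ.

// Illustrative arithmetic only; the 4 GiB tracked range is an assumption.
#include <cstdint>

constexpr std::uint64_t kPageBits = 14;                           // 16 KiB pages
constexpr std::uint64_t kTrackedBytes = 4ull << 30;               // assumed 4 GiB range
constexpr std::uint64_t kNumPages = kTrackedBytes >> kPageBits;   // 262144 pages
constexpr std::uint64_t kBitmapBytes = kNumPages / 8;             // fault bitmap: 32 KiB
constexpr std::uint32_t kNumThreads = kNumPages / 32;             // one u32 word per thread: 8192
constexpr std::uint32_t kNumWorkgroups = (kNumThreads + 63) / 64; // local_size_x = 64: 128

static_assert(kBitmapBytes == 32 * 1024);
static_assert(kNumThreads == 8192);
static_assert(kNumWorkgroups == 128);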