buffer_cache: Split DMA fault handling code from buffer cache (#3809)

It's better not to have that raw code there.
TheTurtle
2025-11-18 08:46:51 +02:00
committed by GitHub
parent 5b699090e6
commit 3f86c2e94a
9 changed files with 249 additions and 290 deletions

View File

@@ -960,6 +960,8 @@ set(VIDEO_CORE src/video_core/amdgpu/cb_db_extent.h
src/video_core/buffer_cache/buffer.h
src/video_core/buffer_cache/buffer_cache.cpp
src/video_core/buffer_cache/buffer_cache.h
src/video_core/buffer_cache/fault_manager.cpp
src/video_core/buffer_cache/fault_manager.h
src/video_core/buffer_cache/memory_tracker.h
src/video_core/buffer_cache/range_set.h
src/video_core/buffer_cache/region_definitions.h

View File

@@ -190,7 +190,7 @@ void CollectShaderInfoPass(IR::Program& program, const Profile& profile) {
});
info.buffers.push_back({
.used_types = IR::Type::U32,
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::FAULT_BUFFER_SIZE),
.inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits<u32>::max()),
.buffer_type = BufferType::FaultBuffer,
.is_written = true,
});

View File

@@ -2,49 +2,42 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <mutex>
#include "common/alignment.h"
#include "common/debug.h"
#include "common/scope_exit.h"
#include "common/types.h"
#include "core/memory.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/buffer_cache/memory_tracker.h"
#include "video_core/host_shaders/fault_buffer_process_comp.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/texture_cache/texture_cache.h"
namespace VideoCore {
static constexpr size_t DataShareBufferSize = 64_KB;
static constexpr size_t StagingBufferSize = 512_MB;
static constexpr size_t DownloadBufferSize = 32_MB;
static constexpr size_t UboStreamBufferSize = 64_MB;
static constexpr size_t DownloadBufferSize = 128_MB;
static constexpr size_t DeviceBufferSize = 128_MB;
static constexpr size_t MaxPageFaults = 1024;
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
PageManager& tracker)
: instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
memory{Core::Memory::Instance()}, texture_cache{texture_cache_},
fault_manager{instance, scheduler, *this, CACHING_PAGEBITS, CACHING_NUMPAGES},
staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize},
device_buffer{instance, scheduler, MemoryUsage::DeviceLocal, DeviceBufferSize},
gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal,
0, AllFlags, BDA_PAGETABLE_SIZE},
fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE) {
0, AllFlags, BDA_PAGETABLE_SIZE} {
Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(),
"BDA Page Table Buffer");
Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer");
memory_tracker = std::make_unique<MemoryTracker>(tracker);
@@ -57,80 +50,6 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
const vk::Buffer& null_buffer = slot_buffers[null_id].buffer;
Vulkan::SetObjectName(instance.GetDevice(), null_buffer, "Null Buffer");
// Prepare the fault buffer parsing pipeline
boost::container::static_vector<vk::DescriptorSetLayoutBinding, 2> bindings{
{
.binding = 0,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
{
.binding = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
};
const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
.flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
.bindingCount = static_cast<u32>(bindings.size()),
.pBindings = bindings.data(),
};
auto [desc_layout_result, desc_layout] =
instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci);
ASSERT_MSG(desc_layout_result == vk::Result::eSuccess,
"Failed to create descriptor set layout: {}", vk::to_string(desc_layout_result));
fault_process_desc_layout = std::move(desc_layout);
const auto& module = Vulkan::Compile(HostShaders::FAULT_BUFFER_PROCESS_COMP,
vk::ShaderStageFlagBits::eCompute, instance.GetDevice());
Vulkan::SetObjectName(instance.GetDevice(), module, "Fault Buffer Parser");
const vk::SpecializationMapEntry specialization_map_entry = {
.constantID = 0,
.offset = 0,
.size = sizeof(u32),
};
const vk::SpecializationInfo specialization_info = {
.mapEntryCount = 1,
.pMapEntries = &specialization_map_entry,
.dataSize = sizeof(u32),
.pData = &CACHING_PAGEBITS,
};
const vk::PipelineShaderStageCreateInfo shader_ci = {
.stage = vk::ShaderStageFlagBits::eCompute,
.module = module,
.pName = "main",
.pSpecializationInfo = &specialization_info,
};
const vk::PipelineLayoutCreateInfo layout_info = {
.setLayoutCount = 1U,
.pSetLayouts = &(*fault_process_desc_layout),
};
auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info);
ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}",
vk::to_string(layout_result));
fault_process_pipeline_layout = std::move(layout);
const vk::ComputePipelineCreateInfo pipeline_info = {
.stage = shader_ci,
.layout = *fault_process_pipeline_layout,
};
auto [pipeline_result, pipeline] =
instance.GetDevice().createComputePipelineUnique({}, pipeline_info);
ASSERT_MSG(pipeline_result == vk::Result::eSuccess, "Failed to create compute pipeline: {}",
vk::to_string(pipeline_result));
fault_process_pipeline = std::move(pipeline);
Vulkan::SetObjectName(instance.GetDevice(), *fault_process_pipeline,
"Fault Buffer Parser Pipeline");
instance.GetDevice().destroyShaderModule(module);
// Set up garbage collection parameters
if (!instance.CanReportMemoryUsage()) {
trigger_gc_memory = DEFAULT_TRIGGER_GC_MEMORY;
@@ -656,14 +575,10 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
wanted_size = static_cast<u32>(device_addr_end - device_addr);
const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size);
const u32 size = static_cast<u32>(overlap.end - overlap.begin);
const BufferId new_buffer_id = [&] {
std::scoped_lock lk{slot_buffers_mutex};
return slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin,
AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size);
}();
const BufferId new_buffer_id =
slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin,
AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size);
auto& new_buffer = slot_buffers[new_buffer_id];
const size_t size_bytes = new_buffer.SizeBytes();
const auto cmdbuf = scheduler.CommandBuffer();
for (const BufferId overlap_id : overlap.ids) {
JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
}
@@ -672,126 +587,7 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
}
void BufferCache::ProcessFaultBuffer() {
// Run fault processing shader
const auto [mapped, offset] = download_buffer.Map(MaxPageFaults * sizeof(u64));
vk::BufferMemoryBarrier2 fault_buffer_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
vk::BufferMemoryBarrier2 download_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eShaderWrite,
.buffer = download_buffer.Handle(),
.offset = offset,
.size = MaxPageFaults * sizeof(u64),
};
std::array<vk::BufferMemoryBarrier2, 2> barriers{fault_buffer_barrier, download_barrier};
vk::DescriptorBufferInfo fault_buffer_info{
.buffer = fault_buffer.Handle(),
.offset = 0,
.range = FAULT_BUFFER_SIZE,
};
vk::DescriptorBufferInfo download_info{
.buffer = download_buffer.Handle(),
.offset = offset,
.range = MaxPageFaults * sizeof(u64),
};
boost::container::small_vector<vk::WriteDescriptorSet, 2> writes{
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &fault_buffer_info,
},
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 1,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &download_info,
},
};
download_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.fillBuffer(download_buffer.Handle(), offset, MaxPageFaults * sizeof(u64), 0);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 2,
.pBufferMemoryBarriers = barriers.data(),
});
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline);
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0,
writes);
constexpr u32 num_threads = CACHING_NUMPAGES / 32; // 1 bit per page, 32 pages per workgroup
constexpr u32 num_workgroups = Common::DivCeil(num_threads, 64u);
cmdbuf.dispatch(num_workgroups, 1, 1);
// Reset fault buffer
const vk::BufferMemoryBarrier2 reset_pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.srcAccessMask = vk::AccessFlagBits2::eShaderRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
const vk::BufferMemoryBarrier2 reset_post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = FAULT_BUFFER_SIZE,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &reset_pre_barrier,
});
cmdbuf.fillBuffer(fault_buffer.buffer, 0, FAULT_BUFFER_SIZE, 0);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &reset_post_barrier,
});
// Defer creating buffers
scheduler.DeferOperation([this, mapped]() {
// Create the fault buffers batched
boost::icl::interval_set<VAddr> fault_ranges;
const u64* fault_ptr = std::bit_cast<const u64*>(mapped);
const u32 fault_count = static_cast<u32>(*(fault_ptr++));
for (u32 i = 0; i < fault_count; ++i) {
const VAddr fault = *(fault_ptr++);
const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
fault_ranges +=
boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
LOG_INFO(Render_Vulkan, "Accessed non-GPU cached memory at {:#x}", fault);
}
for (const auto& range : fault_ranges) {
const VAddr start = range.lower();
const VAddr end = range.upper();
const u64 page_start = start >> CACHING_PAGEBITS;
const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE);
// Buffer size is in 32 bits
ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits<u32>::max(),
"Buffer size is too large");
CreateBuffer(start, static_cast<u32>(end - start));
}
});
fault_manager.ProcessFaultBuffer();
}
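
The deleted body above batched faulting pages into contiguous intervals before creating buffers, so a run of adjacent pages produces one buffer rather than one per page; FaultManager now does the same via its RangeSet. A minimal standalone sketch of that coalescing step, using the joining boost::icl::interval_set the old code relied on (the page size and fault addresses are invented for illustration):

#include <boost/icl/interval_set.hpp>
#include <cstdint>
#include <cstdio>

int main() {
    using VAddr = std::uint64_t;
    constexpr VAddr page_size = 64 * 1024; // hypothetical CACHING_PAGESIZE
    boost::icl::interval_set<VAddr> fault_ranges;
    for (const VAddr fault : {VAddr{0x10000}, VAddr{0x20000}, VAddr{0x80000}}) {
        fault_ranges += boost::icl::interval_set<VAddr>::interval_type::right_open(
            fault, fault + page_size);
    }
    // A joining interval_set merges overlapping and touching intervals, so the
    // two adjacent faults collapse into a single range: three faults, two buffers.
    for (const auto& range : fault_ranges) {
        std::printf("buffer at %#llx, size %#llx\n",
                    static_cast<unsigned long long>(range.lower()),
                    static_cast<unsigned long long>(range.upper() - range.lower()));
    }
    return 0;
}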
void BufferCache::Register(BufferId buffer_id) {
@@ -972,10 +768,7 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
}
void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
if (device_addr == 0) {
return;
}
VAddr device_addr_end = device_addr + size;
const VAddr device_addr_end = device_addr + size;
ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
RENDERER_TRACE;
VAddr start = std::max(buffer.CpuAddr(), device_addr);
@@ -985,21 +778,6 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
});
}
void BufferCache::MemoryBarrier() {
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
vk::MemoryBarrier2 barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
};
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.memoryBarrierCount = 1,
.pMemoryBarriers = &barrier,
});
}
void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value,
u32 num_bytes) {
scheduler.EndRendering();

View File

@@ -3,12 +3,12 @@
#pragma once
#include <shared_mutex>
#include <boost/container/small_vector.hpp>
#include "common/lru_cache.h"
#include "common/slot_vector.h"
#include "common/types.h"
#include "video_core/buffer_cache/buffer.h"
#include "video_core/buffer_cache/fault_manager.h"
#include "video_core/buffer_cache/range_set.h"
#include "video_core/multi_level_page_table.h"
@@ -40,9 +40,7 @@ public:
static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
static constexpr u64 DEVICE_PAGESIZE = 16_KB;
static constexpr u64 CACHING_NUMPAGES = u64{1} << (40 - CACHING_PAGEBITS);
static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
static constexpr u64 FAULT_BUFFER_SIZE = CACHING_NUMPAGES / 8; // Bit per page
// Default values for garbage collection
static constexpr s64 DEFAULT_TRIGGER_GC_MEMORY = 1_GB;
@@ -68,12 +66,6 @@ public:
bool has_stream_leap = false;
};
using IntervalSet =
boost::icl::interval_set<VAddr, std::less,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalSet::interval_type;
public:
explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
@@ -92,7 +84,7 @@ public:
/// Retrieves the fault buffer.
[[nodiscard]] Buffer* GetFaultBuffer() noexcept {
return &fault_buffer;
return fault_manager.GetFaultBuffer();
}
/// Retrieves the buffer with the specified id.
@@ -160,9 +152,6 @@ public:
/// Synchronizes all buffers needed for DMA.
void SynchronizeDmaBuffers();
/// Record memory barrier. Used for buffers when accessed via BDA.
void MemoryBarrier();
/// Runs the garbage collector.
void RunGarbageCollector();
@@ -217,6 +206,7 @@ private:
AmdGpu::Liverpool* liverpool;
Core::MemoryManager* memory;
TextureCache& texture_cache;
FaultManager fault_manager;
std::unique_ptr<MemoryTracker> memory_tracker;
StreamBuffer staging_buffer;
StreamBuffer stream_buffer;
@@ -224,8 +214,6 @@ private:
StreamBuffer device_buffer;
Buffer gds_buffer;
Buffer bda_pagetable_buffer;
Buffer fault_buffer;
std::shared_mutex slot_buffers_mutex;
Common::SlotVector<Buffer> slot_buffers;
u64 total_used_memory = 0;
u64 trigger_gc_memory = 0;
@@ -235,9 +223,6 @@ private:
RangeSet gpu_modified_ranges;
SplitRangeMap<BufferId> buffer_ranges;
PageTable page_table;
vk::UniqueDescriptorSetLayout fault_process_desc_layout;
vk::UniquePipeline fault_process_pipeline;
vk::UniquePipelineLayout fault_process_pipeline_layout;
};
} // namespace VideoCore
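
With FAULT_BUFFER_SIZE gone from this header, the bitmap is sized inside FaultManager as caching_num_pages / 8, one bit per tracked page. A worked sizing sketch; the real CACHING_PAGEBITS is defined outside this diff, so the value 16 (64 KiB pages) is only an assumption:

#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::uint64_t caching_pagebits = 16; // assumed, not from this diff
    constexpr std::uint64_t num_pages = std::uint64_t{1} << (40 - caching_pagebits);
    constexpr std::uint64_t fault_bitmap_bytes = num_pages / 8;  // 1 bit per page
    constexpr std::uint64_t bda_pagetable_bytes = num_pages * 8; // sizeof(vk::DeviceAddress)
    std::printf("pages: %llu, fault bitmap: %llu KiB, BDA page table: %llu MiB\n",
                static_cast<unsigned long long>(num_pages),
                static_cast<unsigned long long>(fault_bitmap_bytes >> 10),
                static_cast<unsigned long long>(bda_pagetable_bytes >> 20));
    return 0;
}

Under that assumption the fault bitmap is 2 MiB, while BDA_PAGETABLE_SIZE above stays at 128 MiB.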

View File

@@ -0,0 +1,177 @@
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/div_ceil.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/buffer_cache/fault_manager.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_platform.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/host_shaders/fault_buffer_process_comp.h"
namespace VideoCore {
static constexpr size_t MaxPageFaults = 1024;
static constexpr size_t PageFaultAreaSize = MaxPageFaults * sizeof(u64);
FaultManager::FaultManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_,
BufferCache& buffer_cache_, u32 caching_pagebits, u64 caching_num_pages_)
: scheduler{scheduler_}, buffer_cache{buffer_cache_},
caching_pagesize{1ULL << caching_pagebits}, caching_num_pages{caching_num_pages_},
fault_buffer_size{caching_num_pages_ / 8},
fault_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, fault_buffer_size},
download_buffer{instance, scheduler, MemoryUsage::Download,
0, AllFlags, MaxPendingFaults * PageFaultAreaSize} {
const auto device = instance.GetDevice();
Vulkan::SetObjectName(device, fault_buffer.Handle(), "Fault Buffer");
const std::array<vk::DescriptorSetLayoutBinding, 2> bindings = {{
{
.binding = 0,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
{
.binding = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
},
}};
const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
.flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
.bindingCount = 2,
.pBindings = bindings.data(),
};
fault_process_desc_layout =
Vulkan::Check(device.createDescriptorSetLayoutUnique(desc_layout_ci));
std::vector<std::string> defines{{fmt::format("CACHING_PAGEBITS={}", caching_pagebits),
fmt::format("MAX_PAGE_FAULTS={}", MaxPageFaults)}};
const auto module = Vulkan::Compile(HostShaders::FAULT_BUFFER_PROCESS_COMP,
vk::ShaderStageFlagBits::eCompute, device, defines);
Vulkan::SetObjectName(device, module, "Fault Buffer Parser");
const vk::PipelineShaderStageCreateInfo shader_ci = {
.stage = vk::ShaderStageFlagBits::eCompute,
.module = module,
.pName = "main",
};
const vk::PipelineLayoutCreateInfo layout_info = {
.setLayoutCount = 1U,
.pSetLayouts = &(*fault_process_desc_layout),
};
fault_process_pipeline_layout = Vulkan::Check(device.createPipelineLayoutUnique(layout_info));
const vk::ComputePipelineCreateInfo pipeline_info = {
.stage = shader_ci,
.layout = *fault_process_pipeline_layout,
};
fault_process_pipeline = Vulkan::Check(device.createComputePipelineUnique({}, pipeline_info));
Vulkan::SetObjectName(device, *fault_process_pipeline, "Fault Buffer Parser Pipeline");
device.destroyShaderModule(module);
}
void FaultManager::ProcessFaultBuffer() {
if (u64 wait_tick = fault_areas[current_area]) {
scheduler.Wait(wait_tick);
scheduler.PopPendingOperations();
}
const u32 offset = current_area * PageFaultAreaSize;
u8* mapped = download_buffer.mapped_data.data() + offset;
std::memset(mapped, 0, PageFaultAreaSize);
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.dstAccessMask = vk::AccessFlagBits2::eShaderRead,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = fault_buffer_size,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader,
.srcAccessMask = vk::AccessFlagBits2::eShaderWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eShaderWrite,
.buffer = fault_buffer.Handle(),
.offset = 0,
.size = fault_buffer_size,
};
const vk::DescriptorBufferInfo fault_buffer_info = {
.buffer = fault_buffer.Handle(),
.offset = 0,
.range = fault_buffer_size,
};
const vk::DescriptorBufferInfo download_info = {
.buffer = download_buffer.Handle(),
.offset = offset,
.range = PageFaultAreaSize,
};
const std::array<vk::WriteDescriptorSet, 2> writes = {{
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &fault_buffer_info,
},
{
.dstSet = VK_NULL_HANDLE,
.dstBinding = 1,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &download_info,
},
}};
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *fault_process_pipeline);
cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *fault_process_pipeline_layout, 0,
writes);
// 1 bit per page; each thread scans one 32-bit word, i.e. 32 pages
const u32 num_threads = caching_num_pages / 32;
const u32 num_workgroups = Common::DivCeil(num_threads, 64u);
cmdbuf.dispatch(num_workgroups, 1, 1);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
scheduler.DeferOperation([this, mapped, area = current_area] {
fault_ranges.Clear();
const u64* fault_buf = std::bit_cast<const u64*>(mapped);
const u32 fault_count = fault_buf[0];
for (u32 i = 1; i <= fault_count; ++i) {
fault_ranges.Add(fault_buf[i], caching_pagesize);
LOG_INFO(Render_Vulkan, "Accessed non-GPU cached memory at {:#x}", fault_buf[i]);
}
fault_ranges.ForEach([&](VAddr start, VAddr end) {
ASSERT_MSG((end - start) <= std::numeric_limits<u32>::max(),
"Buffer size is too large");
buffer_cache.FindBuffer(start, static_cast<u32>(end - start));
});
fault_areas[area] = 0;
});
fault_areas[current_area++] = scheduler.CurrentTick();
current_area %= MaxPendingFaults;
}
} // namespace VideoCore
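
ProcessFaultBuffer recycles MaxPendingFaults download areas in a ring: each slot is stamped with the scheduler tick of the submission that fills it, and before reuse the manager waits on that tick and pops pending operations so the deferred parse has run. A minimal sketch of the recycling pattern, with a hypothetical Timeline type standing in for Vulkan::Scheduler:

#include <array>
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for the scheduler's timeline semaphore.
struct Timeline {
    std::uint64_t current_tick = 1;
    void Wait(std::uint64_t /*tick*/) { /* block until the GPU passes tick */ }
};

template <std::size_t N>
class SlotRing {
public:
    explicit SlotRing(Timeline& timeline) : timeline_{timeline} {}

    // Returns the index of a slot that is now safe to overwrite.
    std::size_t Acquire() {
        if (const std::uint64_t tick = ticks_[current_]) {
            timeline_.Wait(tick); // the oldest slot may still be in flight
        }
        const std::size_t slot = current_;
        ticks_[slot] = timeline_.current_tick; // stamp with the submitting tick
        current_ = (current_ + 1) % N;
        return slot;
    }

private:
    Timeline& timeline_;
    std::array<std::uint64_t, N> ticks_{}; // 0 means "free"
    std::size_t current_ = 0;
};

The real code additionally zeroes the stamp from the deferred callback (fault_areas[area] = 0) once the CPU has consumed the area, so an already-drained slot is reacquired without waiting.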

View File

@@ -0,0 +1,42 @@
// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include "video_core/buffer_cache/buffer.h"
#include "video_core/buffer_cache/range_set.h"
namespace VideoCore {
class BufferCache;
class FaultManager {
static constexpr size_t MaxPendingFaults = 8;
public:
explicit FaultManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
BufferCache& buffer_cache, u32 caching_pagebits, u64 caching_num_pages);
[[nodiscard]] Buffer* GetFaultBuffer() noexcept {
return &fault_buffer;
}
void ProcessFaultBuffer();
private:
Vulkan::Scheduler& scheduler;
BufferCache& buffer_cache;
RangeSet fault_ranges;
u64 caching_pagesize;
u64 caching_num_pages;
u64 fault_buffer_size;
Buffer fault_buffer;
Buffer download_buffer;
std::array<u64, MaxPendingFaults> fault_areas{};
u32 current_area{};
vk::UniqueDescriptorSetLayout fault_process_desc_layout;
vk::UniquePipeline fault_process_pipeline;
vk::UniquePipelineLayout fault_process_pipeline_layout;
};
} // namespace VideoCore

View File

@@ -13,30 +13,23 @@ layout(std430, binding = 0) buffer input_buf {
layout(std430, binding = 1) buffer output_buf {
uint64_t download_buffer[];
};
// Overlapping view of binding 1 for 32-bit atomics
layout(std430, binding = 1) buffer output_buf32 {
uint download_buffer32[];
};
layout(constant_id = 0) const uint CACHING_PAGEBITS = 0;
void main() {
uint id = gl_GlobalInvocationID.x;
const uint id = gl_GlobalInvocationID.x;
uint word = fault_buffer[id];
if (word == 0u) {
return;
}
// 1 page per bit
uint base_bit = id * 32u;
fault_buffer[id] = 0u;
const uint base_bit = id * 32u;
while (word != 0u) {
uint bit = findLSB(word);
word &= word - 1;
uint page = base_bit + bit;
uint store_index = atomicAdd(download_buffer32[0], 1u) + 1u;
// It is very unlikely, but should we check for overflow?
if (store_index < 1024u) { // only support 1024 page faults
download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS;
const uint store_index = atomicAdd(download_buffer32[0], 1u) + 1u;
if (store_index >= MAX_PAGE_FAULTS) {
return;
}
const uint bit = findLSB(word);
word &= word - 1;
const uint page = base_bit + bit;
download_buffer[store_index] = uint64_t(page) << CACHING_PAGEBITS;
}
}
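
The shader binds the output buffer twice (once as uint64_t for the fault addresses, once as uint so the counter in the first slot can be bumped with a 32-bit atomicAdd), and the rewrite bounds it by the MAX_PAGE_FAULTS define rather than a hard-coded 1024. Each thread then walks the set bits of its 32-bit fault word with the clear-lowest-set-bit idiom: findLSB(word) yields the next page index and word &= word - 1 removes it. The same loop in C++20, for reference:

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
    std::uint32_t word = 0b1010'0100u; // bits 2, 5 and 7 set: three faulting pages
    const std::uint32_t base_bit = 0;  // id * 32u in the shader
    while (word != 0u) {
        const std::uint32_t bit = static_cast<std::uint32_t>(std::countr_zero(word)); // findLSB
        word &= word - 1u; // clear the lowest set bit
        std::printf("faulting page %u\n", base_bit + bit);
    }
    return 0;
}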

View File

@@ -407,18 +407,13 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
if (uses_dma) {
// We only use fault buffer for DMA right now.
{
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
for (auto& range : mapped_ranges) {
buffer_cache.SynchronizeBuffersInRange(range.lower(),
range.upper() - range.lower());
}
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
for (auto& range : mapped_ranges) {
buffer_cache.SynchronizeBuffersInRange(range.lower(), range.upper() - range.lower());
}
buffer_cache.MemoryBarrier();
fault_process_pending = true;
}
fault_process_pending |= uses_dma;
return true;
}

View File

@@ -84,15 +84,6 @@ void Scheduler::Wait(u64 tick) {
Flush(info);
}
master_semaphore.Wait(tick);
// CAUTION: This can introduce unexpected variation in the wait time.
// We don't currently sync the GPU, and some games are very sensitive to this.
// If this becomes a problem, it can be commented out.
// Ideally we would implement proper GPU sync.
while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) {
pending_ops.front().callback();
pending_ops.pop();
}
}
void Scheduler::PopPendingOperations() {
@@ -109,9 +100,7 @@ void Scheduler::AllocateWorkerCommandBuffers() {
};
current_cmdbuf = command_pool.Commit();
auto begin_result = current_cmdbuf.begin(begin_info);
ASSERT_MSG(begin_result == vk::Result::eSuccess, "Failed to begin command buffer: {}",
vk::to_string(begin_result));
Check(current_cmdbuf.begin(begin_info));
// Invalidate dynamic state so it gets applied to the new command buffer.
dynamic_state.Invalidate();
@@ -139,9 +128,7 @@ void Scheduler::SubmitExecution(SubmitInfo& info) {
#endif
EndRendering();
auto end_result = current_cmdbuf.end();
ASSERT_MSG(end_result == vk::Result::eSuccess, "Failed to end command buffer: {}",
vk::to_string(end_result));
Check(current_cmdbuf.end());
const vk::Semaphore timeline = master_semaphore.Handle();
info.AddSignal(timeline, signal_value);
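
The scheduler changes fold each result/ASSERT_MSG pair into Vulkan::Check, the same helper the new FaultManager uses for its pipeline objects. Check's definition is not part of this diff; a plausible sketch, assuming vulkan-hpp's ResultValue wrapper and the project's existing ASSERT_MSG macro:

#include <utility>
#include <vulkan/vulkan.hpp>

// Hypothetical reconstruction; the real helper lives elsewhere in the tree.
namespace Vulkan {

// Unwraps a vulkan-hpp ResultValue, asserting on failure.
template <typename T>
T Check(vk::ResultValue<T> result) {
    ASSERT_MSG(result.result == vk::Result::eSuccess, "Vulkan call failed: {}",
               vk::to_string(result.result));
    return std::move(result.value);
}

// Overload for calls that return only a vk::Result.
inline void Check(vk::Result result) {
    ASSERT_MSG(result == vk::Result::eSuccess, "Vulkan call failed: {}",
               vk::to_string(result));
}

} // namespace Vulkan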