shadPS4 mirror: https://github.com/shadps4-emu/shadPS4.git

Better sync (WIP, breaks PR now)

commit 19473b2672
parent 5947526a4d
@@ -204,6 +204,8 @@ void EmitContext::DefineArithmeticTypes() {
// Used to calculate fault readback buffer position and mask
u32_three_value = ConstU32(3U);
u32_seven_value = ConstU32(7U);
bda_first_time_mask = Constant(U64, 0x1ULL);
bda_first_time_inv_mask = Constant(U64, ~0x1ULL);
}
}

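The new constants above back the shader-side fault tracking: u32_three_value and u32_seven_value split a page index into a byte offset and a bit position in the fault readback bitmap, while bda_first_time_mask and bda_first_time_inv_mask isolate bit 0 of each 64-bit BDA page-table entry, which doubles as an "already DMA-synced" flag. A minimal host-side sketch of that bit math (helper names are illustrative, not emitter code):

#include <cstdint>

constexpr uint64_t kFirstTimeMask = 0x1ULL; // mirrors bda_first_time_mask

// Split a page index into the byte and bit used to mark it in the fault bitmap.
inline uint32_t FaultByteIndex(uint32_t page) {
    return page >> 3u; // u32_three_value
}
inline uint32_t FaultBitMask(uint32_t page) {
    return 1u << (page & 7u); // u32_seven_value
}

// Bit 0 of a BDA page-table entry: clear means the page has not been DMA-synced yet.
inline bool IsFirstTimeAccess(uint64_t bda_entry) {
    return (bda_entry & kFirstTimeMask) == 0;
}
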
@@ -175,8 +175,12 @@ public:

template <typename Func>
Id EmitMemoryAccess(Id type, Id address, Func&& fallback) {
const Id first_time_label = OpLabel();
const Id after_first_time_label = OpLabel();
const Id fallback_label = OpLabel();
const Id available_label = OpLabel();
const Id save_unmasked_label = OpLabel();
const Id after_save_unmasked_label = OpLabel();
const Id merge_label = OpLabel();

// Get page BDA
@@ -187,13 +191,14 @@ public:
const Id bda_ptr = OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32);
const Id bda = OpLoad(U64, bda_ptr);

// Check if the value is available
const Id bda_eq_zero = OpIEqual(U1[1], bda, u64_zero_value);
OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(bda_eq_zero, fallback_label, available_label);
// Check if it is the first time we access this page
const Id bda_and_mask = OpBitwiseAnd(U64, bda, bda_first_time_mask);
const Id first_time = OpIEqual(U1[1], bda_and_mask, u64_zero_value);
OpSelectionMerge(after_first_time_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(first_time, first_time_label, after_first_time_label);

// Fallback (and mark on fault buffer)
AddLabel(fallback_label);
// First time access
AddLabel(first_time_label);
const auto& fault_buffer = buffers[fault_readback_index];
const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
const Id page_div8 = OpShiftRightLogical(U32[1], page32, u32_three_value);
@@ -205,11 +210,35 @@ public:
const Id page_mask8 = OpUConvert(U8, page_mask);
const Id fault_value_masked = OpBitwiseOr(U8, fault_value, page_mask8);
OpStore(fault_ptr, fault_value_masked);
OpBranch(after_first_time_label);

// Check if the value is available
AddLabel(after_first_time_label);
const Id bda_eq_zero = OpIEqual(U1[1], bda, u64_zero_value);
OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(bda_eq_zero, fallback_label, available_label);

// Fallback (and mark on fault buffer)
AddLabel(fallback_label);
const Id fallback_result = fallback();
OpBranch(merge_label);

// Get value from memory
// Value is available
AddLabel(available_label);
const Id unmasked_bda = OpBitwiseAnd(U64, bda, bda_first_time_inv_mask);

// Check if BDA was masked
const Id had_mask = OpIEqual(U1[1], bda, unmasked_bda);
OpSelectionMerge(save_unmasked_label, spv::SelectionControlMask::MaskNone);
OpBranchConditional(had_mask, save_unmasked_label, after_save_unmasked_label);

// Save unmasked BDA
AddLabel(save_unmasked_label);
OpStore(bda_ptr, unmasked_bda);
OpBranch(after_save_unmasked_label);

// Load value
AddLabel(after_save_unmasked_label);
const Id offset_in_bda = OpBitwiseAnd(U64, address, caching_pagemask_value);
const Id addr = OpIAdd(U64, bda, offset_in_bda);
const PointerType pointer_type = PointerTypeFromType(type);
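Taken together, the SPIR-V emitted above implements the following control flow. The sketch below is a rough host-side equivalent for readability only; the page-bit count and helper names are assumptions, and the real code emits labels and branches rather than running on the CPU:

#include <cstdint>

template <typename T, typename Fallback>
T EmitMemoryAccessSketch(uint64_t* bda_pagetable, uint8_t* fault_bitmap, uint64_t address,
                         uint64_t page_mask, Fallback&& fallback) {
    const uint32_t page = static_cast<uint32_t>(address >> 12); // assuming 4 KiB caching pages
    const uint64_t bda = bda_pagetable[page];

    // First-time access: record the page in the fault readback bitmap so the host
    // can create/sync a buffer for it on the next fault-processing pass.
    if ((bda & 0x1ULL) == 0) {
        fault_bitmap[page >> 3] |= static_cast<uint8_t>(1u << (page & 7));
    }

    // Page has no device address yet: take the IR fallback path.
    if (bda == 0) {
        return fallback();
    }

    // Page available: strip the flag bit, write the unmasked entry back, and load
    // from the buffer device address plus the offset within the page.
    const uint64_t unmasked = bda & ~0x1ULL;
    bda_pagetable[page] = unmasked;
    return *reinterpret_cast<const T*>(unmasked + (address & page_mask));
}
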
@@ -261,9 +290,12 @@ public:
Id u32_zero_value{};
Id f32_zero_value{};
Id u64_zero_value{};
Id u64_one_value{};

Id caching_pagebits_value{};
Id caching_pagemask_value{};
Id bda_first_time_mask{};
Id bda_first_time_inv_mask{};

Id shared_u8{};
Id shared_u16{};
@@ -115,6 +115,7 @@ void CollectShaderInfoPass(IR::Program& program) {
.used_types = IR::Type::U64,
.inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
.buffer_type = BufferType::BdaPagetable,
.is_written = true,
});
program.info.buffers.push_back({
.used_types = IR::Type::U8,
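For context, this pass attaches two helper buffers to every shader that uses DMA: the BDA page table (one 64-bit device address per caching page, writable so the shader can clear the first-time bit in place) and the fault readback buffer (one bit per caching page). Their sizes relate as in this small sketch, with the page count taken as a parameter rather than the project's actual constant:

#include <cstdint>

struct DmaHelperSizes {
    uint64_t bda_pagetable_bytes; // one vk::DeviceAddress (8 bytes) per caching page
    uint64_t fault_bitmap_bytes;  // one bit per caching page
};

constexpr DmaHelperSizes ComputeDmaHelperSizes(uint64_t caching_numpages) {
    return {caching_numpages * sizeof(uint64_t), caching_numpages / 8};
}
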
@@ -10,6 +10,7 @@
#include "video_core/host_shaders/fault_buffer_process_comp.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/texture_cache/texture_cache.h"
@@ -350,7 +351,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
// Check if any buffer contains the full requested range.
const u64 page = gpu_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (buffer_id) {
Buffer& buffer = slot_buffers[buffer_id];
if (buffer.IsInBounds(gpu_addr, size)) {
@@ -373,7 +374,7 @@ bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (!buffer_id) {
++page;
continue;
@@ -403,7 +404,7 @@ BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
return NULL_BUFFER_ID;
}
const u64 page = device_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (!buffer_id) {
return CreateBuffer(device_addr, size);
}
@@ -488,7 +489,7 @@ BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 w
}
for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE);
device_addr += CACHING_PAGESIZE) {
const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS];
const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS].buffer_id;
if (!overlap_id) {
continue;
}
@@ -599,7 +600,11 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
const u64 size_pages = size >> CACHING_PAGEBITS;
bda_addrs.reserve(size_pages);
for (u64 i = 0; i < size_pages; ++i) {
bda_addrs.push_back(new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS));
vk::DeviceAddress addr = new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS);
const bool is_dma_synced = page_table[start_page + i].is_dma_synced;
// Use the LSB to mark whether the page is DMA synced. If it is not synced,
// we haven't accessed it yet.
bda_addrs.push_back(addr | (is_dma_synced ? 0x1 : 0x0));
}
WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
bda_addrs.size() * sizeof(vk::DeviceAddress));
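The change above packs the per-page DMA-sync state into bit 0 of the device address written to the BDA page table; device addresses are page-aligned, so the low bit is otherwise unused. A small sketch of the encode/decode pair with illustrative helper names; the decode side is what the shader calls the "unmasked" BDA:

#include <cstdint>

constexpr uint64_t kDmaSyncedBit = 0x1ULL;

inline uint64_t EncodeBdaEntry(uint64_t device_addr, bool is_dma_synced) {
    return device_addr | (is_dma_synced ? kDmaSyncedBit : 0ULL);
}

inline uint64_t DecodeBdaAddress(uint64_t entry) {
    return entry & ~kDmaSyncedBit;
}

inline bool IsDmaSynced(uint64_t entry) {
    return (entry & kDmaSyncedBit) != 0;
}
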
@@ -727,17 +732,24 @@ void BufferCache::ProcessFaultBuffer() {
const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
fault_ranges +=
boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
LOG_WARNING(Render_Vulkan, "Accessed non GPU-local memory at page {:#x}", fault);
LOG_WARNING(Render_Vulkan, "First time DMA access to memory at page {:#x}", fault);
}
for (const auto& range : fault_ranges) {
const VAddr start = range.lower();
const VAddr end = range.upper();
// Buffer size is 32 bits
for (VAddr addr = start; addr < end; addr += std::numeric_limits<u32>::max()) {
const u32 size_buffer = std::min<u32>(end - addr, std::numeric_limits<u32>::max());
CreateBuffer(addr, size_buffer);
const VAddr page_start = start >> CACHING_PAGEBITS;
const VAddr page_end = Common::DivCeil(end, CACHING_PAGESIZE);
// Mark the pages as synced
for (u64 page = page_start; page < page_end; ++page) {
page_table[page].is_dma_synced = true;
}
// Buffer size is limited to 32 bits
ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits<u32>::max(),
"Buffer size is too large");
// Only create a buffer if the current range doesn't fit in an existing one
FindBuffer(start, static_cast<u32>(end - start));
}
rasterizer.AddDmaSyncRanges(fault_ranges);
});
}
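ProcessFaultBuffer inserts one right-open interval per faulting page into a boost::icl::interval_set, which is a joining set: touching intervals merge automatically, so runs of adjacent faulting pages collapse into a single range before the pages are marked synced and FindBuffer is called. A minimal standalone example of that coalescing behavior (page size chosen for illustration):

#include <boost/icl/interval_set.hpp>
#include <cstdint>

int main() {
    using VAddr = std::uint64_t;
    constexpr VAddr page_size = 0x1000;
    boost::icl::interval_set<VAddr> fault_ranges;
    for (VAddr page : {VAddr{0x10}, VAddr{0x11}, VAddr{0x12}, VAddr{0x40}}) {
        const VAddr lo = page * page_size;
        fault_ranges +=
            boost::icl::interval_set<VAddr>::interval_type::right_open(lo, lo + page_size);
    }
    // Two intervals remain: [0x10000, 0x13000) and [0x40000, 0x41000).
    return boost::icl::interval_count(fault_ranges) == 2 ? 0 : 1;
}
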
@@ -759,9 +771,9 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
for (u64 page = page_begin; page != page_end; ++page) {
if constexpr (insert) {
page_table[page] = buffer_id;
page_table[page].buffer_id = buffer_id;
} else {
page_table[page] = BufferId{};
page_table[page].buffer_id = BufferId{};
}
}
}

@@ -46,8 +46,13 @@ public:
static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
static constexpr u64 FAULT_READBACK_SIZE = CACHING_NUMPAGES / 8; // Bit per page

struct PageData {
BufferId buffer_id{};
bool is_dma_synced = false;
};

struct Traits {
using Entry = BufferId;
using Entry = PageData;
static constexpr size_t AddressSpaceBits = 40;
static constexpr size_t FirstLevelBits = 16;
static constexpr size_t PageBits = CACHING_PAGEBITS;
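The page table entry grows from a bare BufferId into PageData, so each caching page now tracks both which buffer covers it and whether it has already been DMA-synced; the .buffer_id accesses in the hunks above and below follow from this. A compact sketch of the shape of a lookup (the types and helper here are hypothetical stand-ins, not the project's definitions):

#include <cstdint>
#include <span>

struct BufferId {
    uint32_t index{};
    explicit operator bool() const { return index != 0; }
};

struct PageData {
    BufferId buffer_id{};
    bool is_dma_synced = false;
};

// Hypothetical helper: fetch the entry for an address given the caching page bits.
inline PageData& PageEntry(std::span<PageData> page_table, uint64_t device_addr,
                           uint64_t caching_pagebits) {
    return page_table[device_addr >> caching_pagebits];
}
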
@@ -126,7 +131,7 @@ public:
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);

/// Return buffer id for the specified region
[[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
BufferId FindBuffer(VAddr device_addr, u32 size);

/// Queue a region for coverage for DMA.
void QueueMemoryCoverage(VAddr device_addr, u64 size);
@@ -148,7 +153,7 @@ private:
void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page];
const BufferId buffer_id = page_table[page].buffer_id;
if (!buffer_id) {
++page;
continue;

@@ -456,7 +456,7 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
buffer_infos.clear();
image_infos.clear();

bool fault_enable = false;
bool uses_dma = false;

// Bind resource buffers and textures.
Shader::Backend::Bindings binding{};
@@ -469,26 +469,28 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
BindBuffers(*stage, binding, push_data);
BindTextures(*stage, binding);

fault_enable |= stage->dma_types != Shader::IR::Type::Void;
uses_dma |= stage->dma_types != Shader::IR::Type::Void;
}

pipeline->BindResources(set_writes, buffer_barriers, push_data);

if (!fault_process_pending && fault_enable) {
if (uses_dma) {
fault_process_pending = true;
// We only use fault buffer for DMA right now.
// First, import any queued host memory, then sync every mapped
// region that is cached on GPU memory.
buffer_cache.CoverQueuedRegions();
{
std::shared_lock lock{mapped_ranges_mutex};
for (const auto& range : mapped_ranges) {
std::shared_lock lock{dma_sync_mapped_ranges_mutex};
for (const auto& range : dma_sync_mapped_ranges) {
buffer_cache.SynchronizeRange(range.lower(), range.upper() - range.lower());
}
}
buffer_cache.MemoryBarrier();
}

fault_process_pending |= uses_dma;

return true;
}
@@ -727,6 +729,15 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin
}
}

void Rasterizer::AddDmaSyncRanges(const boost::icl::interval_set<VAddr>& ranges) {
dma_sync_ranges += ranges;
{
std::scoped_lock lock{dma_sync_mapped_ranges_mutex};
std::shared_lock lock2(mapped_ranges_mutex);
dma_sync_mapped_ranges = mapped_ranges & ranges;
}
}

void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& state) {
int cb_index = 0;
for (auto attach_idx = 0u; attach_idx < state.num_color_attachments; ++attach_idx) {
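AddDmaSyncRanges, together with MapMemory/UnmapMemory below, keeps dma_sync_mapped_ranges as the intersection of the mapped ranges and the ranges DMA has touched, so BindResources only synchronizes DMA-relevant memory. The intersection itself is boost::icl's operator& on interval sets, as in this small standalone example (addresses are illustrative):

#include <boost/icl/interval_set.hpp>
#include <cstdint>

int main() {
    using VAddr = std::uint64_t;
    using Ranges = boost::icl::interval_set<VAddr>;
    Ranges mapped, dma;
    mapped += Ranges::interval_type::right_open(VAddr{0x1000}, VAddr{0x9000});
    dma += Ranges::interval_type::right_open(VAddr{0x4000}, VAddr{0xC000});
    const Ranges dma_mapped = mapped & dma; // -> [0x4000, 0x9000)
    return boost::icl::contains(dma_mapped,
                                Ranges::interval_type::right_open(VAddr{0x4000}, VAddr{0x9000}))
               ? 0
               : 1;
}
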
@@ -969,14 +980,15 @@ bool Rasterizer::IsMapped(VAddr addr, u64 size) {
}
const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);

std::shared_lock lock{mapped_ranges_ismapped_mutex};
std::shared_lock lock{mapped_ranges_mutex};
return boost::icl::contains(mapped_ranges, range);
}

void Rasterizer::MapMemory(VAddr addr, u64 size) {
{
std::scoped_lock lock{mapped_ranges_mutex, mapped_ranges_ismapped_mutex};
std::scoped_lock lock{mapped_ranges_mutex, dma_sync_mapped_ranges_mutex};
mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
dma_sync_mapped_ranges = mapped_ranges & dma_sync_ranges;
}
page_manager.OnGpuMap(addr, size);
buffer_cache.QueueMemoryCoverage(addr, size);
@@ -987,8 +999,9 @@ void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
texture_cache.UnmapMemory(addr, size);
page_manager.OnGpuUnmap(addr, size);
{
std::scoped_lock lock{mapped_ranges_mutex, mapped_ranges_ismapped_mutex};
std::scoped_lock lock{mapped_ranges_mutex, dma_sync_mapped_ranges_mutex};
mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
dma_sync_mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
}
}

@@ -100,7 +100,11 @@ private:

bool IsComputeMetaClear(const Pipeline* pipeline);

void AddDmaSyncRanges(const boost::icl::interval_set<VAddr>& ranges);

private:
friend class VideoCore::BufferCache;

const Instance& instance;
Scheduler& scheduler;
VideoCore::PageManager page_manager;
@@ -109,9 +113,11 @@ private:
AmdGpu::Liverpool* liverpool;
Core::MemoryManager* memory;
boost::icl::interval_set<VAddr> mapped_ranges;
boost::icl::interval_set<VAddr> dma_sync_ranges;
boost::icl::interval_set<VAddr> dma_sync_mapped_ranges;
// Use two mutexes to avoid undefined behavior when taking a shared lock
std::shared_mutex mapped_ranges_mutex;
std::shared_mutex mapped_ranges_ismapped_mutex;
std::shared_mutex dma_sync_mapped_ranges_mutex;
PipelineCache pipeline_cache;

boost::container::static_vector<