diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 63b05a3c9..19295b0c3 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -204,6 +204,8 @@ void EmitContext::DefineArithmeticTypes() {
         // Used to calculate fault readback buffer position and mask
         u32_three_value = ConstU32(3U);
         u32_seven_value = ConstU32(7U);
+        bda_first_time_mask = Constant(U64, 0x1ULL);
+        bda_first_time_inv_mask = Constant(U64, ~0x1ULL);
     }
 }
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index b0e96e0ad..c73f54f3c 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -175,8 +175,12 @@ public:
     template <typename Func>
     Id EmitMemoryAccess(Id type, Id address, Func&& fallback) {
+        const Id first_time_label = OpLabel();
+        const Id after_first_time_label = OpLabel();
         const Id fallback_label = OpLabel();
         const Id available_label = OpLabel();
+        const Id save_unmasked_label = OpLabel();
+        const Id after_save_unmasked_label = OpLabel();
         const Id merge_label = OpLabel();
 
         // Get page BDA
@@ -187,13 +191,14 @@ public:
         const Id bda_ptr = OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32);
         const Id bda = OpLoad(U64, bda_ptr);
 
-        // Check if the value is available
-        const Id bda_eq_zero = OpIEqual(U1[1], bda, u64_zero_value);
-        OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
-        OpBranchConditional(bda_eq_zero, fallback_label, available_label);
+        // Check if it is the first time we access this page
+        const Id bda_and_mask = OpBitwiseAnd(U64, bda, bda_first_time_mask);
+        const Id first_time = OpIEqual(U1[1], bda_and_mask, u64_zero_value);
+        OpSelectionMerge(after_first_time_label, spv::SelectionControlMask::MaskNone);
+        OpBranchConditional(first_time, first_time_label, after_first_time_label);
 
-        // Fallback (and mark on faul buffer)
-        AddLabel(fallback_label);
+        // First time access
+        AddLabel(first_time_label);
         const auto& fault_buffer = buffers[fault_readback_index];
         const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
         const Id page_div8 = OpShiftRightLogical(U32[1], page32, u32_three_value);
@@ -205,11 +210,35 @@ public:
         const Id page_mask8 = OpUConvert(U8, page_mask);
         const Id fault_value_masked = OpBitwiseOr(U8, fault_value, page_mask8);
         OpStore(fault_ptr, fault_value_masked);
+        OpBranch(after_first_time_label);
+
+        // Check if the value is available
+        AddLabel(after_first_time_label);
+        const Id bda_eq_zero = OpIEqual(U1[1], bda, u64_zero_value);
+        OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
+        OpBranchConditional(bda_eq_zero, fallback_label, available_label);
+
+        // Fallback (and mark on fault buffer)
+        AddLabel(fallback_label);
         const Id fallback_result = fallback();
         OpBranch(merge_label);
 
-        // Get value from memory
+        // Value is available
         AddLabel(available_label);
+        const Id unmasked_bda = OpBitwiseAnd(U64, bda, bda_first_time_inv_mask);
+
+        // Check if BDA was masked
+        const Id had_mask = OpIEqual(U1[1], bda, unmasked_bda);
+        OpSelectionMerge(save_unmasked_label, spv::SelectionControlMask::MaskNone);
+        OpBranchConditional(had_mask, save_unmasked_label, after_save_unmasked_label);
+
+        // Save unmasked BDA
+        AddLabel(save_unmasked_label);
+        OpStore(bda_ptr, unmasked_bda);
+        OpBranch(after_save_unmasked_label);
+
+        // Load value
+        AddLabel(after_save_unmasked_label);
         const Id offset_in_bda = OpBitwiseAnd(U64, address, caching_pagemask_value);
         const Id addr = OpIAdd(U64, bda, offset_in_bda);
         const PointerType pointer_type = PointerTypeFromType(type);
@@ -261,9 +290,12 @@ public:
     Id u32_zero_value{};
     Id f32_zero_value{};
     Id u64_zero_value{};
+    Id u64_one_value{};
 
     Id caching_pagebits_value{};
     Id caching_pagemask_value{};
+    Id bda_first_time_mask{};
+    Id bda_first_time_inv_mask{};
 
     Id shared_u8{};
     Id shared_u16{};
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index 6bdcda8da..a7258bee2 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -115,6 +115,7 @@ void CollectShaderInfoPass(IR::Program& program) {
             .used_types = IR::Type::U64,
             .inline_cbuf = AmdGpu::Buffer::Placeholder(VideoCore::BufferCache::BDA_PAGETABLE_SIZE),
             .buffer_type = BufferType::BdaPagetable,
+            .is_written = true,
         });
         program.info.buffers.push_back({
            .used_types = IR::Type::U8,
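Note on the encoding the shader-side changes above rely on: each BDA page-table entry is a 64-bit device address whose low bit is repurposed as a "page already DMA-synced" flag. The addresses written into the table are aligned well past a single byte, so bit 0 is always free; a zero entry still means "no backing buffer yet" and takes the fallback path. A minimal host-side sketch of the same packing, with hypothetical helper names (PackBdaEntry, IsFirstAccess, UnmaskBda) that are not part of this change:

#include <cstdint>

constexpr uint64_t kFirstTimeMask = 0x1ULL;     // mirrors bda_first_time_mask
constexpr uint64_t kFirstTimeInvMask = ~0x1ULL; // mirrors bda_first_time_inv_mask

// Pack a page-aligned device address together with the "already synced" flag.
constexpr uint64_t PackBdaEntry(uint64_t page_aligned_addr, bool synced) {
    return page_aligned_addr | (synced ? kFirstTimeMask : 0ULL);
}

// Flag clear => first GPU access to this page; the shader records it in the fault buffer.
constexpr bool IsFirstAccess(uint64_t entry) {
    return (entry & kFirstTimeMask) == 0ULL;
}

// Strip the flag before adding the intra-page offset, as the emitted SPIR-V does.
constexpr uint64_t UnmaskBda(uint64_t entry) {
    return entry & kFirstTimeInvMask;
}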
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 5f7ad4fbd..b75a8a710 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -10,6 +10,7 @@
 #include "video_core/host_shaders/fault_buffer_process_comp.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
+#include "video_core/renderer_vulkan/vk_rasterizer.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/renderer_vulkan/vk_shader_util.h"
 #include "video_core/texture_cache/texture_cache.h"
@@ -350,7 +351,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
 std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
     // Check if any buffer contains the full requested range.
     const u64 page = gpu_addr >> CACHING_PAGEBITS;
-    const BufferId buffer_id = page_table[page];
+    const BufferId buffer_id = page_table[page].buffer_id;
     if (buffer_id) {
         Buffer& buffer = slot_buffers[buffer_id];
         if (buffer.IsInBounds(gpu_addr, size)) {
@@ -373,7 +374,7 @@ bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
     const VAddr end_addr = addr + size;
     const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
     for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
-        const BufferId buffer_id = page_table[page];
+        const BufferId buffer_id = page_table[page].buffer_id;
         if (!buffer_id) {
             ++page;
             continue;
         }
@@ -403,7 +404,7 @@ BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
         return NULL_BUFFER_ID;
     }
     const u64 page = device_addr >> CACHING_PAGEBITS;
-    const BufferId buffer_id = page_table[page];
+    const BufferId buffer_id = page_table[page].buffer_id;
     if (!buffer_id) {
         return CreateBuffer(device_addr, size);
     }
@@ -488,7 +489,7 @@ BufferCache::OverlapResult BufferCache::ResolveOverlaps(VAddr device_addr, u32 w
     }
     for (; device_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE);
          device_addr += CACHING_PAGESIZE) {
-        const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS];
+        const BufferId overlap_id = page_table[device_addr >> CACHING_PAGEBITS].buffer_id;
         if (!overlap_id) {
             continue;
         }
@@ -599,7 +600,11 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     const u64 size_pages = size >> CACHING_PAGEBITS;
     bda_addrs.reserve(size_pages);
     for (u64 i = 0; i < size_pages; ++i) {
-        bda_addrs.push_back(new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS));
+        vk::DeviceAddress addr = new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS);
+        const bool is_dma_synced = page_table[start_page + i].is_dma_synced;
+        // Use LSB to mark if the page is DMA synced. If it is not synced,
+        // we haven't accessed it yet.
+        bda_addrs.push_back(addr | (is_dma_synced ? 0x1 : 0x0));
     }
     WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
                     bda_addrs.size() * sizeof(vk::DeviceAddress));
@@ -727,17 +732,24 @@ void BufferCache::ProcessFaultBuffer() {
             const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
             fault_ranges +=
                 boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
-            LOG_WARNING(Render_Vulkan, "Accessed non GPU-local memory at page {:#x}", fault);
+            LOG_WARNING(Render_Vulkan, "First time DMA access to memory at page {:#x}", fault);
         }
         for (const auto& range : fault_ranges) {
             const VAddr start = range.lower();
             const VAddr end = range.upper();
-            // Buffer size is 32 bits
-            for (VAddr addr = start; addr < end; addr += std::numeric_limits<u32>::max()) {
-                const u32 size_buffer = std::min(end - addr, std::numeric_limits<u32>::max());
-                CreateBuffer(addr, size_buffer);
+            const VAddr page_start = start >> CACHING_PAGEBITS;
+            const VAddr page_end = Common::DivCeil(end, CACHING_PAGESIZE);
+            // Mark the pages as synced
+            for (u64 page = page_start; page < page_end; ++page) {
+                page_table[page].is_dma_synced = true;
             }
+            // Buffer size must fit in 32 bits
+            ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits<u32>::max(),
+                       "Buffer size is too large");
+            // Only create a buffer if the current range doesn't fit in an existing one
+            FindBuffer(start, static_cast<u32>(end - start));
         }
+        rasterizer.AddDmaSyncRanges(fault_ranges);
     });
 }
@@ -759,9 +771,9 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
     const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
     for (u64 page = page_begin; page != page_end; ++page) {
         if constexpr (insert) {
-            page_table[page] = buffer_id;
+            page_table[page].buffer_id = buffer_id;
         } else {
-            page_table[page] = BufferId{};
+            page_table[page].buffer_id = BufferId{};
         }
     }
 }
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 3daa27ef2..1baa365f3 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -46,8 +46,13 @@ public:
     static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
     static constexpr u64 FAULT_READBACK_SIZE = CACHING_NUMPAGES / 8; // Bit per page
 
+    struct PageData {
+        BufferId buffer_id{};
+        bool is_dma_synced = false;
+    };
+
     struct Traits {
-        using Entry = BufferId;
+        using Entry = PageData;
         static constexpr size_t AddressSpaceBits = 40;
         static constexpr size_t FirstLevelBits = 16;
         static constexpr size_t PageBits = CACHING_PAGEBITS;
@@ -126,7 +131,7 @@ public:
     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
 
     /// Return buffer id for the specified region
-    [[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
+    BufferId FindBuffer(VAddr device_addr, u32 size);
 
     /// Queue a region for coverage for DMA.
     void QueueMemoryCoverage(VAddr device_addr, u64 size);
@@ -148,7 +153,7 @@ private:
     void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
         const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
         for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
-            const BufferId buffer_id = page_table[page];
+            const BufferId buffer_id = page_table[page].buffer_id;
             if (!buffer_id) {
                 ++page;
                 continue;
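The fault readback buffer that pairs with the page table above stores one bit per CACHING_PAGESIZE page. The following is a host-side sketch of how such a bitmap folds back into the right-open page ranges that ProcessFaultBuffer marks as synced and hands to the rasterizer; VAddr, kPageBits and CollectFaultRanges are stand-ins for this example, not code from the PR:

#include <boost/icl/interval_set.hpp>
#include <cstddef>
#include <cstdint>
#include <vector>

using VAddr = uint64_t;         // assumption: matches the project-wide typedef
constexpr VAddr kPageBits = 16; // assumption: stands in for CACHING_PAGEBITS
constexpr VAddr kPageSize = VAddr{1} << kPageBits;

// Hypothetical helper: fold a one-bit-per-page fault bitmap into coalesced page ranges.
boost::icl::interval_set<VAddr> CollectFaultRanges(const std::vector<uint8_t>& fault_bits) {
    boost::icl::interval_set<VAddr> ranges;
    for (size_t byte = 0; byte < fault_bits.size(); ++byte) {
        for (int bit = 0; bit < 8; ++bit) {
            if (fault_bits[byte] & (1u << bit)) {
                // Bit index -> page index -> page-aligned address, one page per fault.
                const VAddr fault = ((VAddr(byte) << 3) | VAddr(bit)) << kPageBits;
                ranges += boost::icl::interval_set<VAddr>::interval_type::right_open(
                    fault, fault + kPageSize);
            }
        }
    }
    return ranges; // adjacent faulting pages merge into a single interval here
}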
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 155d78a3f..5766f776a 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -456,7 +456,7 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
     buffer_infos.clear();
     image_infos.clear();
 
-    bool fault_enable = false;
+    bool uses_dma = false;
 
     // Bind resource buffers and textures.
     Shader::Backend::Bindings binding{};
@@ -469,26 +469,28 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
         BindBuffers(*stage, binding, push_data);
         BindTextures(*stage, binding);
 
-        fault_enable |= stage->dma_types != Shader::IR::Type::Void;
+        uses_dma |= stage->dma_types != Shader::IR::Type::Void;
     }
 
     pipeline->BindResources(set_writes, buffer_barriers, push_data);
 
-    if (!fault_process_pending && fault_enable) {
+    if (uses_dma) {
         fault_process_pending = true;
         // We only use fault buffer for DMA right now.
         // First, import any queued host memory, then sync every mapped
         // region that is cached on GPU memory.
         buffer_cache.CoverQueuedRegions();
         {
-            std::shared_lock lock{mapped_ranges_mutex};
-            for (const auto& range : mapped_ranges) {
+            std::shared_lock lock{dma_sync_mapped_ranges_mutex};
+            for (const auto& range : dma_sync_mapped_ranges) {
                 buffer_cache.SynchronizeRange(range.lower(), range.upper() - range.lower());
             }
         }
         buffer_cache.MemoryBarrier();
     }
+    fault_process_pending |= uses_dma;
+
+    return true;
 }
@@ -727,6 +729,15 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin
     }
 }
 
+void Rasterizer::AddDmaSyncRanges(const boost::icl::interval_set<VAddr>& ranges) {
+    dma_sync_ranges += ranges;
+    {
+        std::scoped_lock lock{dma_sync_mapped_ranges_mutex};
+        std::shared_lock lock2(mapped_ranges_mutex);
+        dma_sync_mapped_ranges = mapped_ranges & ranges;
+    }
+}
+
 void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& state) {
     int cb_index = 0;
     for (auto attach_idx = 0u; attach_idx < state.num_color_attachments; ++attach_idx) {
@@ -969,14 +980,15 @@ bool Rasterizer::IsMapped(VAddr addr, u64 size) {
     }
     const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
 
-    std::shared_lock lock{mapped_ranges_ismapped_mutex};
+    std::shared_lock lock{mapped_ranges_mutex};
     return boost::icl::contains(mapped_ranges, range);
 }
 
 void Rasterizer::MapMemory(VAddr addr, u64 size) {
     {
-        std::scoped_lock lock{mapped_ranges_mutex, mapped_ranges_ismapped_mutex};
+        std::scoped_lock lock{mapped_ranges_mutex, dma_sync_mapped_ranges_mutex};
         mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
+        dma_sync_mapped_ranges = mapped_ranges & dma_sync_ranges;
     }
     page_manager.OnGpuMap(addr, size);
     buffer_cache.QueueMemoryCoverage(addr, size);
@@ -987,8 +999,9 @@ void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
     texture_cache.UnmapMemory(addr, size);
     page_manager.OnGpuUnmap(addr, size);
     {
-        std::scoped_lock lock{mapped_ranges_mutex, mapped_ranges_ismapped_mutex};
+        std::scoped_lock lock{mapped_ranges_mutex, dma_sync_mapped_ranges_mutex};
         mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
+        dma_sync_mapped_ranges -=
+            decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
     }
 }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 2e6de69f0..5d587d666 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -100,7 +100,11 @@ private:
     bool IsComputeMetaClear(const Pipeline* pipeline);
 
+    void AddDmaSyncRanges(const boost::icl::interval_set<VAddr>& ranges);
+
 private:
+    friend class VideoCore::BufferCache;
+
     const Instance& instance;
     Scheduler& scheduler;
     VideoCore::PageManager page_manager;
@@ -109,9 +113,11 @@ private:
     AmdGpu::Liverpool* liverpool;
     Core::MemoryManager* memory;
     boost::icl::interval_set<VAddr> mapped_ranges;
+    boost::icl::interval_set<VAddr> dma_sync_ranges;
+    boost::icl::interval_set<VAddr> dma_sync_mapped_ranges;
     // use 2 mutexes to avoid undefined behavior when using shared lock
     std::shared_mutex mapped_ranges_mutex;
-    std::shared_mutex mapped_ranges_ismapped_mutex;
+    std::shared_mutex dma_sync_mapped_ranges_mutex;
     PipelineCache pipeline_cache;
 
     boost::container::static_vector<
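To summarize the range bookkeeping introduced in the rasterizer: dma_sync_ranges accumulates every range the fault handler reports, dma_sync_mapped_ranges is kept equal to mapped_ranges & dma_sync_ranges, and the per-draw sync loop in BindResources walks only that intersection instead of every mapped range. A minimal, self-contained boost::icl sketch of the same bookkeeping, using standalone variables that merely mirror the Rasterizer members:

#include <boost/icl/interval_set.hpp>
#include <cstdint>

using VAddr = uint64_t; // assumption: matches the project-wide typedef

int main() {
    using Ranges = boost::icl::interval_set<VAddr>;
    Ranges mapped_ranges, dma_sync_ranges, dma_sync_mapped_ranges;

    // MapMemory(0x10000, 0x30000)
    mapped_ranges += Ranges::interval_type::right_open(0x10000, 0x40000);
    // AddDmaSyncRanges(...) after a fault on the page at 0x20000
    dma_sync_ranges += Ranges::interval_type::right_open(0x20000, 0x30000);

    // Recompute the intersection, as MapMemory/AddDmaSyncRanges do.
    dma_sync_mapped_ranges = mapped_ranges & dma_sync_ranges;

    // UnmapMemory(0x20000, 0x8000) shrinks both sets the same way.
    mapped_ranges -= Ranges::interval_type::right_open(0x20000, 0x28000);
    dma_sync_mapped_ranges -= Ranges::interval_type::right_open(0x20000, 0x28000);
    return 0;
}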