diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 57cce4be9..e9aea3fb8 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -204,8 +204,6 @@ void EmitContext::DefineArithmeticTypes() {
         // Used to calculate fault buffer position and mask
         u32_three_value = ConstU32(3U);
         u32_seven_value = ConstU32(7U);
-        bda_first_time_mask = Constant(U64, 0x1ULL);
-        bda_first_time_inv_mask = Constant(U64, ~0x1ULL);
     }
 }
 
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 09d2b9cf5..84545a534 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -175,12 +175,8 @@ public:
 
     template <typename Func>
     Id EmitMemoryAccess(Id type, Id address, Func&& fallback) {
-        const Id first_time_label = OpLabel();
-        const Id after_first_time_label = OpLabel();
-        const Id fallback_label = OpLabel();
+        const Id fault_label = OpLabel();
         const Id available_label = OpLabel();
-        const Id save_masked_label = OpLabel();
-        const Id after_save_masked_label = OpLabel();
         const Id merge_label = OpLabel();
 
         // Get page BDA
@@ -191,14 +187,13 @@ public:
         const Id bda_ptr = OpAccessChain(bda_pointer_type, bda_buffer_id, u32_zero_value, page32);
         const Id bda = OpLoad(U64, bda_ptr);
 
-        // Check if it is the first time we access this page
-        const Id bda_and_mask = OpBitwiseAnd(U64, bda, bda_first_time_mask);
-        const Id first_time = OpIEqual(U1[1], bda_and_mask, u64_zero_value);
-        OpSelectionMerge(after_first_time_label, spv::SelectionControlMask::MaskNone);
-        OpBranchConditional(first_time, first_time_label, after_first_time_label);
+        // Check if the page is GPU mapped
+        const Id is_fault = OpIEqual(U1[1], bda, u64_zero_value);
+        OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
+        OpBranchConditional(is_fault, fault_label, available_label);
 
         // First time access
-        AddLabel(first_time_label);
+        AddLabel(fault_label);
         const auto& fault_buffer = buffers[fault_buffer_index];
         const auto [fault_buffer_id, fault_pointer_type] = fault_buffer[PointerType::U8];
         const Id page_div8 = OpShiftRightLogical(U32[1], page32, u32_three_value);
@@ -210,35 +205,14 @@ public:
         const Id page_mask8 = OpUConvert(U8, page_mask);
         const Id fault_value_masked = OpBitwiseOr(U8, fault_value, page_mask8);
         OpStore(fault_ptr, fault_value_masked);
-        OpBranch(after_first_time_label);
-
-        // Check if the value is available
-        AddLabel(after_first_time_label);
-        const Id bda_eq_zero = OpIEqual(U1[1], bda, u64_zero_value);
-        OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
-        OpBranchConditional(bda_eq_zero, fallback_label, available_label);
-
-        // Fallback (and mark on faul buffer)
-        AddLabel(fallback_label);
+        // Fallback (we are not able to access the page)
         const Id fallback_result = fallback();
         OpBranch(merge_label);
 
         // Value is available
         AddLabel(available_label);
-        OpSelectionMerge(after_save_masked_label, spv::SelectionControlMask::MaskNone);
-        OpBranchConditional(first_time, save_masked_label, after_save_masked_label);
-
-        // Save unmasked BDA
-        AddLabel(save_masked_label);
-        const Id masked_bda = OpBitwiseOr(U64, bda, bda_first_time_mask);
-        OpStore(bda_ptr, masked_bda);
-        OpBranch(after_save_masked_label);
-
-        // Load value
-        AddLabel(after_save_masked_label);
-        const Id unmasked_bda = OpBitwiseAnd(U64, bda, bda_first_time_inv_mask);
         const Id offset_in_bda = OpBitwiseAnd(U64, address, caching_pagemask_value);
-        const Id addr = OpIAdd(U64, unmasked_bda, offset_in_bda);
+        const Id addr = OpIAdd(U64, bda, offset_in_bda);
         const PointerType pointer_type = PointerTypeFromType(type);
         const Id pointer_type_id = physical_pointer_types[pointer_type];
         const Id addr_ptr = OpConvertUToPtr(pointer_type_id, addr);
@@ -247,8 +221,7 @@ public:
 
         // Merge
         AddLabel(merge_label);
-        const Id final_result =
-            OpPhi(type, fallback_result, fallback_label, result, after_save_masked_label);
+        const Id final_result = OpPhi(type, fallback_result, fault_label, result, available_label);
         return final_result;
     }
 
@@ -292,8 +265,6 @@ public:
 
     Id caching_pagebits_value{};
     Id caching_pagemask_value{};
-    Id bda_first_time_mask{};
-    Id bda_first_time_inv_mask{};
 
    Id shared_u8{};
    Id shared_u16{};
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index afb53d10e..45863d8e8 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -135,14 +135,6 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
         if (unmap) {
             return;
         }
-
-        {
-            // This is a temporary workaround...
-            std::scoped_lock lock(dma_sync_ranges_mutex);
-            const VAddr aligned_addr = Common::AlignDown(device_addr, 4_KB);
-            const u64 aligned_size = Common::AlignUp(device_addr + size, 4_KB) - aligned_addr;
-            dma_sync_ranges.Add(aligned_addr, aligned_size);
-        }
     }
 }
 
@@ -553,10 +545,7 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     bda_addrs.reserve(size_pages);
     for (u64 i = 0; i < size_pages; ++i) {
         vk::DeviceAddress addr = new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS);
-        const bool is_dma_synced = page_table[start_page + i].is_dma_synced;
-        // Use LSB to mark if the page is DMA synced. If it is not synced, it
-        // we haven't accessed it yet.
-        bda_addrs.push_back(addr | (is_dma_synced ? 0x1 : 0x0));
+        bda_addrs.push_back(addr);
     }
     WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
                     bda_addrs.size() * sizeof(vk::DeviceAddress));
@@ -568,10 +557,6 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
     }
     Register(new_buffer_id);
-    {
-        std::scoped_lock lk(dma_sync_ranges_mutex);
-        dma_sync_ranges.Add(overlap.begin, overlap.end);
-    }
     return new_buffer_id;
 }
 
@@ -683,22 +668,17 @@ void BufferCache::ProcessFaultBuffer() {
            const VAddr fault_end = fault + CACHING_PAGESIZE; // This can be adjusted
            fault_ranges +=
                boost::icl::interval_set<VAddr>::interval_type::right_open(fault, fault_end);
-            LOG_WARNING(Render_Vulkan, "First time DMA access to memory at page {:#x}", fault);
+            LOG_INFO(Render_Vulkan, "Accessed non-GPU mapped memory at {:#x}", fault);
         }
         for (const auto& range : fault_ranges) {
             const VAddr start = range.lower();
             const VAddr end = range.upper();
             const u64 page_start = start >> CACHING_PAGEBITS;
             const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE);
-            // Mark the pages as synced
-            for (u64 page = page_start; page < page_end; ++page) {
-                page_table[page].is_dma_synced = true;
-            }
             // Buffer size is in 32 bits
             ASSERT_MSG((range.upper() - range.lower()) <= std::numeric_limits<u32>::max(),
                        "Buffer size is too large");
-            // Only create a buffer is the current range doesn't fit in an existing one
-            FindBuffer(start, static_cast<u32>(end - start));
+            CreateBuffer(start, static_cast<u32>(end - start));
         }
     });
 }
@@ -923,18 +903,6 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
     });
 }
 
-void BufferCache::SynchronizeDmaBuffers() {
-    RENDERER_TRACE;
-    std::scoped_lock lk(dma_sync_ranges_mutex);
-    LOG_WARNING(Render_Vulkan, "Synchronizing ranges");
-    dma_sync_ranges.ForEach([&](VAddr device_addr, u64 end_addr) {
-        RENDERER_TRACE;
-        SynchronizeBuffersInRange(device_addr, end_addr - device_addr);
-        LOG_WARNING(Render_Vulkan, "Sync range {:#x} - {:#x}", device_addr, end_addr);
-    });
-    dma_sync_ranges.Clear();
-}
-
 void BufferCache::MemoryBarrier() {
     // Vulkan doesn't know which buffer we access in a shader if we use
     // BufferDeviceAddress. We need a full memory barrier.
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index ef705d412..2d6551a7f 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -48,7 +48,6 @@ public:
 
     struct PageData {
         BufferId buffer_id{};
-        bool is_dma_synced = false;
     };
 
     struct Traits {
@@ -194,8 +193,6 @@ private:
     Buffer fault_buffer;
     std::shared_mutex slot_buffers_mutex;
     Common::SlotVector<Buffer> slot_buffers;
-    std::shared_mutex dma_sync_ranges_mutex;
-    RangeSet dma_sync_ranges;
     RangeSet gpu_modified_ranges;
     SplitRangeMap<BufferId> buffer_ranges;
     MemoryTracker memory_tracker;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 88da7acd2..4bdb08bf2 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -477,9 +477,13 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
     if (uses_dma && !fault_process_pending) {
         // We only use fault buffer for DMA right now.
         {
-            // We don't want the mapped ranges to be modified while we are syncing
+            // TODO: The GPU might have written to memory (for example with EVENT_WRITE_EOP),
+            // so we need to account for that and synchronize.
             Common::RecursiveSharedLock lock{mapped_ranges_mutex};
-            buffer_cache.SynchronizeDmaBuffers();
+            for (auto& range : mapped_ranges) {
+                buffer_cache.SynchronizeBuffersInRange(range.lower(),
+                                                       range.upper() - range.lower());
+            }
         }
         buffer_cache.MemoryBarrier();
     }
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index d20862a64..e75a69924 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -91,17 +91,14 @@ void Scheduler::Wait(u64 tick) {
     }
     master_semaphore.Wait(tick);
 
-    // TODO: We should be applyting pending operations here because that gives us
-    // the ability to use mapped regions on stream buffers in deferred operations.
-    // We don't do that right now because it might introduce varioations in the
-    // timing and, since we don't sync the GPU some games might be affected by that.
-    // It shouldn't be an issue right now, because we only use mapped regions in
-    // deferred operations to download faulted addresses. That is only 8KB every tick
-    // and the stream buffer is 256MB. GPU doesn't go that behind.
-    // while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) {
-    //     pending_ops.front().callback();
-    //     pending_ops.pop();
-    // }
+    // CAUTION: This can introduce unexpected variation in the wait time.
+    // We don't currently sync the GPU, and some games are very sensitive to this.
+    // If this becomes a problem, it can be commented out.
+    // Ideally we would implement proper GPU sync.
+    while (!pending_ops.empty() && pending_ops.front().gpu_tick <= tick) {
+        pending_ops.front().callback();
+        pending_ops.pop();
+    }
 }
 
 void Scheduler::AllocateWorkerCommandBuffers() {
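With this change, a DMA access emitted by `EmitMemoryAccess` has only two outcomes: either the page has a non-zero buffer device address and the value is loaded through a physical pointer, or the BDA is zero, the page's bit is set in the fault buffer, and the fallback value is used. The following is only a rough CPU-side sketch of that control flow, assuming 4 KiB caching pages and modelling the BDA page table with host pointers; the function and parameter names are illustrative and not part of the codebase.

```cpp
#include <cstdint>
#include <cstring>

// Assumption: 4 KiB caching pages, mirroring CACHING_PAGEBITS/CACHING_PAGEMASK.
constexpr std::uint64_t kPageBits = 12;
constexpr std::uint64_t kPageMask = (1ULL << kPageBits) - 1;

// page_table[page] is the base pointer of that page, or nullptr if the page is
// not GPU mapped (the sketch uses host pointers where the shader uses 64-bit
// buffer device addresses). fault_bitmap holds one bit per page.
template <typename T, typename Fallback>
T ReadThroughBda(std::uint64_t address, const std::uint8_t* const* page_table,
                 std::uint8_t* fault_bitmap, Fallback&& fallback) {
    const std::uint64_t page = address >> kPageBits;
    const std::uint8_t* const bda = page_table[page];
    if (bda == nullptr) {
        // Page is not GPU mapped: set its bit in the fault buffer so the CPU
        // can create a buffer for it, and use the fallback value for now.
        fault_bitmap[page >> 3] |= static_cast<std::uint8_t>(1U << (page & 7));
        return fallback();
    }
    // Page is mapped: read at the page base plus the offset within the page
    // (OpConvertUToPtr + load in the emitted SPIR-V).
    T value;
    std::memcpy(&value, bda + (address & kPageMask), sizeof(T));
    return value;
}
```

Because there is no longer a "first time" flag stored in the low bit of the BDA, the mapped path adds the page offset to the BDA directly, and the final `OpPhi` only has to merge two predecessors (`fault_label` and `available_label`).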
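On the CPU side, `ProcessFaultBuffer` reads this bitmap back, converts each set bit into a page address, and now calls `CreateBuffer` directly for the faulted range. Below is a minimal sketch of that conversion under the same page-size assumption, with a hypothetical `create_buffer` callback; the real code additionally coalesces neighbouring pages into `boost::icl` intervals and asserts that each range fits in a `u32` before creating the buffer.

```cpp
#include <bit>
#include <cstdint>
#include <vector>

using VAddr = std::uint64_t;

// Assumption: one fault bit per 4 KiB caching page.
constexpr std::uint64_t kPageBits = 12;
constexpr std::uint64_t kPageSize = 1ULL << kPageBits;

template <typename CreateBufferFn>
void ProcessFaultBitmap(const std::vector<std::uint8_t>& bitmap, CreateBufferFn&& create_buffer) {
    for (std::uint64_t byte = 0; byte < bitmap.size(); ++byte) {
        std::uint8_t bits = bitmap[byte];
        while (bits != 0) {
            const int bit = std::countr_zero(bits);       // lowest faulted page in this byte
            bits &= static_cast<std::uint8_t>(bits - 1);  // clear that bit
            const std::uint64_t page = byte * 8 + static_cast<std::uint64_t>(bit);
            const VAddr fault = page << kPageBits;        // page index -> virtual address
            // One buffer per page here; the real code merges adjacent pages first.
            create_buffer(fault, static_cast<std::uint32_t>(kPageSize));
        }
    }
}
```

Bound to `buffer_cache.CreateBuffer`, this reproduces the per-page behaviour of the `ProcessFaultBuffer` hunk above without the removed `is_dma_synced` bookkeeping.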