diff --git a/CMakeLists.txt b/CMakeLists.txt
index 24a81243f..9417df405 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -926,7 +926,6 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
                src/video_core/buffer_cache/buffer_cache.cpp
                src/video_core/buffer_cache/buffer_cache.h
                src/video_core/buffer_cache/memory_tracker.h
-               src/video_core/buffer_cache/range_set.h
                src/video_core/buffer_cache/region_definitions.h
                src/video_core/buffer_cache/region_manager.h
                src/video_core/renderer_vulkan/liverpool_to_vk.cpp
@@ -985,6 +984,7 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
                src/video_core/page_manager.cpp
                src/video_core/page_manager.h
                src/video_core/multi_level_page_table.h
+               src/video_core/range_set.h
                src/video_core/renderdoc.cpp
                src/video_core/renderdoc.h
 )
diff --git a/src/common/func_traits.h b/src/common/func_traits.h
index 407b2dbe6..58f1ddeb3 100644
--- a/src/common/func_traits.h
+++ b/src/common/func_traits.h
@@ -3,23 +3,42 @@
 
 #pragma once
 
+#include <type_traits>
 #include <tuple>
 
 namespace Common {
 
-template <class Func>
-struct FuncTraits {};
+template <class Func, class = void>
+struct FuncTraits;
 
+// Function type
 template <class ReturnType_, class... Args>
-struct FuncTraits<ReturnType_ (*)(Args...)> {
+struct FuncTraits<ReturnType_(Args...)> {
     using ReturnType = ReturnType_;
-
     static constexpr size_t NUM_ARGS = sizeof...(Args);
 
     template <size_t I>
     using ArgType = std::tuple_element_t<I, std::tuple<Args...>>;
 };
 
+// Function pointer
+template <class ReturnType_, class... Args>
+struct FuncTraits<ReturnType_ (*)(Args...)> : FuncTraits<ReturnType_(Args...)> {};
+
+// Member function pointer
+template <class ClassType, class ReturnType_, class... Args>
+struct FuncTraits<ReturnType_ (ClassType::*)(Args...)> : FuncTraits<ReturnType_(Args...)> {};
+
+template <class ClassType, class ReturnType_, class... Args>
+struct FuncTraits<ReturnType_ (ClassType::*)(Args...) const>
+    : FuncTraits<ReturnType_(Args...)> {};
+
+// Catch-all for callables
+template <class Func>
+struct FuncTraits<Func, std::void_t<decltype(&std::remove_reference_t<Func>::operator())>>
+    : FuncTraits<decltype(&std::remove_reference_t<Func>::operator())> {};
+
+// For lambdas: kept for compat (may be removed)
 template <typename Func>
 struct LambdaTraits : LambdaTraits<decltype(&std::remove_reference_t<Func>::operator())> {};
diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index 3e66fba6a..212ea454a 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -124,6 +124,10 @@ void Liverpool::Process(std::stop_token stoken) {
 
                 if (task.done()) {
                     task.destroy();
+                    if (rasterizer) {
+                        rasterizer->ProcessFaultBuffer();
+                    }
+
                     std::scoped_lock lock{queue.m_access};
                     queue.submits.pop();
 
@@ -136,7 +140,7 @@ void Liverpool::Process(std::stop_token stoken) {
             if (submit_done) {
                 VideoCore::EndCapture();
                 if (rasterizer) {
-                    rasterizer->EndCommandList();
+                    rasterizer->ProcessDownloadImages();
                     rasterizer->Flush();
                 }
                 submit_done = false;
@@ -174,8 +178,14 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
         }
         case PM4ItOpcode::DumpConstRam: {
             const auto* dump_const = reinterpret_cast<const PM4DumpConstRam*>(header);
-            memcpy(dump_const->Address(),
-                   cblock.constants_heap.data() + dump_const->Offset(), dump_const->Size());
+            if (rasterizer) {
+                rasterizer->InlineData(dump_const->Address(),
+                                       cblock.constants_heap.data() + dump_const->Offset(),
+                                       dump_const->Size(), false);
+            } else {
+                memcpy(dump_const->Address(),
+                       cblock.constants_heap.data() + dump_const->Offset(), dump_const->Size());
+            }
             break;
         }
         case PM4ItOpcode::IncrementCeCounter: {
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index c5e5d18f8..3e7b8ba60 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -29,9 +29,9 @@ static constexpr size_t DeviceBufferSize = 128_MB;
 static constexpr size_t MaxPageFaults = 1024;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
-                         AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
-                         PageManager& tracker)
-    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
+                         Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_,
+                         TextureCache& texture_cache_, PageManager& tracker)
+    : instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_},
       memory{Core::Memory::Instance()}, texture_cache{texture_cache_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
@@ -155,9 +155,8 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
     memory_tracker->ForEachDownloadRange<true>(
         device_addr, size, [&](u64 device_addr_out, u64 range_size) {
             const VAddr buffer_addr = buffer.CpuAddr();
-            const auto add_download = [&](VAddr start, VAddr end) {
+            const auto add_download = [&](VAddr start, u64 new_size) {
                 const u64 new_offset = start - buffer_addr;
-                const u64 new_size = end - start;
                 copies.push_back(vk::BufferCopy{
                     .srcOffset = new_offset,
                     .dstOffset = total_size_bytes,
@@ -1036,6 +1035,79 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
     });
 }
 
+void BufferCache::SynchronizeBuffersForDma() {
+    RENDERER_TRACE;
+    boost::container::small_vector<vk::BufferCopy, 4> copies;
+    const auto& mapped_ranges = rasterizer.GetMappedRanges();
+    BufferId last_buffer_id = NULL_BUFFER_ID;
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    const auto upload_pending = [&]() {
+        RENDERER_TRACE;
+        if (last_buffer_id == NULL_BUFFER_ID) {
+            return;
+        }
+        Buffer& buffer = slot_buffers[last_buffer_id];
+        const vk::BufferMemoryBarrier2 barrier = {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite |
+                             vk::AccessFlagBits2::eTransferRead |
+                             vk::AccessFlagBits2::eTransferWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+            .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+            .buffer = buffer.Handle(),
+            .offset = 0,
+            .size = buffer.SizeBytes(),
+        };
+        cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+            .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+            .bufferMemoryBarrierCount = 1,
+            .pBufferMemoryBarriers = &barrier,
+        });
+        cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.Handle(), copies);
+        copies.clear();
+    };
+    mapped_ranges.ForEach([&](VAddr device_addr, u64 size) {
+        RENDERER_TRACE;
+        memory_tracker->ForEachUploadRange(
+            device_addr, size, false,
+            [&](u64 device_addr_out, u64 range_size, RegionBits& clear_mask) {
+                RENDERER_TRACE;
+                ForEachBufferInRange(
+                    device_addr_out, range_size, [&](BufferId buffer_id, Buffer& buffer) {
+                        RENDERER_TRACE;
+                        if (last_buffer_id != buffer_id) {
+                            upload_pending();
+                            last_buffer_id = buffer_id;
+                        }
+                        const VAddr copy_start = std::max(buffer.CpuAddr(), device_addr_out);
+                        const VAddr copy_end = std::min(buffer.CpuAddr() + buffer.SizeBytes(),
+                                                        device_addr_out + range_size);
+                        const u32 copy_size = static_cast<u32>(copy_end - copy_start);
+                        if (copy_size == 0) {
+                            return;
+                        }
+                        const u64 offset = staging_buffer.Copy(copy_start, copy_size);
+                        copies.push_back(vk::BufferCopy{
+                            .srcOffset = offset,
+                            .dstOffset = copy_start - buffer.CpuAddr(),
+                            .size = copy_size,
+                        });
+
+                        // We need to use the tracker page size here, as we are marking the
+                        // clear mask
+                        const u64 page_start =
+                            (copy_start & TRACKER_HIGHER_PAGE_MASK) >> TRACKER_PAGE_BITS;
+                        const u64 page_end = Common::DivCeil(
+                            (copy_end - 1) & TRACKER_HIGHER_PAGE_MASK, TRACKER_BYTES_PER_PAGE);
+                        ASSERT(page_start < page_end);
+                        clear_mask.SetRange(page_start, page_end);
+                    });
+            },
+ upload_pending); + }); + MemoryBarrier(); +} + void BufferCache::MemoryBarrier() { // Vulkan doesn't know which buffer we access in a shader if we use // BufferDeviceAddress. We need a full memory barrier. diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b509ce2d0..32d08ca38 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -5,12 +5,11 @@ #include #include -#include "common/div_ceil.h" #include "common/slot_vector.h" #include "common/types.h" #include "video_core/buffer_cache/buffer.h" -#include "video_core/buffer_cache/range_set.h" #include "video_core/multi_level_page_table.h" +#include "video_core/range_set.h" namespace AmdGpu { struct Liverpool; @@ -22,7 +21,8 @@ class MemoryManager; namespace Vulkan { class GraphicsPipeline; -} +class Rasterizer; +} // namespace Vulkan namespace VideoCore { @@ -71,8 +71,8 @@ public: public: explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - AmdGpu::Liverpool* liverpool, TextureCache& texture_cache, - PageManager& tracker); + Vulkan::Rasterizer& rasterizer, AmdGpu::Liverpool* liverpool, + TextureCache& texture_cache, PageManager& tracker); ~BufferCache(); /// Returns a pointer to GDS device local buffer. @@ -156,8 +156,8 @@ public: /// Synchronizes all buffers in the specified range. void SynchronizeBuffersInRange(VAddr device_addr, u64 size); - /// Synchronizes all buffers neede for DMA. - void SynchronizeDmaBuffers(); + /// Synchronizes all buffers for DMA. + void SynchronizeBuffersForDma(); /// Record memory barrier. Used for buffers when accessed via BDA. void MemoryBarrier(); @@ -207,6 +207,7 @@ private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; + Vulkan::Rasterizer& rasterizer; AmdGpu::Liverpool* liverpool; Core::MemoryManager* memory; TextureCache& texture_cache; diff --git a/src/video_core/buffer_cache/region_definitions.h b/src/video_core/buffer_cache/region_definitions.h index 76e7ee263..136d8236c 100644 --- a/src/video_core/buffer_cache/region_definitions.h +++ b/src/video_core/buffer_cache/region_definitions.h @@ -4,6 +4,7 @@ #pragma once #include "common/bit_array.h" +#include "common/enum.h" #include "common/types.h" namespace VideoCore { @@ -17,6 +18,7 @@ constexpr u64 TRACKER_HIGHER_PAGE_MASK = TRACKER_HIGHER_PAGE_SIZE - 1ULL; constexpr u64 NUM_PAGES_PER_REGION = TRACKER_HIGHER_PAGE_SIZE / TRACKER_BYTES_PER_PAGE; enum class Type { + None, CPU, GPU, }; diff --git a/src/video_core/buffer_cache/region_manager.h b/src/video_core/buffer_cache/region_manager.h index 608b16fb3..619acd802 100644 --- a/src/video_core/buffer_cache/region_manager.h +++ b/src/video_core/buffer_cache/region_manager.h @@ -5,7 +5,7 @@ #include "common/config.h" #include "common/div_ceil.h" -#include "common/logging/log.h" +#include "common/func_traits.h" #ifdef __linux__ #include "common/adaptive_mutex.h" @@ -108,9 +108,11 @@ public: * @param size Size in bytes of the CPU range to loop over * @param func Function to call for each turned off region */ - template - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, auto&& func) { + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, F&& func) { RENDERER_TRACE; + using FuncTraits = Common::FuncTraits; + constexpr bool uses_clear_mask = FuncTraits::NUM_ARGS == 3; const size_t offset = query_cpu_range - cpu_addr; const size_t start_page = SanitizeAddress(offset) / TRACKER_BYTES_PER_PAGE; const size_t end_page = @@ -122,18 
         RegionBits& bits = GetRegionBits<type>();
         RegionBits mask(bits, start_page, end_page);
 
+        if constexpr (uses_clear_mask) {
+            static_assert(clear, "Function must not use clear mask when not clearing");
+            RegionBits clear_mask;
+            for (const auto& [start, end] : mask) {
+                func(cpu_addr + start * TRACKER_BYTES_PER_PAGE,
+                     (end - start) * TRACKER_BYTES_PER_PAGE, clear_mask);
+            }
+            bits &= ~clear_mask;
+        } else {
+            for (const auto& [start, end] : mask) {
+                func(cpu_addr + start * TRACKER_BYTES_PER_PAGE,
+                     (end - start) * TRACKER_BYTES_PER_PAGE);
+            }
+            if constexpr (clear) {
+                bits &= ~mask;
+            }
+        }
+
         if constexpr (clear) {
-            bits.UnsetRange(start_page, end_page);
             if constexpr (type == Type::CPU) {
                 UpdateProtection<true, false>();
             } else if (Config::readbacks()) {
                 UpdateProtection<true, true>();
            }
         }
-
-        for (const auto& [start, end] : mask) {
-            func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE);
-        }
     }
 
     /**
@@ -186,6 +201,7 @@ private:
 
     PageManager* tracker;
     VAddr cpu_addr = 0;
+    Type deferred_protection = Type::None;
     RegionBits cpu;
     RegionBits gpu;
     RegionBits writeable;
diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp
index 63297bfdc..daa1218cc 100644
--- a/src/video_core/page_manager.cpp
+++ b/src/video_core/page_manager.cpp
@@ -4,6 +4,7 @@
 #include <thread>
 #include "common/assert.h"
 #include "common/debug.h"
+#include "common/div_ceil.h"
 #include "common/range_lock.h"
 #include "common/signal_context.h"
 #include "core/memory.h"
diff --git a/src/video_core/buffer_cache/range_set.h b/src/video_core/range_set.h
similarity index 96%
rename from src/video_core/buffer_cache/range_set.h
rename to src/video_core/range_set.h
index 5c8e78c7c..711c1cb04 100644
--- a/src/video_core/buffer_cache/range_set.h
+++ b/src/video_core/range_set.h
@@ -66,7 +66,7 @@ struct RangeSet {
         for (const auto& set : m_ranges_set) {
             const VAddr inter_addr_end = set.upper();
             const VAddr inter_addr = set.lower();
-            func(inter_addr, inter_addr_end);
+            func(inter_addr, inter_addr_end - inter_addr);
         }
     }
 
@@ -92,7 +92,7 @@ struct RangeSet {
             if (inter_addr < start_address) {
                 inter_addr = start_address;
             }
-            func(inter_addr, inter_addr_end);
+            func(inter_addr, inter_addr_end - inter_addr);
         }
     }
 
@@ -170,7 +170,7 @@ public:
         for (const auto& [interval, value] : m_ranges_map) {
             const VAddr inter_addr_end = interval.upper();
             const VAddr inter_addr = interval.lower();
-            func(inter_addr, inter_addr_end, value);
+            func(inter_addr, inter_addr_end - inter_addr, value);
         }
     }
 
@@ -196,7 +196,7 @@ public:
             if (inter_addr < start_address) {
                 inter_addr = start_address;
             }
-            func(inter_addr, inter_addr_end, it->second);
+            func(inter_addr, inter_addr_end - inter_addr, it->second);
         }
     }
 
@@ -274,7 +274,7 @@ public:
         for (const auto& [interval, value] : m_ranges_map) {
             const VAddr inter_addr_end = interval.upper();
             const VAddr inter_addr = interval.lower();
-            func(inter_addr, inter_addr_end, value);
+            func(inter_addr, inter_addr_end - inter_addr, value);
        }
     }
 
@@ -300,7 +300,7 @@ public:
             if (inter_addr < start_address) {
                 inter_addr = start_address;
             }
-            func(inter_addr, inter_addr_end, it->second);
+            func(inter_addr, inter_addr_end - inter_addr, it->second);
         }
     }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index c3e221739..6fd937d91 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -33,7 +33,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
 Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
                        AmdGpu::Liverpool* liverpool_)
     : instance{instance_}, scheduler{scheduler_}, page_manager{this},
-      buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
+      buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager},
       texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
       memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
     if (!Config::nullGpu()) {
@@ -446,11 +446,14 @@ void Rasterizer::Finish() {
     scheduler.Finish();
 }
 
-void Rasterizer::EndCommandList() {
+void Rasterizer::ProcessFaultBuffer() {
     if (fault_process_pending) {
         fault_process_pending = false;
         buffer_cache.ProcessFaultBuffer();
     }
+}
+
+void Rasterizer::ProcessDownloadImages() {
     texture_cache.ProcessDownloadImages();
 }
 
@@ -479,16 +482,12 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
         uses_dma |= stage->uses_dma;
     }
 
-    if (uses_dma) {
+    if (uses_dma && !fault_process_pending) {
         // We only use fault buffer for DMA right now.
         {
             Common::RecursiveSharedLock lock{mapped_ranges_mutex};
-            for (auto& range : mapped_ranges) {
-                buffer_cache.SynchronizeBuffersInRange(range.lower(),
-                                                       range.upper() - range.lower());
-            }
+            buffer_cache.SynchronizeBuffersForDma();
         }
-        buffer_cache.MemoryBarrier();
     }
 
     fault_process_pending |= uses_dma;
@@ -995,16 +994,14 @@ bool Rasterizer::IsMapped(VAddr addr, u64 size) {
         // There is no memory, so not mapped.
         return false;
     }
-    const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
-
     Common::RecursiveSharedLock lock{mapped_ranges_mutex};
-    return boost::icl::contains(mapped_ranges, range);
+    return mapped_ranges.Contains(addr, size);
 }
 
 void Rasterizer::MapMemory(VAddr addr, u64 size) {
     {
         std::scoped_lock lock{mapped_ranges_mutex};
-        mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
+        mapped_ranges.Add(addr, size);
     }
     page_manager.OnGpuMap(addr, size);
 }
@@ -1015,7 +1012,7 @@ void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
     page_manager.OnGpuUnmap(addr, size);
     {
         std::scoped_lock lock{mapped_ranges_mutex};
-        mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
+        mapped_ranges.Subtract(addr, size);
     }
 }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 79e7722b8..b4f34edd6 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -3,11 +3,11 @@
 
 #pragma once
 
-#include <boost/icl/interval_set.hpp>
 #include "common/recursive_lock.h"
 #include "common/shared_first_mutex.h"
 #include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/page_manager.h"
+#include "video_core/range_set.h"
 #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
 #include "video_core/texture_cache/texture_cache.h"
 
@@ -43,6 +43,10 @@ public:
         return texture_cache;
     }
 
+    [[nodiscard]] const VideoCore::RangeSet& GetMappedRanges() const noexcept {
+        return mapped_ranges;
+    }
+
     void Draw(bool is_indexed, u32 index_offset = 0);
     void DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 size, u32 max_count,
                       VAddr count_address);
@@ -68,7 +72,8 @@ public:
     void CpSync();
     u64 Flush();
     void Finish();
-    void EndCommandList();
+    void ProcessFaultBuffer();
+    void ProcessDownloadImages();
 
     PipelineCache& GetPipelineCache() {
         return pipeline_cache;
@@ -76,11 +81,8 @@ public:
 
     template <typename Func>
     void ForEachMappedRangeInRange(VAddr addr, u64 size, Func&& func) {
-        const auto range = decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
-        Common::RecursiveSharedLock lock{mapped_ranges_mutex};
-        for (const auto& mapped_range : (mapped_ranges & range)) {
-            func(mapped_range);
-        }
+        Common::RecursiveSharedLock lk(mapped_ranges_mutex);
+        mapped_ranges.ForEachInRange(addr, size, std::forward<Func>(func));
     }
 
 private:
@@ -123,7 +125,7 @@ private:
     VideoCore::TextureCache texture_cache;
     AmdGpu::Liverpool* liverpool;
     Core::MemoryManager* memory;
-    boost::icl::interval_set<VAddr> mapped_ranges;
+    VideoCore::RangeSet mapped_ranges;
     Common::SharedFirstMutex mapped_ranges_mutex;
     PipelineCache pipeline_cache;
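
A note on the two less obvious pieces of this patch: the `RangeSet`/`RangeMap` callbacks now receive `(start, size)` instead of `(start, end)`, and `RegionManager::ForEachModifiedRange` now dispatches on the callback's arity via the extended `Common::FuncTraits` — a three-argument callback is handed a `RegionBits&` clear mask and marks exactly the pages it handled (as `SynchronizeBuffersForDma` does), while two-argument callbacks keep the old clear-whole-range behavior. Below is a minimal, self-contained sketch of that arity dispatch; `ForEachRange` and the `unsigned long` mask are illustrative stand-ins, not the emulator's real types:

```cpp
#include <cstddef>
#include <cstdio>
#include <type_traits>

template <class Func, class = void>
struct FuncTraits;

// Function type: the terminal specialization that exposes the arity.
template <class R, class... Args>
struct FuncTraits<R(Args...)> {
    static constexpr std::size_t NUM_ARGS = sizeof...(Args);
};

// Const member function pointer: what a non-generic lambda's operator() is.
template <class C, class R, class... Args>
struct FuncTraits<R (C::*)(Args...) const> : FuncTraits<R(Args...)> {};

// Catch-all: route any callable with an unambiguous operator() through it.
template <class Func>
struct FuncTraits<Func, std::void_t<decltype(&std::remove_reference_t<Func>::operator())>>
    : FuncTraits<decltype(&std::remove_reference_t<Func>::operator())> {};

// Dispatch on callback arity, mirroring ForEachModifiedRange's optional
// clear-mask parameter.
template <typename F>
void ForEachRange(F&& func) {
    constexpr bool uses_clear_mask = FuncTraits<F>::NUM_ARGS == 3;
    if constexpr (uses_clear_mask) {
        unsigned long clear_mask = 0;        // stand-in for RegionBits
        func(0x1000ul, 0x100ul, clear_mask); // callback marks what it handled
    } else {
        func(0x1000ul, 0x100ul);             // legacy two-argument form
    }
}

int main() {
    ForEachRange([](unsigned long addr, unsigned long size) {
        std::printf("2-arg: addr=%#lx size=%#lx\n", addr, size);
    });
    ForEachRange([](unsigned long addr, unsigned long size, unsigned long& mask) {
        mask |= 1; // pretend only the first page was synchronized
        std::printf("3-arg: addr=%#lx size=%#lx mask=%#lx\n", addr, size, mask);
    });
}
```

In the real patch, the `static_assert(clear, ...)` additionally guarantees the three-argument form is only usable when the range is actually being cleared, so a callback can never fill a clear mask that would then be ignored.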