diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp
index b880f5ec3..f0faca4f6 100644
--- a/src/video_core/buffer_cache/buffer.cpp
+++ b/src/video_core/buffer_cache/buffer.cpp
@@ -70,8 +70,10 @@ UniqueBuffer::~UniqueBuffer() {
 void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
                           VmaAllocationInfo* out_alloc_info) {
+    const bool with_bda = bool(buffer_ci.usage & vk::BufferUsageFlagBits::eShaderDeviceAddress);
+    const VmaAllocationCreateFlags bda_flag = with_bda ? VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT : 0;
     const VmaAllocationCreateInfo alloc_ci = {
-        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
+        .flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | bda_flag | MemoryUsageVmaFlags(usage),
         .usage = MemoryUsageVma(usage),
         .requiredFlags = 0,
         .preferredFlags = MemoryUsagePreferredVmaFlags(usage),
@@ -86,6 +88,15 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa
     ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
                vk::to_string(vk::Result{result}));
     buffer = vk::Buffer{unsafe_buffer};
+
+    if (with_bda) {
+        vk::BufferDeviceAddressInfo bda_info{
+            .buffer = buffer,
+        };
+        auto bda_result = device.getBufferAddress(bda_info);
+        ASSERT_MSG(bda_result != 0, "Failed to get buffer device address");
+        bda_addr = bda_result;
+    }
 }
 
 Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,
diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h
index 3842f20c4..e81b05671 100644
--- a/src/video_core/buffer_cache/buffer.h
+++ b/src/video_core/buffer_cache/buffer.h
@@ -68,6 +68,7 @@ struct UniqueBuffer {
     VmaAllocator allocator;
     VmaAllocation allocation;
    vk::Buffer buffer{};
+    vk::DeviceAddress bda_addr = 0;
 };
 
 class Buffer {
@@ -115,6 +116,11 @@ public:
         return buffer;
     }
 
+    vk::DeviceAddress BufferDeviceAddress() const noexcept {
+        ASSERT_MSG(buffer.bda_addr != 0, "Can't get BDA from a non-BDA buffer");
+        return buffer.bda_addr;
+    }
+
     std::optional<vk::BufferMemoryBarrier2> GetBarrier(
         vk::Flags<vk::AccessFlagBits2> dst_acess_mask, vk::PipelineStageFlagBits2 dst_stage,
         u32 offset = 0) {
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 877770bc7..db35099f8 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -27,6 +27,7 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
       gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
       bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, BDA_PAGETABLE_SIZE},
+      fault_readback_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_READBACK_SIZE),
       memory_tracker{&tracker} {
     Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
@@ -323,36 +324,36 @@ BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
     return CreateBuffer(device_addr, size);
 }
 
-void BufferCache::QueueCoverage(VAddr device_addr, u64 size) {
+void BufferCache::QueueMemoryImport(VAddr device_addr, u64 size) {
     std::scoped_lock lk{mutex};
     const u64 start = device_addr;
     const u64 end = device_addr + size;
-    auto queue_range = decltype(covered_regions)::interval_type::right_open(start, end);
-    queued_coverage += queue_range;
+    auto queue_range = decltype(imported_regions)::interval_type::right_open(start, end);
+    queued_imports += queue_range;
 }
 
-void BufferCache::CoverQueuedRegions() {
+void BufferCache::ImportQueuedRegions() {
     std::scoped_lock lk{mutex};
-    if (queued_coverage.empty()) {
+    if (queued_imports.empty()) {
         return;
     }
-    for (const auto& range : queued_coverage) {
-        CoverMemory(range.lower(), range.upper());
+    for (const auto& range : queued_imports) {
+        ImportMemory(range.lower(), range.upper());
     }
-    queued_coverage.clear();
+    queued_imports.clear();
 }
 
-void BufferCache::CoverMemory(u64 start, u64 end) {
+void BufferCache::ImportMemory(u64 start, u64 end) {
     const u64 page_start = start >> CACHING_PAGEBITS;
     const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE);
-    auto interval = decltype(covered_regions)::interval_type::right_open(page_start, page_end);
+    auto interval = decltype(imported_regions)::interval_type::right_open(page_start, page_end);
     auto interval_set = boost::icl::interval_set{interval};
-    auto uncovered_ranges = interval_set - covered_regions;
+    auto uncovered_ranges = interval_set - imported_regions;
     if (uncovered_ranges.empty()) {
         return;
     }
     // We fill any holes within the given range
-    boost::container::small_vector<u64, 128> bda_addrs;
+    boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
     for (const auto& range : uncovered_ranges) {
         // import host memory
         const u64 range_start = range.lower();
@@ -371,19 +372,19 @@ void BufferCache::CoverMemory(u64 start, u64 end) {
         bda_addrs.clear();
         bda_addrs.reserve(range_pages);
         for (u64 i = 0; i < range_pages; ++i) {
-            // TODO: we may want to mark the page as host imported
-            // to let the shader know so that it can notify us if it
-            // accesses the page, so we can create a GPU local buffer.
+            // Mark the page as host imported to let the shader know,
+            // so that it can notify us when it accesses the page and we
+            // can create a GPU-local buffer for it.
             bda_addrs.push_back((bda_addr + (i << CACHING_PAGEBITS)) | 0x1);
         }
-        WriteDataBuffer(bda_pagetable_buffer, range_start * sizeof(u64), bda_addrs.data(),
-                        bda_addrs.size() * sizeof(u64));
+        WriteDataBuffer(bda_pagetable_buffer, range_start * sizeof(vk::DeviceAddress), bda_addrs.data(),
+                        bda_addrs.size() * sizeof(vk::DeviceAddress));
         {
             std::scoped_lock lk{mutex};
             imported_buffers.emplace_back(std::move(buffer));
         }
         // Mark the pages as covered
-        covered_regions += range;
+        imported_regions += range;
     }
 }
 
@@ -525,9 +526,25 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     const BufferId new_buffer_id = [&] {
         std::scoped_lock lk{mutex};
         return slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin,
-                                   AllFlags, size);
+                                   AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size);
     }();
     auto& new_buffer = slot_buffers[new_buffer_id];
+    boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
+    const u64 start_page = overlap.begin >> CACHING_PAGEBITS;
+    const u64 size_pages = size >> CACHING_PAGEBITS;
+    bda_addrs.reserve(size_pages);
+    for (u64 i = 0; i < size_pages; ++i) {
+        // Here, we do not set the host imported bit.
+        bda_addrs.push_back(new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS));
+    }
+    WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
+                    bda_addrs.size() * sizeof(vk::DeviceAddress));
+    {
+        // Mark the pages as covered
+        std::scoped_lock lk{mutex};
+        imported_regions += boost::icl::interval_set<u64>::interval_type::right_open(
+            start_page, start_page + size_pages);
+    }
     const size_t size_bytes = new_buffer.SizeBytes();
     const auto cmdbuf = scheduler.CommandBuffer();
     scheduler.EndRendering();
@@ -539,6 +556,44 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     return new_buffer_id;
 }
 
+void BufferCache::CreateFaultBuffers() {
+    // Download the fault readback buffer
+    const auto [mapped, offset] = staging_buffer.Map(FAULT_READBACK_SIZE);
+    vk::BufferCopy copy = {
+        .srcOffset = 0,
+        .dstOffset = offset,
+        .size = FAULT_READBACK_SIZE,
+    };
+    staging_buffer.Commit();
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.copyBuffer(fault_readback_buffer.buffer, staging_buffer.Handle(), copy);
+    scheduler.Finish();
+    std::array<vk::DeviceAddress, FAULT_READBACK_SIZE / sizeof(vk::DeviceAddress)> buffer{};
+    std::memcpy(buffer.data(), mapped, FAULT_READBACK_SIZE);
+    // Reset the fault readback buffer
+    cmdbuf.fillBuffer(fault_readback_buffer.buffer, 0, FAULT_READBACK_SIZE, 0);
+    // Create the fault buffers batched
+    boost::icl::interval_set<VAddr> fault_ranges;
+    for (u64 i = 0; i < FAULT_READBACK_SIZE / sizeof(vk::DeviceAddress); ++i) {
+        if (buffer[i] != 0) {
+            // Each byte contains information for 8 pages.
+            // We are going to create an aligned buffer of
+            // 8 * 64 KB = 512 KB around the fault address.
+            const VAddr fault_addr = buffer[i] << CACHING_PAGEBITS;
+            const u32 fault_end = mapped[i + 1] << CACHING_PAGEBITS;
+            auto range = decltype(fault_ranges)::interval_type::right_open(
+                fault_addr, fault_end);
+            fault_ranges += range;
+        }
+    }
+    for (const auto& range : fault_ranges) {
+        const VAddr start = range.lower();
+        const u32 size = range.upper() - start;
+        CreateBuffer(start, size);
+    }
+}
+
 void BufferCache::Register(BufferId buffer_id) {
     ChangeRegister(buffer_id);
 }
@@ -740,6 +795,18 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
     return true;
 }
 
+void BufferCache::SynchronizeRange(VAddr device_addr, u32 size) {
+    if (device_addr == 0) {
+        return;
+    }
+    ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
+        if (buffer.is_deleted) {
+            return;
+        }
+        SynchronizeBuffer(buffer, buffer.CpuAddr(), buffer.SizeBytes(), false);
+    });
+}
+
 void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes) {
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 74d45005f..b970d8acb 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -43,7 +43,9 @@ public:
     static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
     static constexpr u64 DEVICE_PAGESIZE = 64_KB;
     static constexpr u64 CACHING_NUMPAGES = u64{1} << (40 - CACHING_PAGEBITS);
-    static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(u64);
+
+    static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
+    static constexpr u64 FAULT_READBACK_SIZE = CACHING_NUMPAGES / 8; // One bit per page
 
     struct Traits {
         using Entry = BufferId;
@@ -81,6 +83,11 @@ public:
         return bda_pagetable_buffer;
     }
 
+    /// Retrieves the fault readback buffer.
+    [[nodiscard]] Buffer& GetFaultReadbackBuffer() noexcept {
+        return fault_readback_buffer;
+    }
+
     /// Retrieves the buffer with the specified id.
     [[nodiscard]] Buffer& GetBuffer(BufferId id) {
         return slot_buffers[id];
     }
@@ -123,10 +130,16 @@ public:
     [[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
 
     /// Queue a region for coverage for DMA.
-    void QueueCoverage(VAddr device_addr, u64 size);
+    void QueueMemoryImport(VAddr device_addr, u64 size);
 
     /// Covers all queued regions.
-    void CoverQueuedRegions();
+    void ImportQueuedRegions();
+
+    /// Creates buffers for "faulted" shader accesses to host memory.
+    void CreateFaultBuffers();
+
+    /// Synchronizes all buffers in the specified range.
+    void SynchronizeRange(VAddr device_addr, u32 size);
 
 private:
     template <typename Func>
@@ -171,7 +184,7 @@ private:
 
     void DeleteBuffer(BufferId buffer_id);
 
-    void CoverMemory(u64 start, u64 end);
+    void ImportMemory(u64 start, u64 end);
 
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
@@ -183,8 +196,9 @@ private:
     StreamBuffer stream_buffer;
     Buffer gds_buffer;
     Buffer bda_pagetable_buffer;
-    boost::icl::interval_set<VAddr> queued_coverage;
-    boost::icl::interval_set<u64> covered_regions;
+    Buffer fault_readback_buffer;
+    boost::icl::interval_set<VAddr> queued_imports;
+    boost::icl::interval_set<u64> imported_regions;
     std::vector<ImportedHostBuffer> imported_buffers;
     std::shared_mutex mutex;
     Common::SlotVector<Buffer> slot_buffers;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index c12e3335e..c91ece24a 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -946,7 +946,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) {
         mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
     }
     page_manager.OnGpuMap(addr, size);
-    buffer_cache.QueueCoverage(addr, size);
+    buffer_cache.QueueMemoryImport(addr, size);
 }
 
 void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
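
For reviewers, a sketch of how the new entry points fit together. The only call site this diff adds is buffer_cache.QueueMemoryImport() in Rasterizer::MapMemory; the per-submit ordering and the SubmitWithDma helper below are assumptions for illustration, not part of the change.

// Assumed driver-side flow (illustrative only; SubmitWithDma is a hypothetical helper).
void SubmitWithDma(VideoCore::BufferCache& buffer_cache) {
    // Fill the BDA page table for guest memory mapped since the last submit.
    // ImportMemory tags these entries with bit 0 to mark them as host imported.
    buffer_cache.ImportQueuedRegions();

    // ... record and submit GPU work that may address memory through the page table ...

    // Read back the fault bitmap written for host-imported pages that were touched,
    // then create GPU-local buffers over the faulted ranges.
    buffer_cache.CreateFaultBuffers();
}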
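
The page-table entry layout implied by ImportMemory and CreateBuffer is one vk::DeviceAddress per 64 KiB page, with bit 0 set only for host-imported pages. The helpers below are invented for illustration (they do not exist in the tree) and simply make that encoding explicit.

// Illustrative only: decoding a BDA page-table entry as written by this change.
constexpr u64 HostImportedBit = 1;

constexpr vk::DeviceAddress EntryAddress(vk::DeviceAddress entry) {
    return entry & ~HostImportedBit; // device address of the page's backing memory
}

constexpr bool EntryIsHostImported(vk::DeviceAddress entry) {
    return (entry & HostImportedBit) != 0; // shader is expected to record a fault for these pages
}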