Faster syncing with ranges

Lander Gallastegi 2025-05-11 11:10:11 +02:00
parent c82c165b29
commit bd31f1d12c
5 changed files with 227 additions and 59 deletions

View File

@@ -3,6 +3,7 @@
#include <algorithm>
#include "common/alignment.h"
#include "common/debug.h"
#include "common/scope_exit.h"
#include "common/types.h"
#include "video_core/amdgpu/liverpool.h"
@@ -131,11 +132,22 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
BufferCache::~BufferCache() = default;
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
const bool is_tracked = IsRegionRegistered(device_addr, size);
if (is_tracked) {
// Mark the page as CPU modified to stop tracking writes.
memory_tracker.MarkRegionAsCpuModified(device_addr, size);
if (unmap) {
return;
}
{
std::scoped_lock lock(dma_sync_ranges_mutex);
// Record the affected range at page granularity; it is synchronized lazily in SynchronizeDmaBuffers.
const VAddr page_addr = Common::AlignDown(device_addr, CACHING_PAGESIZE);
const u64 page_size = Common::AlignUp(device_addr + size, CACHING_PAGESIZE) - page_addr;
dma_sync_ranges.Add(page_addr, page_size);
}
}
}
@@ -371,24 +383,10 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size,
}
bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
const VAddr end_addr = addr + size;
const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE);
for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page].buffer_id;
if (!buffer_id) {
++page;
continue;
}
std::shared_lock lk{slot_buffers_mutex};
Buffer& buffer = slot_buffers[buffer_id];
const VAddr buf_start_addr = buffer.CpuAddr();
const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes();
if (buf_start_addr < end_addr && addr < buf_end_addr) {
return true;
}
page = Common::DivCeil(buf_end_addr, CACHING_PAGESIZE);
}
return false;
// An unaligned base address can spill into one extra page, so derive the page
// count from the end address rather than from the size alone.
const u64 page = addr >> CACHING_PAGEBITS;
const u64 page_count = Common::DivCeil(addr + size, CACHING_PAGESIZE) - page;
return buffer_ranges.Intersects(page, page_count);
}
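The page count above is derived from the end address because a region whose base is not page-aligned can spill into one extra page; sizing the query by DivCeil(size, CACHING_PAGESIZE) alone would miss it. A standalone sketch of the arithmetic, assuming 4 KiB pages purely for illustration (the real granularity comes from CACHING_PAGEBITS):

#include <cassert>
#include <cstdint>

int main() {
    constexpr std::uint64_t page_bits = 12;                // assumed 4 KiB pages, for illustration
    constexpr std::uint64_t page_size = 1ULL << page_bits;
    const std::uint64_t addr = page_size - 8;              // last 8 bytes of page 0
    const std::uint64_t size = 16;                         // spills into page 1
    const std::uint64_t first_page = addr >> page_bits;
    const std::uint64_t end_page = (addr + size + page_size - 1) >> page_bits; // DivCeil on the end address
    assert(end_page - first_page == 2);                    // two pages are actually touched
    assert((size + page_size - 1) / page_size == 1);       // the size alone suggests only one
}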
bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
@@ -577,6 +575,10 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
}
Register(new_buffer_id);
{
std::scoped_lock lk(dma_sync_ranges_mutex);
dma_sync_ranges.Add(overlap.begin, overlap.end - overlap.begin);
}
return new_buffer_id;
}
@@ -704,7 +706,6 @@ void BufferCache::ProcessFaultBuffer() {
// Only create a buffer if the current range doesn't fit in an existing one
FindBuffer(start, static_cast<u32>(end - start));
}
rasterizer.AddDmaSyncRanges(fault_ranges);
});
}
@@ -731,6 +732,11 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
page_table[page].buffer_id = BufferId{};
}
}
if constexpr (insert) {
buffer_ranges.Add(page_begin, page_end - page_begin, buffer_id);
} else {
buffer_ranges.Subtract(page_begin, page_end - page_begin);
}
}
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
@@ -915,6 +921,7 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
}
VAddr device_addr_end = device_addr + size;
ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
RENDERER_TRACE;
// Note that this function synchronizes the whole buffer, not just the range.
// This is because this function is used to sync buffers before using a
// shader that uses DMA.
@@ -924,6 +931,16 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
});
}
void BufferCache::SynchronizeDmaBuffers() {
RENDERER_TRACE;
std::scoped_lock lk(dma_sync_ranges_mutex);
dma_sync_ranges.ForEach([&](VAddr device_addr, u64 end_addr) {
RENDERER_TRACE;
SynchronizeBuffersInRange(device_addr, end_addr - device_addr);
});
dma_sync_ranges.Clear();
}
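SynchronizeDmaBuffers is the drain side of the scheme this commit introduces: invalidation only records a range under a small lock, and the expensive buffer synchronization runs once, just before a shader that uses DMA executes. A toy model of that flow, illustrative only (the std::vector stand-in and the names are not project code; the real implementation coalesces ranges through RangeSet):

#include <cstdint>
#include <mutex>
#include <utility>
#include <vector>

struct DeferredDmaSync {
    using VAddr = std::uint64_t;

    // Invalidation path: cheap, just remembers the dirty range.
    void OnCpuWrite(VAddr addr, std::uint64_t size) {
        std::scoped_lock lk{mutex};
        pending.emplace_back(addr, size); // RangeSet::Add in the real code
    }

    // Called right before dispatching a DMA-capable shader.
    template <typename SyncFn>
    void Drain(SyncFn&& sync) {
        std::scoped_lock lk{mutex};
        for (const auto& [addr, size] : pending) {
            sync(addr, size); // SynchronizeBuffersInRange in the real code
        }
        pending.clear();      // RangeSet::Clear in the real code
    }

    std::mutex mutex;
    std::vector<std::pair<VAddr, std::uint64_t>> pending;
};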
void BufferCache::MemoryBarrier() {
// Vulkan doesn't know which buffer we access in a shader if we use
// BufferDeviceAddress. We need a full memory barrier.

View File

@@ -98,7 +98,7 @@ public:
}
/// Invalidates any buffer in the logical page range.
void InvalidateMemory(VAddr device_addr, u64 size);
void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
/// Binds host vertex buffers for the current draw.
void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@@ -139,25 +139,21 @@ public:
/// Synchronizes all buffers in the specified range.
void SynchronizeBuffersInRange(VAddr device_addr, u64 size);
/// Synchronizes all buffers needed for DMA.
void SynchronizeDmaBuffers();
/// Record memory barrier. Used for buffers when accessed via BDA.
void MemoryBarrier();
private:
template <typename Func>
void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
for (u64 page = device_addr >> CACHING_PAGEBITS; page < page_end;) {
const BufferId buffer_id = page_table[page].buffer_id;
if (!buffer_id) {
++page;
continue;
}
Buffer& buffer = slot_buffers[buffer_id];
func(buffer_id, buffer);
const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes();
page = Common::DivCeil(end_addr, CACHING_PAGESIZE);
}
const u64 page = device_addr >> CACHING_PAGEBITS;
const u64 page_count = Common::DivCeil(device_addr + size, CACHING_PAGESIZE) - page;
buffer_ranges.ForEachInRange(page, page_count, [&](u64 page_start, u64 page_end, BufferId id) {
Buffer& buffer = slot_buffers[id];
func(id, buffer);
});
}
void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size);
@@ -199,7 +195,10 @@ private:
Buffer fault_buffer;
std::shared_mutex slot_buffers_mutex;
Common::SlotVector<Buffer> slot_buffers;
std::shared_mutex dma_sync_ranges_mutex;
RangeSet dma_sync_ranges;
RangeSet gpu_modified_ranges;
SplitRangeMap<BufferId> buffer_ranges;
MemoryTracker memory_tracker;
PageTable page_table;
vk::UniqueDescriptorSetLayout fault_process_desc_layout;

View File

@@ -4,6 +4,9 @@
#pragma once
#include <boost/icl/interval_map.hpp>
#include <boost/icl/split_interval_map.hpp>
#include <boost/icl/split_interval_set.hpp>
#include <boost/icl/discrete_interval.hpp>
#include <boost/pool/pool.hpp>
#include <boost/pool/pool_alloc.hpp>
#include <boost/pool/poolfwd.hpp>
@@ -38,6 +41,22 @@ struct RangeSet {
m_ranges_set.subtract(interval);
}
void Clear() {
m_ranges_set.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_set, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_set, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_set.empty()) {
@@ -77,14 +96,29 @@
}
}
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
base_addr = range_end;
});
if (base_addr != end_addr) {
func(base_addr, end_addr - base_addr);
}
}
IntervalSet m_ranges_set;
};
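RangeSet wraps a boost::icl interval set; assuming the joining interval_set flavour, ranges added by successive invalidations coalesce automatically, so the later ForEach in SynchronizeDmaBuffers only walks a handful of merged intervals. A minimal standalone illustration of that coalescing (not project code):

#include <boost/icl/interval_set.hpp>
#include <cstdint>
#include <iostream>

int main() {
    using VAddr = std::uint64_t;
    boost::icl::interval_set<VAddr> ranges;
    ranges += boost::icl::interval<VAddr>::right_open(0x1000, 0x2000);
    ranges += boost::icl::interval<VAddr>::right_open(0x2000, 0x3000); // touches the previous range
    ranges += boost::icl::interval<VAddr>::right_open(0x1800, 0x2800); // overlaps both
    // Prints 1: everything merged into the single interval [0x1000, 0x3000).
    std::cout << boost::icl::interval_count(ranges) << '\n';
}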
template <typename T>
class RangeMap {
public:
using IntervalMap =
boost::icl::interval_map<VAddr, u64, boost::icl::partial_absorber, std::less,
boost::icl::inplace_plus, boost::icl::inter_section,
boost::icl::interval_map<VAddr, T, boost::icl::total_absorber, std::less,
boost::icl::inplace_identity, boost::icl::inter_section,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalMap::interval_type;
@@ -99,7 +133,7 @@
RangeMap(RangeMap&& other);
RangeMap& operator=(RangeMap&& other);
void Add(VAddr base_address, size_t size, u64 value) {
void Add(VAddr base_address, size_t size, const T& value) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map.add({interval, value});
@@ -111,6 +145,35 @@
m_ranges_map -= interval;
}
void Clear() {
m_ranges_map.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_map, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_map, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
for (const auto& [interval, value] : m_ranges_map) {
const VAddr inter_addr_end = interval.upper();
const VAddr inter_addr = interval.lower();
func(inter_addr, inter_addr_end, value);
}
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
@@ -140,7 +203,112 @@ public:
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, u64) {
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
base_addr = range_end;
});
if (base_addr != end_addr) {
func(base_addr, end_addr - base_addr);
}
}
private:
IntervalMap m_ranges_map;
};
template <typename T>
class SplitRangeMap {
public:
using IntervalMap =
boost::icl::split_interval_map<VAddr, T, boost::icl::total_absorber, std::less,
boost::icl::inplace_identity, boost::icl::inter_section,
ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, std::less),
RangeSetsAllocator>;
using IntervalType = typename IntervalMap::interval_type;
public:
SplitRangeMap() = default;
~SplitRangeMap() = default;
SplitRangeMap(SplitRangeMap const&) = delete;
SplitRangeMap& operator=(SplitRangeMap const&) = delete;
SplitRangeMap(SplitRangeMap&& other);
SplitRangeMap& operator=(SplitRangeMap&& other);
void Add(VAddr base_address, size_t size, const T& value) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map.add({interval, value});
}
void Subtract(VAddr base_address, size_t size) {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
m_ranges_map -= interval;
}
void Clear() {
m_ranges_map.clear();
}
bool Contains(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::contains(m_ranges_map, interval);
}
bool Intersects(VAddr base_address, size_t size) const {
const VAddr end_address = base_address + size;
IntervalType interval{base_address, end_address};
return boost::icl::intersects(m_ranges_map, interval);
}
template <typename Func>
void ForEach(Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
for (const auto& [interval, value] : m_ranges_map) {
const VAddr inter_addr_end = interval.upper();
const VAddr inter_addr = interval.lower();
func(inter_addr, inter_addr_end, value);
}
}
template <typename Func>
void ForEachInRange(VAddr base_addr, size_t size, Func&& func) const {
if (m_ranges_map.empty()) {
return;
}
const VAddr start_address = base_addr;
const VAddr end_address = start_address + size;
const IntervalType search_interval{start_address, end_address};
auto it = m_ranges_map.lower_bound(search_interval);
if (it == m_ranges_map.end()) {
return;
}
auto end_it = m_ranges_map.upper_bound(search_interval);
for (; it != end_it; it++) {
VAddr inter_addr_end = it->first.upper();
VAddr inter_addr = it->first.lower();
if (inter_addr_end > end_address) {
inter_addr_end = end_address;
}
if (inter_addr < start_address) {
inter_addr = start_address;
}
func(inter_addr, inter_addr_end, it->second);
}
}
template <typename Func>
void ForEachNotInRange(VAddr base_addr, size_t size, Func&& func) const {
const VAddr end_addr = base_addr + size;
ForEachInRange(base_addr, size, [&](VAddr range_addr, VAddr range_end, const T&) {
if (size_t gap_size = range_addr - base_addr; gap_size != 0) {
func(base_addr, gap_size);
}
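The split_interval_map behind SplitRangeMap differs from the joining interval_map above in one important way: the borders of inserted intervals are preserved rather than merged with equal-valued neighbours, which is presumably what lets buffer_ranges be added and subtracted at per-buffer granularity during registration and unregistration. A standalone comparison, illustrative only:

#include <boost/icl/interval_map.hpp>
#include <boost/icl/split_interval_map.hpp>
#include <cstdint>
#include <iostream>
#include <utility>

int main() {
    using Page = std::uint64_t;
    const auto first = boost::icl::interval<Page>::right_open(0, 16);
    const auto second = boost::icl::interval<Page>::right_open(16, 32); // adjacent, same value

    boost::icl::interval_map<Page, int> joining;
    joining += std::make_pair(first, 7);
    joining += std::make_pair(second, 7);

    boost::icl::split_interval_map<Page, int> splitting;
    splitting += std::make_pair(first, 7);
    splitting += std::make_pair(second, 7);

    std::cout << boost::icl::interval_count(joining) << '\n';   // 1: merged into [0, 32)
    std::cout << boost::icl::interval_count(splitting) << '\n'; // 2: borders are kept
}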

View File

@@ -477,10 +477,9 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
if (uses_dma) {
// We only use the fault buffer for DMA right now.
{
Common::RecursiveSharedLock lock(mapped_ranges_mutex);
for (const auto& range : dma_sync_mapped_ranges) {
buffer_cache.SynchronizeBuffersInRange(range.lower(), range.upper() - range.lower());
}
// We don't want the mapped ranges to be modified while we are syncing
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
buffer_cache.SynchronizeDmaBuffers();
}
buffer_cache.MemoryBarrier();
}
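buffer_cache.MemoryBarrier() records the conservative barrier described in buffer_cache.cpp above: with BufferDeviceAddress the driver cannot know which buffers a shader will touch, so all prior writes have to be made visible. A sketch of what such a full barrier can look like with vulkan-hpp synchronization2 (the function name and the stage/access choices are illustrative assumptions, not the project's exact code):

#include <vulkan/vulkan.hpp>

void RecordFullMemoryBarrier(vk::CommandBuffer cmdbuf) {
    // All prior writes from any stage become visible to any later read or write.
    const auto barrier = vk::MemoryBarrier2{}
                             .setSrcStageMask(vk::PipelineStageFlagBits2::eAllCommands)
                             .setSrcAccessMask(vk::AccessFlagBits2::eMemoryWrite)
                             .setDstStageMask(vk::PipelineStageFlagBits2::eAllCommands)
                             .setDstAccessMask(vk::AccessFlagBits2::eMemoryRead |
                                               vk::AccessFlagBits2::eMemoryWrite);
    cmdbuf.pipelineBarrier2(vk::DependencyInfo{}.setMemoryBarriers(barrier));
}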
@@ -726,14 +725,6 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin
}
}
void Rasterizer::AddDmaSyncRanges(const boost::icl::interval_set<VAddr>& ranges) {
dma_sync_ranges += ranges;
{
std::scoped_lock lock(mapped_ranges_mutex);
dma_sync_mapped_ranges = mapped_ranges & dma_sync_ranges;
}
}
void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& state) {
int cb_index = 0;
for (auto attach_idx = 0u; attach_idx < state.num_color_attachments; ++attach_idx) {
@@ -964,7 +955,7 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) {
// Not GPU mapped memory, can skip invalidation logic entirely.
return false;
}
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, false);
texture_cache.InvalidateMemory(addr, size);
return true;
}
@@ -984,19 +975,17 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) {
{
std::scoped_lock lock{mapped_ranges_mutex};
mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
dma_sync_mapped_ranges = mapped_ranges & dma_sync_ranges;
}
page_manager.OnGpuMap(addr, size);
}
void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
buffer_cache.InvalidateMemory(addr, size);
buffer_cache.InvalidateMemory(addr, size, true);
texture_cache.UnmapMemory(addr, size);
page_manager.OnGpuUnmap(addr, size);
{
std::scoped_lock lock{mapped_ranges_mutex};
mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
dma_sync_mapped_ranges -= decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
}
}

View File

@@ -107,9 +107,6 @@ private:
}
bool IsComputeMetaClear(const Pipeline* pipeline);
void AddDmaSyncRanges(const boost::icl::interval_set<VAddr>& ranges);
private:
friend class VideoCore::BufferCache;
@@ -121,8 +118,6 @@ private:
AmdGpu::Liverpool* liverpool;
Core::MemoryManager* memory;
boost::icl::interval_set<VAddr> mapped_ranges;
boost::icl::interval_set<VAddr> dma_sync_ranges;
boost::icl::interval_set<VAddr> dma_sync_mapped_ranges;
std::shared_mutex mapped_ranges_mutex;
PipelineCache pipeline_cache;