video_core: garbage collector (part 1) (#3350)

* Memory information

* Buffer cache GC

* Texture cache GC

* Fix ChangeRegister

* Better image touching

* Buffer async download on GC destroy

* Handle image download, SKIP NON-LINEAR WORKAROUND

* Only download when not dirty

* Correctly handle BDA pagefile update

* Restructure ChangeRegistration
Lander Gallastegi authored 2025-08-06 12:30:13 +02:00, committed by GitHub
parent 2f701311f2, commit 841aa9e43d
14 changed files with 508 additions and 29 deletions

src/video_core/buffer_cache/buffer.h

@@ -112,6 +112,14 @@ public:
         return size_bytes;
     }
 
+    void SetLRUId(u64 id) noexcept {
+        lru_id = id;
+    }
+
+    u64 LRUId() const noexcept {
+        return lru_id;
+    }
+
     vk::Buffer Handle() const noexcept {
         return buffer;
     }
@@ -151,6 +159,7 @@ public:
     bool is_deleted{};
     int stream_score = 0;
     size_t size_bytes = 0;
+    u64 lru_id = 0;
     std::span<u8> mapped_data;
     const Vulkan::Instance* instance;
     Vulkan::Scheduler* scheduler;

src/video_core/buffer_cache/buffer_cache.cpp

@@ -130,6 +130,26 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
                                               "Fault Buffer Parser Pipeline");
     instance.GetDevice().destroyShaderModule(module);
 
+    // Set up garbage collection parameters
+    if (!instance.CanReportMemoryUsage()) {
+        trigger_gc_memory = DEFAULT_TRIGGER_GC_MEMORY;
+        critical_gc_memory = DEFAULT_CRITICAL_GC_MEMORY;
+        return;
+    }
+    const s64 device_local_memory = static_cast<s64>(instance.GetTotalMemoryBudget());
+    const s64 min_spacing_expected = device_local_memory - 1_GB;
+    const s64 min_spacing_critical = device_local_memory - 512_MB;
+    const s64 mem_threshold = std::min<s64>(device_local_memory, TARGET_GC_THRESHOLD);
+    const s64 min_vacancy_expected = (6 * mem_threshold) / 10;
+    const s64 min_vacancy_critical = (2 * mem_threshold) / 10;
+    trigger_gc_memory = static_cast<u64>(
+        std::max<s64>(std::min(device_local_memory - min_vacancy_expected, min_spacing_expected),
+                      DEFAULT_TRIGGER_GC_MEMORY));
+    critical_gc_memory = static_cast<u64>(
+        std::max<s64>(std::min(device_local_memory - min_vacancy_critical, min_spacing_critical),
+                      DEFAULT_CRITICAL_GC_MEMORY));
 }
 
 BufferCache::~BufferCache() = default;
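For a sense of scale: on a device reporting an 8 GiB budget, mem_threshold = min(8 GiB, TARGET_GC_THRESHOLD) = 8 GiB, so min_vacancy_expected = 4.8 GiB and min_vacancy_critical = 1.6 GiB; that gives trigger_gc_memory = max(min(8 - 4.8, 7), 1) GiB = 3.2 GiB and critical_gc_memory = max(min(8 - 1.6, 7.5), 2) GiB = 6.4 GiB. Collection therefore starts once tracked memory passes 3.2 GiB and turns aggressive past 6.4 GiB; the DEFAULT_* floors only bind on very small budgets.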
@@ -145,10 +165,11 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
 void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
     liverpool->SendCommand<true>([this, device_addr, size, is_write] {
         Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
-        DownloadBufferMemory(buffer, device_addr, size, is_write);
+        DownloadBufferMemory<false>(buffer, device_addr, size, is_write);
     });
 }
 
+template <bool async>
 void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write) {
     boost::container::small_vector<vk::BufferCopy, 1> copies;
     u64 total_size_bytes = 0;
@@ -183,17 +204,24 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
-    scheduler.Finish();
-    auto* memory = Core::Memory::Instance();
-    for (const auto& copy : copies) {
-        const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
-        const u64 dst_offset = copy.dstOffset - offset;
-        memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
-                                copy.size);
-    }
-    memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
-    if (is_write) {
-        memory_tracker->MarkRegionAsCpuModified(device_addr, size);
-    }
+    const auto write_data = [&]() {
+        auto* memory = Core::Memory::Instance();
+        for (const auto& copy : copies) {
+            const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
+            const u64 dst_offset = copy.dstOffset - offset;
+            memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
+                                    copy.size);
+        }
+        memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
+        if (is_write) {
+            memory_tracker->MarkRegionAsCpuModified(device_addr, size);
+        }
+    };
+    if constexpr (async) {
+        scheduler.DeferOperation(write_data);
+    } else {
+        scheduler.Finish();
+        write_data();
+    }
 }
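One caveat on the new async path: write_data captures by reference, which is only safe if the deferred operation cannot outlive this stack frame (copies, offset and download are locals). If DeferOperation may run the callback after DownloadBufferMemory has returned, the callback needs to own its state. A minimal by-value sketch of the same logic (not the commit's code; it also assumes the staging allocation behind download stays mapped until the deferred op runs):

    scheduler.DeferOperation([this, cpu_addr = buffer.CpuAddr(), download, offset,
                              device_addr, size, is_write,
                              copies = std::move(copies)] {
        // Everything the callback touches is captured by value or moved,
        // so nothing dangles once the enclosing function returns.
        auto* memory = Core::Memory::Instance();
        for (const auto& copy : copies) {
            memory->TryWriteBacking(std::bit_cast<u8*>(cpu_addr + copy.srcOffset),
                                    download + (copy.dstOffset - offset), copy.size);
        }
        memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
        if (is_write) {
            memory_tracker->MarkRegionAsCpuModified(device_addr, size);
        }
    });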
@@ -647,16 +675,6 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
                           AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size);
     }();
     auto& new_buffer = slot_buffers[new_buffer_id];
-    boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
-    const u64 start_page = overlap.begin >> CACHING_PAGEBITS;
-    const u64 size_pages = size >> CACHING_PAGEBITS;
-    bda_addrs.reserve(size_pages);
-    for (u64 i = 0; i < size_pages; ++i) {
-        vk::DeviceAddress addr = new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS);
-        bda_addrs.push_back(addr);
-    }
-    WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
-                    bda_addrs.size() * sizeof(vk::DeviceAddress));
     const size_t size_bytes = new_buffer.SizeBytes();
     const auto cmdbuf = scheduler.CommandBuffer();
     scheduler.EndRendering();
@@ -807,6 +825,7 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
     const VAddr device_addr_end = device_addr_begin + size;
     const u64 page_begin = device_addr_begin / CACHING_PAGESIZE;
     const u64 page_end = Common::DivCeil(device_addr_end, CACHING_PAGESIZE);
+    const u64 size_pages = page_end - page_begin;
     for (u64 page = page_begin; page != page_end; ++page) {
         if constexpr (insert) {
             page_table[page].buffer_id = buffer_id;
@@ -815,8 +834,22 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
         }
     }
     if constexpr (insert) {
+        total_used_memory += Common::AlignUp(size, CACHING_PAGESIZE);
+        buffer.SetLRUId(lru_cache.Insert(buffer_id, gc_tick));
+        boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
+        bda_addrs.reserve(size_pages);
+        for (u64 i = 0; i < size_pages; ++i) {
+            vk::DeviceAddress addr = buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS);
+            bda_addrs.push_back(addr);
+        }
+        WriteDataBuffer(bda_pagetable_buffer, page_begin * sizeof(vk::DeviceAddress),
+                        bda_addrs.data(), bda_addrs.size() * sizeof(vk::DeviceAddress));
         buffer_ranges.Add(buffer.CpuAddr(), buffer.SizeBytes(), buffer_id);
     } else {
+        total_used_memory -= Common::AlignUp(size, CACHING_PAGESIZE);
+        lru_cache.Free(buffer.LRUId());
+        FillBuffer(bda_pagetable_buffer, page_begin * sizeof(vk::DeviceAddress),
+                   size_pages * sizeof(vk::DeviceAddress), 0);
         buffer_ranges.Subtract(buffer.CpuAddr(), buffer.SizeBytes());
     }
 }
@@ -874,6 +907,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
         .bufferMemoryBarrierCount = 1,
         .pBufferMemoryBarriers = &post_barrier,
     });
+    TouchBuffer(buffer);
 }
 
 vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span<vk::BufferCopy> copies,
@@ -1154,6 +1188,70 @@ void BufferCache::WriteDataBuffer(Buffer& buffer, VAddr address, const void* val
     });
 }
 
+void BufferCache::FillBuffer(Buffer& buffer, VAddr address, u32 num_bytes, u32 value) {
+    scheduler.EndRendering();
+    ASSERT_MSG(num_bytes % 4 == 0, "FillBuffer size must be a multiple of 4 bytes");
+    const auto cmdbuf = scheduler.CommandBuffer();
+    const vk::BufferMemoryBarrier2 pre_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .buffer = buffer.Handle(),
+        .offset = buffer.Offset(address),
+        .size = num_bytes,
+    };
+    const vk::BufferMemoryBarrier2 post_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
+        .buffer = buffer.Handle(),
+        .offset = buffer.Offset(address),
+        .size = num_bytes,
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &pre_barrier,
+    });
+    cmdbuf.fillBuffer(buffer.Handle(), buffer.Offset(address), num_bytes, value);
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &post_barrier,
+    });
+}
+
+void BufferCache::RunGarbageCollector() {
+    SCOPE_EXIT {
+        ++gc_tick;
+    };
+    if (instance.CanReportMemoryUsage()) {
+        total_used_memory = instance.GetDeviceMemoryUsage();
+    }
+    if (total_used_memory < trigger_gc_memory) {
+        return;
+    }
+    const bool aggressive = total_used_memory >= critical_gc_memory;
+    const u64 ticks_to_destroy = std::min<u64>(aggressive ? 80 : 160, gc_tick);
+    int max_deletions = aggressive ? 64 : 32;
+    const auto clean_up = [&](BufferId buffer_id) {
+        if (max_deletions == 0) {
+            return;
+        }
+        --max_deletions;
+        Buffer& buffer = slot_buffers[buffer_id];
+        // InvalidateMemory(buffer.CpuAddr(), buffer.SizeBytes());
+        DownloadBufferMemory<true>(buffer, buffer.CpuAddr(), buffer.SizeBytes(), true);
+        DeleteBuffer(buffer_id);
+    };
+    lru_cache.ForEachItemBelow(gc_tick - ticks_to_destroy, clean_up);
+}
+
+void BufferCache::TouchBuffer(const Buffer& buffer) {
+    lru_cache.Touch(buffer.LRUId(), gc_tick);
+}
+
+void BufferCache::DeleteBuffer(BufferId buffer_id) {
+    Buffer& buffer = slot_buffers[buffer_id];
+    Unregister(buffer_id);

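Putting the numbers together: a normal pass visits LRU entries whose last touch is at least 160 gc_ticks old and frees at most 32 buffers per run, while an aggressive pass (past critical_gc_memory) shortens the window to 80 ticks and raises the cap to 64. For example, at gc_tick = 1000 under normal pressure, only buffers not touched since tick 840 are eviction candidates, and each one is asynchronously downloaded back to guest memory before deletion so GPU-modified data is not lost.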
src/video_core/buffer_cache/buffer_cache.h

@@ -5,7 +5,7 @@
 #include <shared_mutex>
 #include <boost/container/small_vector.hpp>
 #include "common/div_ceil.h"
+#include "common/lru_cache.h"
 #include "common/slot_vector.h"
 #include "common/types.h"
 #include "video_core/buffer_cache/buffer.h"
@@ -44,6 +44,11 @@ public:
     static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
     static constexpr u64 FAULT_BUFFER_SIZE = CACHING_NUMPAGES / 8; // Bit per page
 
+    // Default values for garbage collection
+    static constexpr s64 DEFAULT_TRIGGER_GC_MEMORY = 1_GB;
+    static constexpr s64 DEFAULT_CRITICAL_GC_MEMORY = 2_GB;
+    static constexpr s64 TARGET_GC_THRESHOLD = 8_GB;
+
     struct PageData {
         BufferId buffer_id{};
     };
@@ -162,6 +167,9 @@ public:
     /// Record memory barrier. Used for buffers when accessed via BDA.
     void MemoryBarrier();
 
+    /// Runs the garbage collector.
+    void RunGarbageCollector();
+
 private:
     template <typename Func>
     void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) {
@@ -176,6 +184,7 @@ private:
         return !buffer_id || slot_buffers[buffer_id].is_deleted;
     }
 
+    template <bool async>
     void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write);
 
     [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@@ -203,6 +212,10 @@ private:
     void WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
+    void FillBuffer(Buffer& buffer, VAddr address, u32 num_bytes, u32 value);
+
+    void TouchBuffer(const Buffer& buffer);
+    void DeleteBuffer(BufferId buffer_id);
 
     const Vulkan::Instance& instance;
@@ -220,6 +233,11 @@ private:
     Buffer fault_buffer;
     std::shared_mutex slot_buffers_mutex;
     Common::SlotVector<Buffer> slot_buffers;
+    u64 total_used_memory = 0;
+    u64 trigger_gc_memory = 0;
+    u64 critical_gc_memory = 0;
+    u64 gc_tick = 0;
+    Common::LeastRecentlyUsedCache<BufferId, u64> lru_cache;
     RangeSet gpu_modified_ranges;
     SplitRangeMap<BufferId> buffer_ranges;
     PageTable page_table;
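The new include pulls in Common::LeastRecentlyUsedCache, which this diff does not show. From the call sites above (Insert on register, Touch on use, Free on unregister, ForEachItemBelow in the collector), the interface it relies on looks roughly like the sketch below; this is inferred from usage, not the actual common/lru_cache.h:

    // Hypothetical interface implied by the call sites; see common/lru_cache.h
    // for the real definition.
    template <typename ObjectType, typename TickType>
    class LeastRecentlyUsedCache {
    public:
        // Start tracking obj at the given tick; the returned id is stored on
        // the Buffer (SetLRUId) and used for later Touch/Free calls.
        size_t Insert(ObjectType obj, TickType tick);

        // Mark the entry as used now: move it to the most-recently-used end
        // and record the new tick.
        void Touch(size_t id, TickType tick);

        // Stop tracking the entry (buffer unregistered or deleted).
        void Free(size_t id);

        // Visit entries from least to most recently used, stopping at the
        // first entry whose tick is >= tick; RunGarbageCollector uses this to
        // pick eviction candidates older than the cutoff.
        template <typename Func>
        void ForEachItemBelow(TickType tick, Func&& func);
    };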