Buffer syncing, faulted readback and BDA in Buffer

Lander Gallastegi 2025-04-17 21:37:30 +02:00
parent 83255ee68f
commit 94a078207f
5 changed files with 125 additions and 27 deletions

View File

@@ -70,8 +70,10 @@ UniqueBuffer::~UniqueBuffer() {
void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usage,
VmaAllocationInfo* out_alloc_info) {
const bool with_bda = bool(buffer_ci.usage & vk::BufferUsageFlagBits::eShaderDeviceAddress);
const VmaAllocationCreateFlags bda_flag = with_bda ? VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT : 0;
const VmaAllocationCreateInfo alloc_ci = {
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | bda_flag | MemoryUsageVmaFlags(usage),
.usage = MemoryUsageVma(usage),
.requiredFlags = 0,
.preferredFlags = MemoryUsagePreferredVmaFlags(usage),
@@ -86,6 +88,15 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa
ASSERT_MSG(result == VK_SUCCESS, "Failed allocating buffer with error {}",
vk::to_string(vk::Result{result}));
buffer = vk::Buffer{unsafe_buffer};
if (with_bda) {
vk::BufferDeviceAddressInfo bda_info{
.buffer = buffer,
};
auto bda_result = device.getBufferAddress(bda_info);
ASSERT_MSG(bda_result != 0, "Failed to get buffer device address");
bda_addr = bda_result;
}
}
Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,
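
For getBufferAddress to be valid, the buffer must carry eShaderDeviceAddress (as above) and its memory must be allocated with VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT. A minimal sketch of the device- and allocator-level prerequisites, assuming VMA is used as in this codebase; the actual setup lives outside this diff:

    // Enable the bufferDeviceAddress feature when creating the device (sketch only).
    vk::PhysicalDeviceBufferDeviceAddressFeatures bda_features{
        .bufferDeviceAddress = VK_TRUE,
    };
    // ... chain bda_features into vk::DeviceCreateInfo::pNext ...

    // Tell VMA as well, so its allocations get VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT.
    VmaAllocatorCreateInfo allocator_ci{};
    allocator_ci.flags |= VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT;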

View File

@@ -68,6 +68,7 @@ struct UniqueBuffer {
VmaAllocator allocator;
VmaAllocation allocation;
vk::Buffer buffer{};
vk::DeviceAddress bda_addr = 0;
};
class Buffer {
@@ -115,6 +116,11 @@ public:
return buffer;
}
vk::DeviceAddress BufferDeviceAddress() const noexcept {
ASSERT_MSG(buffer.bda_addr != 0, "Can't get BDA from a non-BDA buffer");
return buffer.bda_addr;
}
std::optional<vk::BufferMemoryBarrier2> GetBarrier(
vk::Flags<vk::AccessFlagBits2> dst_acess_mask, vk::PipelineStageFlagBits2 dst_stage,
u32 offset = 0) {
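
A sketch of how BufferDeviceAddress() is typically consumed, e.g. handing the page-table address to a shader as a buffer_reference pointer through push constants. The push-constant layout and the page-table accessor name below are illustrative assumptions, not part of this commit:

    struct DmaPushConstants {            // hypothetical layout
        vk::DeviceAddress bda_pagetable; // read in the shader via GL_EXT_buffer_reference
    };
    const DmaPushConstants pc{
        .bda_pagetable = buffer_cache.GetBdaPageTableBuffer().BufferDeviceAddress(),
    };
    cmdbuf.pushConstants(pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(pc), &pc);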

View File

@@ -27,6 +27,7 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, BDA_PAGETABLE_SIZE},
fault_readback_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_READBACK_SIZE},
memory_tracker{&tracker} {
Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
@@ -323,36 +324,36 @@ BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
return CreateBuffer(device_addr, size);
}
void BufferCache::QueueCoverage(VAddr device_addr, u64 size) {
void BufferCache::QueueMemoryImport(VAddr device_addr, u64 size) {
std::scoped_lock lk{mutex};
const u64 start = device_addr;
const u64 end = device_addr + size;
auto queue_range = decltype(covered_regions)::interval_type::right_open(start, end);
queued_coverage += queue_range;
auto queue_range = decltype(imported_regions)::interval_type::right_open(start, end);
queued_imports += queue_range;
}
void BufferCache::CoverQueuedRegions() {
void BufferCache::ImportQueuedRegions() {
std::scoped_lock lk{mutex};
if (queued_coverage.empty()) {
if (queued_imports.empty()) {
return;
}
for (const auto& range : queued_coverage) {
CoverMemory(range.lower(), range.upper());
for (const auto& range : queued_imports) {
ImportMemory(range.lower(), range.upper());
}
queued_coverage.clear();
queued_imports.clear();
}
void BufferCache::CoverMemory(u64 start, u64 end) {
void BufferCache::ImportMemory(u64 start, u64 end) {
const u64 page_start = start >> CACHING_PAGEBITS;
const u64 page_end = Common::DivCeil(end, CACHING_PAGESIZE);
auto interval = decltype(covered_regions)::interval_type::right_open(page_start, page_end);
auto interval = decltype(imported_regions)::interval_type::right_open(page_start, page_end);
auto interval_set = boost::icl::interval_set<u64>{interval};
auto uncovered_ranges = interval_set - covered_regions;
auto uncovered_ranges = interval_set - imported_regions;
if (uncovered_ranges.empty()) {
return;
}
// We fill any holes within the given range
boost::container::small_vector<u64, 1024> bda_addrs;
boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
for (const auto& range : uncovered_ranges) {
// import host memory
const u64 range_start = range.lower();
@@ -371,19 +372,19 @@ void BufferCache::CoverMemory(u64 start, u64 end) {
bda_addrs.clear();
bda_addrs.reserve(range_pages);
for (u64 i = 0; i < range_pages; ++i) {
// TODO: we may want to mark the page as host imported
// to let the shader know so that it can notify us if it
// accesses the page, so we can create a GPU local buffer.
// Mark the page as host imported so the shader knows to notify us
// when it accesses the page, letting us create a GPU-local buffer.
bda_addrs.push_back((bda_addr + (i << CACHING_PAGEBITS)) | 0x1);
}
WriteDataBuffer(bda_pagetable_buffer, range_start * sizeof(u64), bda_addrs.data(),
bda_addrs.size() * sizeof(u64));
WriteDataBuffer(bda_pagetable_buffer, range_start * sizeof(vk::DeviceAddress), bda_addrs.data(),
bda_addrs.size() * sizeof(vk::DeviceAddress));
{
std::scoped_lock lk{mutex};
imported_buffers.emplace_back(std::move(buffer));
}
// Mark the pages as imported
covered_regions += range;
imported_regions += range;
}
}
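
Bit 0 of every entry written above doubles as the host-imported flag, so a consumer has to mask it off before forming a pointer. A host-side sketch of the translation, assuming the same constants (the shader-side reader is not part of this diff):

    vk::DeviceAddress TranslateToDeviceAddress(std::span<const vk::DeviceAddress> table, VAddr addr) {
        const auto entry = table[addr >> BufferCache::CACHING_PAGEBITS];
        // Bit 0 is set by ImportMemory for host-imported pages and left clear by
        // CreateBuffer for GPU-local pages; strip it before adding the page offset.
        const auto page_base = entry & ~vk::DeviceAddress{1};
        return page_base + (addr & (BufferCache::CACHING_PAGESIZE - 1));
    }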
@@ -525,9 +526,25 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
const BufferId new_buffer_id = [&] {
std::scoped_lock lk{mutex};
return slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin,
AllFlags, size);
AllFlags | vk::BufferUsageFlagBits::eShaderDeviceAddress, size);
}();
auto& new_buffer = slot_buffers[new_buffer_id];
boost::container::small_vector<vk::DeviceAddress, 128> bda_addrs;
const u64 start_page = overlap.begin >> CACHING_PAGEBITS;
const u64 size_pages = size >> CACHING_PAGEBITS;
bda_addrs.reserve(size_pages);
for (u64 i = 0; i < size_pages; ++i) {
// Here, we do not set the host imported bit.
bda_addrs.push_back(new_buffer.BufferDeviceAddress() + (i << CACHING_PAGEBITS));
}
WriteDataBuffer(bda_pagetable_buffer, start_page * sizeof(vk::DeviceAddress), bda_addrs.data(),
bda_addrs.size() * sizeof(vk::DeviceAddress));
{
// Mark the pages as imported
std::scoped_lock lk{mutex};
imported_regions += boost::icl::interval_set<u64>::interval_type::right_open(
start_page, start_page + size_pages);
}
const size_t size_bytes = new_buffer.SizeBytes();
const auto cmdbuf = scheduler.CommandBuffer();
scheduler.EndRendering();
@@ -539,6 +556,44 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
return new_buffer_id;
}
void BufferCache::CreateFaultBuffers() {
// Download the fault readback buffer
const auto [mapped, offset] = staging_buffer.Map(FAULT_READBACK_SIZE);
vk::BufferCopy copy = {
.srcOffset = 0,
.dstOffset = offset,
.size = FAULT_READBACK_SIZE,
};
staging_buffer.Commit();
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyBuffer(fault_readback_buffer.buffer, staging_buffer.Handle(), copy);
scheduler.Finish();
std::array<u8, FAULT_READBACK_SIZE> buffer{};
std::memcpy(buffer.data(), mapped, FAULT_READBACK_SIZE);
// Reset the fault readback buffer
cmdbuf.fillBuffer(fault_readback_buffer.buffer, 0, FAULT_READBACK_SIZE, 0);
// Create the fault buffers in batches
boost::icl::interval_set<VAddr> fault_ranges;
for (u64 i = 0; i < FAULT_READBACK_SIZE; ++i) {
if (buffer[i] != 0) {
// Each byte contains information for 8 pages.
// We are going to create an aligned buffer of
// 8 * 64 KB = 512 KB around the fault address.
const VAddr fault_addr = (i * 8) << CACHING_PAGEBITS;
const VAddr fault_end = ((i + 1) * 8) << CACHING_PAGEBITS;
auto range = decltype(fault_ranges)::interval_type::right_open(
fault_addr, fault_end);
fault_ranges += range;
}
}
for (const auto& range : fault_ranges) {
const VAddr start = range.lower();
const u32 size = range.upper() - start;
CreateBuffer(start, size);
}
}
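
Since the readback buffer stores one bit per caching page, the fault ranges could also be built per bit instead of per byte. A finer-grained sketch under the same constants (the commit deliberately widens each non-zero byte to an aligned 8-page range instead):

    for (u64 byte = 0; byte < FAULT_READBACK_SIZE; ++byte) {
        if (buffer[byte] == 0) {
            continue;
        }
        for (u32 bit = 0; bit < 8; ++bit) {
            if ((buffer[byte] & (1u << bit)) == 0) {
                continue;
            }
            const VAddr fault_addr = VAddr(byte * 8 + bit) << CACHING_PAGEBITS; // one bit per page
            fault_ranges += decltype(fault_ranges)::interval_type::right_open(
                fault_addr, fault_addr + CACHING_PAGESIZE);
        }
    }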
void BufferCache::Register(BufferId buffer_id) {
ChangeRegister<true>(buffer_id);
}
@@ -740,6 +795,18 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
return true;
}
void BufferCache::SynchronizeRange(VAddr device_addr, u32 size) {
if (device_addr == 0) {
return;
}
ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
if (buffer.is_deleted) {
return;
}
SynchronizeBuffer(buffer, buffer.CpuAddr(), buffer.SizeBytes(), false);
});
}
void BufferCache::InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes) {
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();

View File

@@ -43,7 +43,9 @@ public:
static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
static constexpr u64 DEVICE_PAGESIZE = 64_KB;
static constexpr u64 CACHING_NUMPAGES = u64{1} << (40 - CACHING_PAGEBITS);
static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(u64);
static constexpr u64 BDA_PAGETABLE_SIZE = CACHING_NUMPAGES * sizeof(vk::DeviceAddress);
static constexpr u64 FAULT_READBACK_SIZE = CACHING_NUMPAGES / 8; // One bit per page
struct Traits {
using Entry = BufferId;
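
As a quick size check on the two new constants: each caching page costs one 8-byte device address in the page table and one bit in the fault bitmap. The concrete figures below assume 64 KiB caching pages (CACHING_PAGEBITS == 16, the value implied by the "8 * 64 KB" comment in CreateFaultBuffers):

    static_assert(BDA_PAGETABLE_SIZE == CACHING_NUMPAGES * 8);  // one vk::DeviceAddress per page
    static_assert(FAULT_READBACK_SIZE * 8 == CACHING_NUMPAGES); // one fault bit per page
    // With 2^(40 - 16) = 16,777,216 pages: a 128 MiB page table and a 2 MiB fault bitmap.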
@@ -81,6 +83,11 @@ public:
return bda_pagetable_buffer;
}
/// Retrieves the fault readback buffer.
[[nodiscard]] Buffer& GetFaultReadbackBuffer() noexcept {
return fault_readback_buffer;
}
/// Retrieves the buffer with the specified id.
[[nodiscard]] Buffer& GetBuffer(BufferId id) {
return slot_buffers[id];
@@ -123,10 +130,16 @@ public:
[[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size);
/// Queues a region for memory import for DMA.
void QueueCoverage(VAddr device_addr, u64 size);
void QueueMemoryImport(VAddr device_addr, u64 size);
/// Imports all queued regions.
void CoverQueuedRegions();
void ImportQueuedRegions();
/// Creates buffers for "faulted" shader accesses to host memory.
void CreateFaultBuffers();
/// Synchronizes all buffers in the specified range.
void SynchronizeRange(VAddr device_addr, u32 size);
private:
template <typename Func>
@@ -171,7 +184,7 @@ private:
void DeleteBuffer(BufferId buffer_id);
void CoverMemory(u64 start, u64 end);
void ImportMemory(u64 start, u64 end);
const Vulkan::Instance& instance;
Vulkan::Scheduler& scheduler;
@@ -183,8 +196,9 @@ private:
StreamBuffer stream_buffer;
Buffer gds_buffer;
Buffer bda_pagetable_buffer;
boost::icl::interval_set<VAddr> queued_coverage;
boost::icl::interval_set<u64> covered_regions;
Buffer fault_readback_buffer;
boost::icl::interval_set<VAddr> queued_imports;
boost::icl::interval_set<u64> imported_regions;
std::vector<ImportedHostBuffer> imported_buffers;
std::shared_mutex mutex;
Common::SlotVector<Buffer> slot_buffers;

View File

@@ -946,7 +946,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) {
mapped_ranges += decltype(mapped_ranges)::interval_type::right_open(addr, addr + size);
}
page_manager.OnGpuMap(addr, size);
buffer_cache.QueueCoverage(addr, size);
buffer_cache.QueueMemoryImport(addr, size);
}
void Rasterizer::UnmapMemory(VAddr addr, u64 size) {
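
Taken together, the intended flow reads roughly as below. The call sites for ImportQueuedRegions() and CreateFaultBuffers() are not part of this diff, so the hook points are assumptions:

    buffer_cache.QueueMemoryImport(addr, size); // Rasterizer::MapMemory, shown above
    buffer_cache.ImportQueuedRegions();         // e.g. before submitting GPU work that may use DMA
    // ... GPU work runs; shaders flag untracked pages in the fault readback buffer ...
    buffer_cache.CreateFaultBuffers();          // e.g. once the work completes, to back the
                                                // faulted pages with GPU-local buffers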