video_core: Better handling of image copies with DmaData (#3672)

TheTurtle
2025-09-30 15:51:30 +03:00
committed by GitHub
parent a35c9f3586
commit ad99bda08d
4 changed files with 55 additions and 93 deletions

View File

@@ -124,6 +124,42 @@ Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
     is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
 }
 
+void Buffer::Fill(u64 offset, u32 num_bytes, u32 value) {
+    scheduler->EndRendering();
+    ASSERT_MSG(offset % 4 == 0 && num_bytes % 4 == 0,
+               "Fill offset and size must be multiples of 4 bytes");
+    const auto cmdbuf = scheduler->CommandBuffer();
+    const vk::BufferMemoryBarrier2 pre_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .buffer = buffer,
+        .offset = offset,
+        .size = num_bytes,
+    };
+    const vk::BufferMemoryBarrier2 post_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
+        .buffer = buffer,
+        .offset = offset,
+        .size = num_bytes,
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &pre_barrier,
+    });
+    cmdbuf.fillBuffer(buffer, offset, num_bytes, value);
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &post_barrier,
+    });
+}
+
 constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
 constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
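
The new Buffer::Fill wraps vkCmdFillBuffer in a pre/post barrier pair: the pre-barrier orders the transfer write against all prior reads, and the post-barrier makes the filled bytes visible to every later stage, so callers need no external synchronization. The dword-alignment assert mirrors the Vulkan requirement that fillBuffer offsets and sizes be multiples of 4. A minimal call-site sketch (the buffer object and byte count are hypothetical, not from this commit):

    // Zero-initialize the first 256 bytes of a cached buffer; Fill records its
    // own barriers around the transfer, so it is safe to call between passes.
    buffer.Fill(/*offset=*/0, /*num_bytes=*/256, /*value=*/0);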

View File

@@ -83,29 +83,24 @@ public:
     Buffer& operator=(Buffer&&) = default;
     Buffer(Buffer&&) = default;
 
     /// Increases the likeliness of this being a stream buffer
     void IncreaseStreamScore(int score) noexcept {
         stream_score += score;
     }
 
     /// Returns the likeliness of this being a stream buffer
     [[nodiscard]] int StreamScore() const noexcept {
         return stream_score;
     }
 
     /// Returns true when vaddr -> vaddr+size is fully contained in the buffer
     [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept {
         return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes();
     }
 
     /// Returns the base CPU address of the buffer
     [[nodiscard]] VAddr CpuAddr() const noexcept {
         return cpu_addr;
     }
 
     /// Returns the offset relative to the given CPU address
-    [[nodiscard]] u32 Offset(VAddr other_cpu_addr) const noexcept {
-        return static_cast<u32>(other_cpu_addr - cpu_addr);
+    [[nodiscard]] u64 Offset(VAddr other_cpu_addr) const noexcept {
+        return other_cpu_addr - cpu_addr;
     }
 
     size_t SizeBytes() const {
@@ -129,16 +124,16 @@ public:
         return buffer.bda_addr;
     }
 
-    std::optional<vk::BufferMemoryBarrier2> GetBarrier(
-        vk::Flags<vk::AccessFlagBits2> dst_acess_mask, vk::PipelineStageFlagBits2 dst_stage,
-        u32 offset = 0) {
+    std::optional<vk::BufferMemoryBarrier2> GetBarrier(vk::AccessFlags2 dst_acess_mask,
+                                                       vk::PipelineStageFlagBits2 dst_stage,
+                                                       u32 offset = 0) {
         if (dst_acess_mask == access_mask && stage == dst_stage) {
             return {};
         }
         DEBUG_ASSERT(offset < size_bytes);
-        auto barrier = vk::BufferMemoryBarrier2{
+        const auto barrier = vk::BufferMemoryBarrier2{
             .srcStageMask = stage,
             .srcAccessMask = access_mask,
             .dstStageMask = dst_stage,
@@ -152,6 +147,8 @@ public:
         return barrier;
     }
 
+    void Fill(u64 offset, u32 num_bytes, u32 value);
+
 public:
     VAddr cpu_addr = 0;
     bool is_picked{};
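
Note that vk::AccessFlags2 is Vulkan-Hpp's alias for vk::Flags<vk::AccessFlagBits2>, so the new GetBarrier signature is the conventional spelling of the same type rather than a behavior change. A sketch of how a caller might consume the optional result (this call site is illustrative, not part of the commit):

    // Transition the buffer for transfer reads; GetBarrier returns std::nullopt
    // when the tracked access mask and stage already match, skipping the barrier.
    if (const auto barrier = buffer.GetBarrier(vk::AccessFlagBits2::eTransferRead,
                                               vk::PipelineStageFlagBits2::eTransfer)) {
        cmdbuf.pipelineBarrier2(vk::DependencyInfo{
            .bufferMemoryBarrierCount = 1,
            .pBufferMemoryBarriers = &*barrier,
        });
    }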

View File

@@ -364,53 +364,28 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
         InlineDataBuffer(*buffer, address, value, num_bytes);
 }
 
-void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
-    ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
-    if (!is_gds && !IsRegionRegistered(address, num_bytes)) {
-        memcpy(std::bit_cast<void*>(address), value, num_bytes);
-        return;
-    }
-    Buffer* buffer = [&] {
-        if (is_gds) {
-            return &gds_buffer;
-        }
-        const BufferId buffer_id = FindBuffer(address, num_bytes);
-        return &slot_buffers[buffer_id];
-    }();
-    WriteDataBuffer(*buffer, address, value, num_bytes);
-}
-
 void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
     if (!dst_gds && !IsRegionGpuModified(dst, num_bytes)) {
         if (!src_gds && !IsRegionGpuModified(src, num_bytes)) {
             // Both buffers were not transferred to GPU yet. Can safely copy in host memory.
             memcpy(std::bit_cast<void*>(dst), std::bit_cast<void*>(src), num_bytes);
             return;
         }
+        // Without a readback there's nothing we can do with this.
+        // Fall back to creating the dst buffer on the GPU to at least have the data there.
     }
+    texture_cache.InvalidateMemoryFromGPU(dst, num_bytes);
     auto& src_buffer = [&] -> const Buffer& {
         if (src_gds) {
             return gds_buffer;
         }
         // Avoid using ObtainBuffer here as that might give us the stream buffer.
-        const BufferId buffer_id = FindBuffer(src, num_bytes);
+        const auto buffer_id = FindBuffer(src, num_bytes);
         auto& buffer = slot_buffers[buffer_id];
-        if (SynchronizeBuffer(buffer, src, num_bytes, false, true)) {
-            texture_cache.InvalidateMemoryFromGPU(dst, num_bytes);
-        }
+        SynchronizeBuffer(buffer, src, num_bytes, false, true);
         return buffer;
     }();
     auto& dst_buffer = [&] -> const Buffer& {
         if (dst_gds) {
             return gds_buffer;
         }
-        // Prefer using ObtainBuffer here as that will auto-mark the region as GPU modified.
-        const auto [buffer, offset] = ObtainBuffer(dst, num_bytes, true);
-        return *buffer;
+        const auto buffer_id = FindBuffer(dst, num_bytes);
+        auto& buffer = slot_buffers[buffer_id];
+        SynchronizeBuffer(buffer, dst, num_bytes, true, true);
+        gpu_modified_ranges.Add(dst, num_bytes);
+        return buffer;
     }();
-    vk::BufferCopy region{
+    const vk::BufferCopy region = {
         .srcOffset = src_buffer.Offset(src),
         .dstOffset = dst_buffer.Offset(dst),
         .size = num_bytes,
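
The hunk is truncated inside the BufferCopy initializer, so the recorded copy itself is not shown. Presumably the function continues along these lines (a sketch of the likely continuation, not the verbatim source):

    const auto cmdbuf = scheduler.CommandBuffer();
    scheduler.EndRendering();
    cmdbuf.copyBuffer(src_buffer.buffer, dst_buffer.buffer, region);
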
@@ -680,8 +655,6 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     auto& new_buffer = slot_buffers[new_buffer_id];
     const size_t size_bytes = new_buffer.SizeBytes();
     const auto cmdbuf = scheduler.CommandBuffer();
-    scheduler.EndRendering();
-    cmdbuf.fillBuffer(new_buffer.buffer, 0, size_bytes, 0);
     for (const BufferId overlap_id : overlap.ids) {
         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
     }
@@ -851,8 +824,8 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
     } else {
         total_used_memory -= Common::AlignUp(size, CACHING_PAGESIZE);
         lru_cache.Free(buffer.LRUId());
-        FillBuffer(bda_pagetable_buffer, page_begin * sizeof(vk::DeviceAddress),
-                   size_pages * sizeof(vk::DeviceAddress), 0);
+        const u64 offset = bda_pagetable_buffer.Offset(page_begin * sizeof(vk::DeviceAddress));
+        bda_pagetable_buffer.Fill(offset, size_pages * sizeof(vk::DeviceAddress), 0);
         buffer_ranges.Subtract(buffer.CpuAddr(), buffer.SizeBytes());
     }
 }
@@ -1004,10 +977,6 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) {
 }
 
 void BufferCache::MemoryBarrier() {
-    // Vulkan doesn't know which buffer we access in a shader if we use
-    // BufferDeviceAddress. We need a full memory barrier.
-    // For now, we only read memory using BDA. If we want to write to it,
-    // we might need to change this.
     scheduler.EndRendering();
     const auto cmdbuf = scheduler.CommandBuffer();
     vk::MemoryBarrier2 barrier = {
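
The deleted comment still explains why the barrier below is so broad: with buffer device address, Vulkan cannot know which buffers a shader dereferences, so only a full memory barrier is safe. The initializer is cut off above; a full barrier of this shape typically reads (field values are an illustration, not the verbatim source):

    vk::MemoryBarrier2 barrier = {
        .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
        .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
    };
    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
        .memoryBarrierCount = 1,
        .pMemoryBarriers = &barrier,
    });
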
@@ -1121,41 +1090,6 @@ void BufferCache::WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes) {
     });
 }
 
-void BufferCache::FillBuffer(Buffer& buffer, VAddr address, u32 num_bytes, u32 value) {
-    scheduler.EndRendering();
-    ASSERT_MSG(num_bytes % 4 == 0, "FillBuffer size must be a multiple of 4 bytes");
-    const auto cmdbuf = scheduler.CommandBuffer();
-    const vk::BufferMemoryBarrier2 pre_barrier = {
-        .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
-        .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
-        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
-        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
-        .buffer = buffer.Handle(),
-        .offset = buffer.Offset(address),
-        .size = num_bytes,
-    };
-    const vk::BufferMemoryBarrier2 post_barrier = {
-        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
-        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
-        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
-        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
-        .buffer = buffer.Handle(),
-        .offset = buffer.Offset(address),
-        .size = num_bytes,
-    };
-    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
-        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
-        .bufferMemoryBarrierCount = 1,
-        .pBufferMemoryBarriers = &pre_barrier,
-    });
-    cmdbuf.fillBuffer(buffer.Handle(), buffer.Offset(address), num_bytes, value);
-    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
-        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
-        .bufferMemoryBarrierCount = 1,
-        .pBufferMemoryBarriers = &post_barrier,
-    });
-}
-
 void BufferCache::RunGarbageCollector() {
     SCOPE_EXIT {
         ++gc_tick;
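
With the barrier logic now owned by Buffer, the cache-level FillBuffer helper above is deleted outright. A call site migrates by converting the virtual address to a buffer-relative offset first, exactly as the ChangeRegister hunk does; schematically (the address and size here are placeholders):

    // old: FillBuffer(buffer, address, num_bytes, 0);
    buffer.Fill(buffer.Offset(address), num_bytes, 0);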

View File

@@ -128,9 +128,6 @@ public:
     /// Writes a value to GPU buffer. (uses command buffer to temporarily store the data)
     void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
 
-    /// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data)
-    void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
-
     /// Performs buffer to buffer data copy on the GPU.
     void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
@@ -211,8 +208,6 @@ private:
     void WriteDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
-    void FillBuffer(Buffer& buffer, VAddr address, u32 num_bytes, u32 value);
     void TouchBuffer(const Buffer& buffer);
     void DeleteBuffer(BufferId buffer_id);
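
The DmaData handler that the commit title refers to sits outside this diff. Its shape is presumably a thin dispatch into the API above (the function name and plumbing are assumptions for illustration only):

    // Hypothetical caller: a DmaData packet resolves to a GPU buffer copy, with
    // the GDS flags selecting between the GDS buffer and guest memory ranges.
    void OnDmaData(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
        buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds);
    }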