shader_recompiler: Optimize general case of buffer addressing (#3159)

* shader_recompiler: Simplify dma types

Only U32 is needed for S_LOAD_DWORD

* shader_recompiler: Perform address shift on IR level

Buffer instructions now expect address in the data unit they work on. Doing the shift on IR level will allow us to optimize some operations away on common case

* shader_recompiler: Optimize common buffer access pattern

* emit_spirv: Use 32-bit integer ops for fault buffer

Not many GPUs have 8-bit bitwise or operations so that would probably require some overhead to emulate from the driver

* resource_tracking_pass: Fix texel buffer shift
This commit is contained in:
TheTurtle
2025-06-26 12:14:36 +03:00
committed by GitHub
parent 6eaec7a004
commit a49b13fe66
12 changed files with 271 additions and 233 deletions

View File

@@ -225,6 +225,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
instance.GetDriverID() == vk::DriverId::eNvidiaProprietary,
.needs_lds_barriers = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary ||
instance.GetDriverID() == vk::DriverId::eMoltenvk,
.needs_buffer_offsets = instance.StorageMinAlignment() > 4,
// When binding a UBO, we calculate its size considering the offset in the larger buffer
// cache underlying resource. In some cases, it may produce sizes exceeding the system
// maximum allowed UBO range, so we need to reduce the threshold to prevent issues.

View File

@@ -468,17 +468,12 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
stage->PushUd(binding, push_data);
BindBuffers(*stage, binding, push_data);
BindTextures(*stage, binding);
uses_dma |= stage->dma_types != Shader::IR::Type::Void;
uses_dma |= stage->uses_dma;
}
pipeline->BindResources(set_writes, buffer_barriers, push_data);
if (uses_dma && !fault_process_pending) {
// We only use fault buffer for DMA right now.
{
// TODO: GPU might have written to memory (for example with EVENT_WRITE_EOP)
// we need to account for that and synchronize.
Common::RecursiveSharedLock lock{mapped_ranges_mutex};
for (auto& range : mapped_ranges) {
buffer_cache.SynchronizeBuffersInRange(range.lower(),
@@ -490,6 +485,8 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
fault_process_pending |= uses_dma;
pipeline->BindResources(set_writes, buffer_barriers, push_data);
return true;
}