From b05e6577d198e80582f2beb7b1a42e52b68251cc Mon Sep 17 00:00:00 2001
From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com>
Date: Fri, 14 Feb 2025 04:21:47 +0200
Subject: [PATCH] shader_recompiler: Reintroduce shared memory on ssbo
 emulation

* The emulation is now performed with an IR pass and, combined with the
  cleanup from the previous commit, is fully transparent to the backend,
  other than requiring workgroup_index to be provided as an attribute
  (computing it on every shared memory access would be too verbose).
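For reference, the flattened index that DefineWorkgroupIndex emits below is
equivalent to this sketch (illustrative C++, not part of the patch):

    #include <cstdint>

    // 1D index of a workgroup in the dispatch grid, built from the
    // WorkgroupId (id) and NumWorkgroups (num) builtins:
    //   index = x + y * Nx + z * (Nx * Ny)
    constexpr uint32_t WorkgroupIndex(const uint32_t id[3], const uint32_t num[3]) {
        return id[0] + id[1] * num[0] + id[2] * (num[0] * num[1]);
    }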
---
 CMakeLists.txt                                |   1 +
 .../spirv/emit_spirv_context_get_set.cpp      |   4 +-
 .../backend/spirv/emit_spirv_special.cpp      |   3 +
 .../backend/spirv/spirv_emit_context.cpp      |  42 +++++--
 .../backend/spirv/spirv_emit_context.h        |   5 +-
 .../frontend/translate/data_share.cpp         |  16 +--
 src/shader_recompiler/ir/attribute.h          |  21 ++--
 src/shader_recompiler/ir/passes/ir_passes.h   |   5 +-
 .../ir/passes/shader_info_collection_pass.cpp |   3 +-
 .../ir/passes/shared_memory_barrier_pass.cpp  |  67 +++++++----
 .../passes/shared_memory_to_storage_pass.cpp  | 109 ++++++++++++++++++
 src/shader_recompiler/recompiler.cpp          |   3 +-
 src/video_core/amdgpu/liverpool.h             |   4 +
 src/video_core/buffer_cache/buffer.h          |   2 +-
 src/video_core/buffer_cache/buffer_cache.cpp  |  15 +--
 src/video_core/buffer_cache/buffer_cache.h    |   9 +-
 .../renderer_vulkan/vk_rasterizer.cpp         |  15 ++-
 17 files changed, 242 insertions(+), 82 deletions(-)
 create mode 100644 src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 640bb86b5..9cac88f3d 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -758,6 +758,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/ir/passes/ring_access_elimination.cpp
     src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+    src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
     src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
     src/shader_recompiler/ir/abstract_syntax_list.h
     src/shader_recompiler/ir/attribute.cpp
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 798dfe8c8..04ac28d24 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -186,7 +186,7 @@ Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
                                      rate_idx == 0 ? ctx.u32_zero_value : ctx.u32_one_value));
 }
 
-Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
+static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
     if (IR::IsPosition(attr)) {
         ASSERT(attr == IR::Attribute::Position0);
         const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]);
@@ -287,6 +287,8 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
         return EmitReadStepRate(ctx, 0);
     case IR::Attribute::InstanceId1:
         return EmitReadStepRate(ctx, 1);
+    case IR::Attribute::WorkgroupIndex:
+        return ctx.workgroup_index_id;
     case IR::Attribute::WorkgroupId:
         return ctx.OpCompositeExtract(ctx.U32[1], ctx.OpLoad(ctx.U32[3], ctx.workgroup_id), comp);
     case IR::Attribute::LocalInvocationId:
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
index a0a3ed8ff..724550cd6 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
@@ -11,6 +11,9 @@ void EmitPrologue(EmitContext& ctx) {
     if (ctx.stage == Stage::Fragment) {
         ctx.DefineInterpolatedAttribs();
     }
+    if (ctx.info.loads.Get(IR::Attribute::WorkgroupIndex)) {
+        ctx.DefineWorkgroupIndex();
+    }
     ctx.DefineBufferOffsets();
 }
 
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 8551b83ff..50d846e45 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -215,8 +215,7 @@ void EmitContext::DefineInterpolatedAttribs() {
     if (!profile.needs_manual_interpolation) {
         return;
     }
-    // Iterate all input attributes, load them and manually interpolate with barycentric
-    // coordinates.
+    // Iterate all input attributes, load them and manually interpolate.
     for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) {
         const auto& input = runtime_info.fs_info.inputs[i];
         const u32 semantic = input.param_index;
@@ -241,6 +240,19 @@ void EmitContext::DefineInterpolatedAttribs() {
     }
 }
 
+void EmitContext::DefineWorkgroupIndex() {
+    const Id workgroup_id_val{OpLoad(U32[3], workgroup_id)};
+    const Id workgroup_x{OpCompositeExtract(U32[1], workgroup_id_val, 0)};
+    const Id workgroup_y{OpCompositeExtract(U32[1], workgroup_id_val, 1)};
+    const Id workgroup_z{OpCompositeExtract(U32[1], workgroup_id_val, 2)};
+    const Id num_workgroups{OpLoad(U32[3], num_workgroups_id)};
+    const Id num_workgroups_x{OpCompositeExtract(U32[1], num_workgroups, 0)};
+    const Id num_workgroups_y{OpCompositeExtract(U32[1], num_workgroups, 1)};
+    workgroup_index_id =
+        OpIAdd(U32[1], OpIAdd(U32[1], workgroup_x, OpIMul(U32[1], workgroup_y, num_workgroups_x)),
+               OpIMul(U32[1], workgroup_z, OpIMul(U32[1], num_workgroups_x, num_workgroups_y)));
+    Name(workgroup_index_id, "workgroup_index");
+}
+
 Id MakeDefaultValue(EmitContext& ctx, u32 default_value) {
     switch (default_value) {
     case 0:
@@ -309,9 +321,15 @@ void EmitContext::DefineInputs() {
         break;
     }
     case LogicalStage::Fragment:
-        frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
-        frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output);
-        front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input);
+        if (info.loads.GetAny(IR::Attribute::FragCoord)) {
+            frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
+        }
+        if (info.stores.Get(IR::Attribute::Depth)) {
+            frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output);
+        }
+        if (info.loads.Get(IR::Attribute::IsFrontFace)) {
+            front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input);
+        }
         if (profile.needs_manual_interpolation) {
             gl_bary_coord_id =
                 DefineVariable(F32[3], spv::BuiltIn::BaryCoordKHR, spv::StorageClass::Input);
@@ -346,9 +364,16 @@ void EmitContext::DefineInputs() {
         }
         break;
     case LogicalStage::Compute:
-        workgroup_id = DefineVariable(U32[3], spv::BuiltIn::WorkgroupId, spv::StorageClass::Input);
-        local_invocation_id =
-            DefineVariable(U32[3], spv::BuiltIn::LocalInvocationId, spv::StorageClass::Input);
+        if (info.loads.GetAny(IR::Attribute::WorkgroupIndex) ||
+            info.loads.GetAny(IR::Attribute::WorkgroupId)) {
+            workgroup_id =
+                DefineVariable(U32[3], spv::BuiltIn::WorkgroupId, spv::StorageClass::Input);
+        }
+        if (info.loads.GetAny(IR::Attribute::WorkgroupIndex)) {
+            num_workgroups_id =
+                DefineVariable(U32[3], spv::BuiltIn::NumWorkgroups, spv::StorageClass::Input);
+        }
+        if (info.loads.GetAny(IR::Attribute::LocalInvocationId)) {
+            local_invocation_id =
+                DefineVariable(U32[3], spv::BuiltIn::LocalInvocationId, spv::StorageClass::Input);
+        }
         break;
     case LogicalStage::Geometry: {
         primitive_id = DefineVariable(U32[1], spv::BuiltIn::PrimitiveId, spv::StorageClass::Input);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 56f3f1656..29d883bcf 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -45,6 +45,7 @@ public:
 
     void DefineBufferOffsets();
     void DefineInterpolatedAttribs();
+    void DefineWorkgroupIndex();
 
     [[nodiscard]] Id DefineInput(Id type, std::optional<u32> location = std::nullopt,
                                  std::optional<spv::BuiltIn> builtin = std::nullopt) {
@@ -200,8 +201,10 @@ public:
     std::array patches{};
 
     Id workgroup_id{};
+    Id num_workgroups_id{};
+    Id workgroup_index_id{};
     Id local_invocation_id{};
-    Id invocation_id{}; // for instanced geoshaders or output vertices within TCS patch
+    Id invocation_id{};
     Id subgroup_local_invocation_id{};
     Id image_u32{};
diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index 5904122e2..36ed9eaad 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -176,8 +176,8 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     const IR::VectorReg data0{inst.src[1].code};
     const IR::VectorReg data1{inst.src[2].code};
+    const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
     if (info.stage == Stage::Fragment) {
-        const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
         ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
                    "Unexpected shared memory offset alignment: {}", offset);
         ir.SetVectorReg(GetScratchVgpr(offset), ir.GetVectorReg(data0));
         return;
@@ -201,14 +201,12 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
                            addr1);
         }
     } else if (bit_size == 64) {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::Value data =
             ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
         ir.WriteShared(bit_size, data, addr0);
     } else {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
     }
 }
@@ -229,8 +227,8 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride
                          const GcnInst& inst) {
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     IR::VectorReg dst_reg{inst.dst[0].code};
+    const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
     if (info.stage == Stage::Fragment) {
-        const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
         ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
                    "Unexpected shared memory offset alignment: {}", offset);
         ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset)));
         return;
@@ -255,14 +253,12 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride
             ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)});
         }
     } else if (bit_size == 64) {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
        const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
         ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)});
         ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)});
     } else {
-        const IR::U32 addr0 = ir.IAdd(
-            addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)};
         ir.SetVectorReg(dst_reg, data);
     }
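Note (not part of the patch): the hoisted `offset` above is the GCN DS
immediate, assembled from the instruction's two 8-bit offset fields. A
standalone sketch of the assembly, assuming the field layout used by GcnInst:

    #include <cstdint>

    // DS instructions carry two 8-bit offsets; single-address forms combine
    // them into one 16-bit byte offset, with offset1 as the high byte.
    constexpr uint32_t DsOffset(uint32_t offset0, uint32_t offset1) {
        return (offset1 << 8u) + offset0;
    }
    static_assert(DsOffset(0x34, 0x12) == 0x1234);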
diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h
index bcb2b44a9..5117f5650 100644
--- a/src/shader_recompiler/ir/attribute.h
+++ b/src/shader_recompiler/ir/attribute.h
@@ -69,16 +69,17 @@ enum class Attribute : u64 {
     SampleIndex = 72,
     GlobalInvocationId = 73,
     WorkgroupId = 74,
-    LocalInvocationId = 75,
-    LocalInvocationIndex = 76,
-    FragCoord = 77,
-    InstanceId0 = 78,  // step rate 0
-    InstanceId1 = 79,  // step rate 1
-    InvocationId = 80, // TCS id in output patch and instanced geometry shader id
-    PatchVertices = 81,
-    TessellationEvaluationPointU = 82,
-    TessellationEvaluationPointV = 83,
-    PackedHullInvocationInfo = 84, // contains patch id within the VGT and invocation ID
+    WorkgroupIndex = 75,
+    LocalInvocationId = 76,
+    LocalInvocationIndex = 77,
+    FragCoord = 78,
+    InstanceId0 = 79,  // step rate 0
+    InstanceId1 = 80,  // step rate 1
+    InvocationId = 81, // TCS id in output patch and instanced geometry shader id
+    PatchVertices = 82,
+    TessellationEvaluationPointU = 83,
+    TessellationEvaluationPointV = 84,
+    PackedHullInvocationInfo = 85, // contains patch id within the VGT and invocation ID
     Max,
 };
diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h
index e64622405..69628dbfd 100644
--- a/src/shader_recompiler/ir/passes/ir_passes.h
+++ b/src/shader_recompiler/ir/passes/ir_passes.h
@@ -25,6 +25,9 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim
 void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);
 void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
-void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile);
+void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
+                             const Profile& profile);
+void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
+                               const Profile& profile);
 
 } // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index 0b0a62f6a..219378a6c 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -95,10 +95,9 @@ void Visit(Info& info, const IR::Inst& inst) {
 }
 
 void CollectShaderInfoPass(IR::Program& program) {
-    Info& info{program.info};
     for (IR::Block* const block : program.post_order_blocks) {
         for (IR::Inst& inst : block->Instructions()) {
-            Visit(info, inst);
+            Visit(program.info, inst);
         }
     }
 }
diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
index ec7d7e986..3a83c601e 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
@@ -8,37 +8,46 @@
 
 namespace Shader::Optimization {
 
+static bool IsLoadShared(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
+           inst.GetOpcode() == IR::Opcode::LoadSharedU64;
+}
+
+static bool IsWriteShared(const IR::Inst& inst) {
+    return inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
+           inst.GetOpcode() == IR::Opcode::WriteSharedU64;
+}
+
+// Inserts barriers when a shared memory write and read occur in the same basic block.
 static void EmitBarrierInBlock(IR::Block* block) {
-    // This is inteded to insert a barrier when shared memory write and read
-    // occur in the same basic block. Also checks if branch depth is zero as
-    // we don't want to insert barrier in potentially divergent code.
-    bool emit_barrier_on_write = false;
-    bool emit_barrier_on_read = false;
-    const auto emit_barrier = [block](bool& emit_cond, IR::Inst& inst) {
-        if (emit_cond) {
-            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-            ir.Barrier();
-            emit_cond = false;
-        }
+    enum class BarrierAction : u32 {
+        None,
+        BarrierOnWrite,
+        BarrierOnRead,
     };
+    BarrierAction action{};
     for (IR::Inst& inst : block->Instructions()) {
-        if (inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
-            inst.GetOpcode() == IR::Opcode::LoadSharedU64) {
-            emit_barrier(emit_barrier_on_read, inst);
-            emit_barrier_on_write = true;
+        if (IsLoadShared(inst)) {
+            if (action == BarrierAction::BarrierOnRead) {
+                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+                ir.Barrier();
+            }
+            action = BarrierAction::BarrierOnWrite;
+            continue;
         }
-        if (inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
-            inst.GetOpcode() == IR::Opcode::WriteSharedU64) {
-            emit_barrier(emit_barrier_on_write, inst);
-            emit_barrier_on_read = true;
+        if (IsWriteShared(inst)) {
+            if (action == BarrierAction::BarrierOnWrite) {
+                IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+                ir.Barrier();
+            }
+            action = BarrierAction::BarrierOnRead;
         }
     }
 }
 
+// Inserts a barrier after divergent conditional blocks to avoid undefined
+// behavior when some threads write and others read from shared memory.
 static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
-    // Insert a barrier after divergent conditional blocks.
-    // This avoids potential softlocks and crashes when some threads
-    // initialize shared memory and others read from it.
     const IR::U1 cond = data.if_node.cond;
     const auto insert_barrier =
         IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
@@ -56,8 +65,17 @@ static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
     }
 }
 
-void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
-    if (!program.info.uses_shared || !profile.needs_lds_barriers) {
+static constexpr u32 GcnSubgroupSize = 64;
+
+void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
+                             const Profile& profile) {
+    if (program.info.stage != Stage::Compute) {
+        return;
+    }
+    const auto& cs_info = runtime_info.cs_info;
+    const u32 shared_memory_size = cs_info.shared_memory_size;
+    const u32 threadgroup_size =
+        cs_info.workgroup_size[0] * cs_info.workgroup_size[1] * cs_info.workgroup_size[2];
+    // The guest compiler can only omit barriers when the local workgroup size matches the HW
+    // subgroup size.
+    if (shared_memory_size == 0 || threadgroup_size != GcnSubgroupSize ||
+        !profile.needs_lds_barriers) {
         return;
     }
     using Type = IR::AbstractSyntaxNode::Type;
@@ -67,6 +85,7 @@ void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
             --branch_depth;
             continue;
         }
+        // Only insert barriers at branch depth zero, as we don't want them in divergent code.
         if (node.type == Type::If && branch_depth++ == 0) {
            EmitBarrierInMergeBlock(node.data);
            continue;
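Note (not part of the patch): the BarrierAction state machine only emits a
barrier on a write-then-read or read-then-write transition inside one block.
A standalone sketch of the same logic over a toy opcode list (illustrative
types, not the shadPS4 IR API):

    #include <vector>

    enum class Op { LoadShared, WriteShared, Barrier, Other };
    enum class BarrierAction { None, BarrierOnWrite, BarrierOnRead };

    // Mirrors EmitBarrierInBlock: a load arms a barrier for the next write,
    // a write arms a barrier for the next read.
    std::vector<Op> InsertBarriers(const std::vector<Op>& ops) {
        std::vector<Op> out;
        BarrierAction action{};
        for (const Op op : ops) {
            if (op == Op::LoadShared) {
                if (action == BarrierAction::BarrierOnRead) {
                    out.push_back(Op::Barrier);
                }
                action = BarrierAction::BarrierOnWrite;
            } else if (op == Op::WriteShared) {
                if (action == BarrierAction::BarrierOnWrite) {
                    out.push_back(Op::Barrier);
                }
                action = BarrierAction::BarrierOnRead;
            }
            out.push_back(op);
        }
        return out;
    }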
diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
new file mode 100644
index 000000000..517852fae
--- /dev/null
+++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
@@ -0,0 +1,109 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/profile.h"
+
+namespace Shader::Optimization {
+
+static bool IsSharedAccess(const IR::Inst& inst) {
+    const auto opcode = inst.GetOpcode();
+    switch (opcode) {
+    case IR::Opcode::LoadSharedU32:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::WriteSharedU64:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicXor32:
+        return true;
+    default:
+        return false;
+    }
+}
+
+void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
+                               const Profile& profile) {
+    if (program.info.stage != Stage::Compute) {
+        return;
+    }
+    // Only perform the transform if the host shared memory is insufficient.
+    const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
+    if (shared_memory_size <= profile.max_shared_memory_size) {
+        return;
+    }
+    // Add a buffer binding for the shared memory storage buffer.
+    const u32 binding = static_cast<u32>(program.info.buffers.size());
+    program.info.buffers.push_back({
+        .used_types = IR::Type::U32,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            const IR::U32 handle = ir.Imm32(binding);
+            // Offset the LDS address into this workgroup's slice of the buffer.
+            const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
+                                           ir.Imm32(shared_memory_size));
+            const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
+            // Replace shared atomics first
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::SharedAtomicAnd32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicIAdd32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicOr32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicXor32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
+                continue;
+            default:
+                break;
+            }
+            // Replace shared loads and stores.
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU32:
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
+                break;
+            case IR::Opcode::LoadSharedU64:
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {}));
+                break;
+            case IR::Opcode::WriteSharedU32:
+                ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
+                inst.Invalidate();
+                break;
+            case IR::Opcode::WriteSharedU64:
+                ir.StoreBufferU32(2, handle, address, inst.Arg(1), {});
+                inst.Invalidate();
+                break;
+            default:
+                break;
+            }
+        }
+    }
+}
+
+} // namespace Shader::Optimization
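Note (not part of the patch): because each workgroup indexes a disjoint slice
of the backing buffer, the pass introduces no cross-workgroup aliasing. The
effective address every rewritten access computes is, in a sketch (function
name illustrative):

    #include <cstdint>

    // Byte address inside the emulated-LDS storage buffer; shared_memory_size
    // is the per-workgroup LDS size, so slices of different workgroups never
    // overlap.
    constexpr uint32_t EmulatedLdsAddress(uint32_t workgroup_index,
                                          uint32_t shared_memory_size,
                                          uint32_t lds_addr) {
        return workgroup_index * shared_memory_size + lds_addr;
    }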
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index f7077e167..1c132ebbb 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -86,11 +86,12 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);
     Shader::Optimization::LowerBufferFormatToRaw(program);
+    Shader::Optimization::SharedMemoryToStoragePass(program, runtime_info, profile);
+    Shader::Optimization::SharedMemoryBarrierPass(program, runtime_info, profile);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
     Shader::Optimization::DeadCodeEliminationPass(program);
     Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
     Shader::Optimization::CollectShaderInfoPass(program);
-    Shader::Optimization::SharedMemoryBarrierPass(program, profile);
 
     return program;
 }
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index 525a0c9f1..5b9b647eb 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -197,6 +197,10 @@ struct Liverpool {
             return settings.lds_dwords.Value() * 128 * 4;
         }
 
+        u32 NumWorkgroups() const noexcept {
+            return dim_x * dim_y * dim_z;
+        }
+
         bool IsTgidEnabled(u32 i) const noexcept {
             return (settings.tgid_enable.Value() >> i) & 1;
         }
diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h
index ec92a0ebf..188b4b2ca 100644
--- a/src/video_core/buffer_cache/buffer.h
+++ b/src/video_core/buffer_cache/buffer.h
@@ -168,7 +168,7 @@ public:
     void Commit();
 
     /// Maps and commits a memory region with user provided data
-    u64 Copy(VAddr src, size_t size, size_t alignment = 0) {
+    u64 Copy(auto src, size_t size, size_t alignment = 0) {
         const auto [data, offset] = Map(size, alignment);
         std::memcpy(data, reinterpret_cast<const void*>(src), size);
         Commit();
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 37af62f30..ac607af28 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -5,11 +5,8 @@
 #include "common/alignment.h"
 #include "common/scope_exit.h"
 #include "common/types.h"
-#include "shader_recompiler/frontend/fetch_shader.h"
-#include "shader_recompiler/info.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/renderer_vulkan/liverpool_to_vk.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -19,7 +16,7 @@ namespace VideoCore {
 
 static constexpr size_t DataShareBufferSize = 64_KB;
 static constexpr size_t StagingBufferSize = 1_GB;
-static constexpr size_t UboStreamBufferSize = 64_MB;
+static constexpr size_t UboStreamBufferSize = 256_MB;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
                          AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
@@ -29,10 +26,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
       gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
-      lds_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, DataShareBufferSize},
       memory_tracker{&tracker} {
     Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
-    Vulkan::SetObjectName(instance.GetDevice(), lds_buffer.Handle(), "LDS Buffer");
 
     // Ensure the first slot is used for the null buffer
     const auto null_id =
@@ -251,14 +246,6 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
     });
 }
 
-std::pair<Buffer*, u64> BufferCache::ObtainHostUBO(std::span<const u32> data) {
-    static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
-    ASSERT(data.size_bytes() <= StreamThreshold);
-    const u64 offset = stream_buffer.Copy(reinterpret_cast<VAddr>(data.data()), data.size_bytes(),
-                                          instance.UniformMinAlignment());
-    return {&stream_buffer, offset};
-}
-
 std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
                                                   bool is_texel_buffer, BufferId buffer_id) {
     // For small uniform buffers that have not been modified by gpu
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 088c22c12..71a6bed2a 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -68,9 +68,9 @@ public:
         return &gds_buffer;
     }
 
-    /// Returns a pointer to LDS device local buffer.
-    [[nodiscard]] const Buffer* GetLdsBuffer() const noexcept {
-        return &lds_buffer;
+    /// Retrieves the host visible device local stream buffer.
+    [[nodiscard]] StreamBuffer& GetStreamBuffer() noexcept {
+        return stream_buffer;
     }
 
     /// Retrieves the buffer with the specified id.
@@ -90,8 +90,6 @@ public:
     /// Writes a value to GPU buffer.
     void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
 
-    [[nodiscard]] std::pair<Buffer*, u64> ObtainHostUBO(std::span<const u32> data);
-
     /// Obtains a buffer for the specified region.
     [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
                                                        bool is_texel_buffer = false,
@@ -159,7 +157,6 @@ private:
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
     Buffer gds_buffer;
-    Buffer lds_buffer;
     std::shared_mutex mutex;
     Common::SlotVector<Buffer> slot_buffers;
     RangeSet gpu_modified_ranges;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 06470a636..a9ce4b4e2 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -530,9 +530,18 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
         const auto* gds_buf = buffer_cache.GetGdsBuffer();
         buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes());
     } else if (desc.buffer_type == Shader::BufferType::ReadConstUbo) {
-        const auto [vk_buffer, offset] = buffer_cache.ObtainHostUBO(stage.flattened_ud_buf);
-        buffer_infos.emplace_back(vk_buffer->Handle(), offset,
-                                  stage.flattened_ud_buf.size() * sizeof(u32));
+        auto& vk_buffer = buffer_cache.GetStreamBuffer();
+        const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
+        const u64 offset =
+            vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size, instance.UniformMinAlignment());
+        buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size);
+    } else if (desc.buffer_type == Shader::BufferType::SharedMemory) {
+        // Bind an SSBO to act as shared memory when a workgroup buffer cannot be used.
+        auto& lds_buffer = buffer_cache.GetStreamBuffer();
+        const auto& cs_program = liverpool->GetCsRegs();
+        const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups();
+        const auto [data, offset] = lds_buffer.Map(lds_size, instance.StorageMinAlignment());
+        std::memset(data, 0, lds_size);
+        buffer_infos.emplace_back(lds_buffer.Handle(), offset, lds_size);
     } else if (instance.IsNullDescriptorSupported()) {
         buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE);
     } else {
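Note (not part of the patch): since every workgroup receives its own slice,
the mapped size scales with the dispatch dimensions, which is presumably why
UboStreamBufferSize grows from 64_MB to 256_MB above. A worked example with
assumed numbers:

    // SharedMemSize() = 32 KiB, dispatch = 16 x 16 x 1 workgroups:
    //   lds_size = 32 KiB * (16 * 16 * 1) = 8 MiB for this dispatch,
    // mapped from the stream buffer and zeroed before recording the draw.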