From dd8fb9dd4b05c23724fa5e37baafd943ac542ce0 Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Thu, 13 Feb 2025 19:55:52 +0200 Subject: [PATCH] shader_recompiler: Move shared mem lowering into emitter * IR can be quite verbose during the first stages of translation, before the SSA and constant propagation passes, which drastically simplify it, have run. This lowering can also be done during emission, so do it there to save some compilation time. --- CMakeLists.txt | 1 - .../frontend/translate/data_share.cpp | 12 +++ .../frontend/translate/translate.cpp | 21 ++++- .../frontend/translate/translate.h | 2 + src/shader_recompiler/ir/passes/ir_passes.h | 1 - .../passes/lower_shared_mem_to_registers.cpp | 81 ------------------- src/shader_recompiler/recompiler.cpp | 4 - 7 files changed, 33 insertions(+), 89 deletions(-) delete mode 100644 src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 506198e1a..640bb86b5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -754,7 +754,6 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp - src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp src/shader_recompiler/ir/passes/resource_tracking_pass.cpp src/shader_recompiler/ir/passes/ring_access_elimination.cpp src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 62c0423dd..5904122e2 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -176,6 +176,12 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid const IR::U32 
addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; + if (info.stage == Stage::Fragment) { + const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; + ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset); + ir.SetVectorReg(GetScratchVgpr(offset), ir.GetVectorReg(data0)); + return; + } if (is_pair) { const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); @@ -223,6 +229,12 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride const GcnInst& inst) { const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; IR::VectorReg dst_reg{inst.dst[0].code}; + if (info.stage == Stage::Fragment) { + const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; + ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset); + ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset))); + return; + } if (is_pair) { // Pair loads are either 32 or 64-bit const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 
64 : 1); diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 7f5504663..877cf4a54 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -4,7 +4,6 @@ #include "common/config.h" #include "common/io_file.h" #include "common/path_util.h" -#include "shader_recompiler/exception.h" #include "shader_recompiler/frontend/fetch_shader.h" #include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/info.h" @@ -21,9 +20,14 @@ namespace Shader::Gcn { +static u32 next_vgpr_num; +static std::unordered_map vgpr_map; + Translator::Translator(IR::Block* block_, Info& info_, const RuntimeInfo& runtime_info_, const Profile& profile_) - : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {} + : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} { + next_vgpr_num = vgpr_map.empty() ? runtime_info.num_allocated_vgprs : next_vgpr_num; +} void Translator::EmitPrologue() { ir.Prologue(); @@ -179,8 +183,21 @@ void Translator::EmitPrologue() { default: UNREACHABLE_MSG("Unknown shader stage"); } + + // Clear any scratch vgpr mappings for next shader. 
+ vgpr_map.clear(); } +IR::VectorReg Translator::GetScratchVgpr(u32 offset) { + const auto [it, is_new] = vgpr_map.try_emplace(offset); + if (is_new) { + ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs"); + const auto new_vgpr = static_cast(next_vgpr_num++); + it->second = new_vgpr; + } + return it->second; +}; + template T Translator::GetSrc(const InstOperand& operand) { constexpr bool is_float = std::is_same_v; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 287885854..4bb3e5762 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -317,6 +317,8 @@ private: void LogMissingOpcode(const GcnInst& inst); + IR::VectorReg GetScratchVgpr(u32 offset); + private: IR::IREmitter ir; Info& info; diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 3c98579a0..e64622405 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -20,7 +20,6 @@ void FlattenExtendedUserdataPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); void LowerBufferFormatToRaw(IR::Program& program); -void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info); void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, Stage stage); void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info); diff --git a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp deleted file mode 100644 index 23963a991..000000000 --- a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - 
-#include - -#include "shader_recompiler/ir/ir_emitter.h" -#include "shader_recompiler/ir/program.h" - -namespace Shader::Optimization { - -static bool IsSharedMemoryInst(const IR::Inst& inst) { - const auto opcode = inst.GetOpcode(); - return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 || - opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64; -} - -static u32 GetSharedMemImmOffset(const IR::Inst& inst) { - const auto* address = inst.Arg(0).InstRecursive(); - ASSERT(address->GetOpcode() == IR::Opcode::IAdd32); - const auto ir_offset = address->Arg(1); - ASSERT_MSG(ir_offset.IsImmediate()); - const auto offset = ir_offset.U32(); - // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between - // each register to account for 4 bytes per register times 64 threads per group. Ensure that - // this assumption holds, as if it does not this approach may need to be revised. - ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset); - return offset; -} - -static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) { - switch (inst.GetOpcode()) { - case IR::Opcode::LoadSharedU32: - inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr)); - break; - case IR::Opcode::LoadSharedU64: - inst.ReplaceUsesWithAndRemove( - ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1))); - break; - case IR::Opcode::WriteSharedU32: - ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)}); - inst.Invalidate(); - break; - case IR::Opcode::WriteSharedU64: { - const auto value = inst.Arg(1); - ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)}); - ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 1)}); - inst.Invalidate(); - break; - } - default: - UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode()); - } -} - -void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) { - u32 
next_vgpr_num = runtime_info.num_allocated_vgprs; - std::unordered_map vgpr_map; - const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) { - const auto [it, is_new] = vgpr_map.try_emplace(offset); - if (is_new) { - ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs"); - const auto new_vgpr = static_cast(next_vgpr_num++); - it->second = new_vgpr; - } - return it->second; - }; - - for (IR::Block* const block : program.blocks) { - for (IR::Inst& inst : block->Instructions()) { - if (!IsSharedMemoryInst(inst)) { - continue; - } - const auto offset = GetSharedMemImmOffset(inst); - const auto vgpr = get_vgpr(offset); - IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; - ConvertSharedMemToVgpr(ir, inst, vgpr); - } - } -} - -} // namespace Shader::Optimization diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 5a6d1d775..f7077e167 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -65,10 +65,6 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info // Run optimization passes const auto stage = program.info.stage; - if (stage == Stage::Fragment) { - // Before SSA pass, as it will rewrite to VGPR load/store. - Shader::Optimization::LowerSharedMemToRegisters(program, runtime_info); - } Shader::Optimization::SsaRewritePass(program.post_order_blocks); Shader::Optimization::IdentityRemovalPass(program.blocks); if (info.l_stage == LogicalStage::TessellationControl) {