diff --git a/CMakeLists.txt b/CMakeLists.txt index 506198e1a..640bb86b5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -754,7 +754,6 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp - src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp src/shader_recompiler/ir/passes/resource_tracking_pass.cpp src/shader_recompiler/ir/passes/ring_access_elimination.cpp src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 62c0423dd..5904122e2 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -176,6 +176,12 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; + if (info.stage == Stage::Fragment) { + const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; + ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset); + ir.SetVectorReg(GetScratchVgpr(offset), ir.GetVectorReg(data0)); + return; + } if (is_pair) { const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); @@ -223,6 +229,12 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride const GcnInst& inst) { const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; IR::VectorReg dst_reg{inst.dst[0].code}; + if (info.stage == Stage::Fragment) { + const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; + ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset); + ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset))); + return; + } if (is_pair) { // Pair loads are either 32 or 64-bit const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 7f5504663..877cf4a54 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -4,7 +4,6 @@ #include "common/config.h" #include "common/io_file.h" #include "common/path_util.h" -#include "shader_recompiler/exception.h" #include "shader_recompiler/frontend/fetch_shader.h" #include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/info.h" @@ -21,9 +20,14 @@ namespace Shader::Gcn { +static u32 next_vgpr_num; +static std::unordered_map vgpr_map; + Translator::Translator(IR::Block* block_, Info& info_, const RuntimeInfo& runtime_info_, const Profile& profile_) - : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {} + : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} { + next_vgpr_num = vgpr_map.empty() ? runtime_info.num_allocated_vgprs : next_vgpr_num; +} void Translator::EmitPrologue() { ir.Prologue(); @@ -179,8 +183,21 @@ void Translator::EmitPrologue() { default: UNREACHABLE_MSG("Unknown shader stage"); } + + // Clear any scratch vgpr mappings for next shader. + vgpr_map.clear(); } +IR::VectorReg Translator::GetScratchVgpr(u32 offset) { + const auto [it, is_new] = vgpr_map.try_emplace(offset); + if (is_new) { + ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs"); + const auto new_vgpr = static_cast(next_vgpr_num++); + it->second = new_vgpr; + } + return it->second; +}; + template T Translator::GetSrc(const InstOperand& operand) { constexpr bool is_float = std::is_same_v; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 287885854..4bb3e5762 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -317,6 +317,8 @@ private: void LogMissingOpcode(const GcnInst& inst); + IR::VectorReg GetScratchVgpr(u32 offset); + private: IR::IREmitter ir; Info& info; diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 3c98579a0..e64622405 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -20,7 +20,6 @@ void FlattenExtendedUserdataPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); void LowerBufferFormatToRaw(IR::Program& program); -void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info); void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, Stage stage); void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info); diff --git a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp deleted file mode 100644 index 23963a991..000000000 --- a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include - -#include "shader_recompiler/ir/ir_emitter.h" -#include "shader_recompiler/ir/program.h" - -namespace Shader::Optimization { - -static bool IsSharedMemoryInst(const IR::Inst& inst) { - const auto opcode = inst.GetOpcode(); - return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 || - opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64; -} - -static u32 GetSharedMemImmOffset(const IR::Inst& inst) { - const auto* address = inst.Arg(0).InstRecursive(); - ASSERT(address->GetOpcode() == IR::Opcode::IAdd32); - const auto ir_offset = address->Arg(1); - ASSERT_MSG(ir_offset.IsImmediate()); - const auto offset = ir_offset.U32(); - // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between - // each register to account for 4 bytes per register times 64 threads per group. Ensure that - // this assumption holds, as if it does not this approach may need to be revised. - ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset); - return offset; -} - -static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) { - switch (inst.GetOpcode()) { - case IR::Opcode::LoadSharedU32: - inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr)); - break; - case IR::Opcode::LoadSharedU64: - inst.ReplaceUsesWithAndRemove( - ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1))); - break; - case IR::Opcode::WriteSharedU32: - ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)}); - inst.Invalidate(); - break; - case IR::Opcode::WriteSharedU64: { - const auto value = inst.Arg(1); - ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)}); - ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 1)}); - inst.Invalidate(); - break; - } - default: - UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode()); - } -} - -void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) { - u32 next_vgpr_num = runtime_info.num_allocated_vgprs; - std::unordered_map vgpr_map; - const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) { - const auto [it, is_new] = vgpr_map.try_emplace(offset); - if (is_new) { - ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs"); - const auto new_vgpr = static_cast(next_vgpr_num++); - it->second = new_vgpr; - } - return it->second; - }; - - for (IR::Block* const block : program.blocks) { - for (IR::Inst& inst : block->Instructions()) { - if (!IsSharedMemoryInst(inst)) { - continue; - } - const auto offset = GetSharedMemImmOffset(inst); - const auto vgpr = get_vgpr(offset); - IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; - ConvertSharedMemToVgpr(ir, inst, vgpr); - } - } -} - -} // namespace Shader::Optimization diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 5a6d1d775..f7077e167 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -65,10 +65,6 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info // Run optimization passes const auto stage = program.info.stage; - if (stage == Stage::Fragment) { - // Before SSA pass, as it will rewrite to VGPR load/store. - Shader::Optimization::LowerSharedMemToRegisters(program, runtime_info); - } Shader::Optimization::SsaRewritePass(program.post_order_blocks); Shader::Optimization::IdentityRemovalPass(program.blocks); if (info.l_stage == LogicalStage::TessellationControl) {