shader_recompiler: Move shared mem lowering into emitter

* IR can be quite verbose during the first stages of translation, before the SSA and constant-propagation passes that drastically simplify it have run. This lowering can also be done during emission, so doing it there saves some compilation time.
IndecisiveTurtle 2025-02-13 19:55:52 +02:00
parent 455b23c6f1
commit dd8fb9dd4b
7 changed files with 33 additions and 89 deletions
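In short, the lowering works like this: the fragment shaders this targets only use LDS as a spill area, with one 32-bit value per 256 bytes (4 bytes per lane times 64 lanes per group), so every 256-byte-aligned offset can be backed by a single scratch VGPR allocated past the VGPRs the shader already uses, and the DS_READ/DS_WRITE translation can apply that mapping directly instead of running a separate IR pass. Below is a standalone sketch of that bookkeeping, not code from this commit; the names and the num_allocated_vgprs value are illustrative only.

// Toy model of the LDS-offset -> scratch-VGPR mapping that this commit moves into the translator.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>

int main() {
    const std::uint32_t num_allocated_vgprs = 24; // assumed; the real value comes from RuntimeInfo
    std::uint32_t next_vgpr = num_allocated_vgprs;
    std::unordered_map<std::uint32_t, std::uint32_t> vgpr_map; // LDS byte offset -> scratch VGPR index

    const auto get_scratch_vgpr = [&](std::uint32_t offset) {
        assert(offset % 256 == 0 && "unexpected shared memory offset alignment");
        const auto [it, is_new] = vgpr_map.try_emplace(offset);
        if (is_new) {
            assert(next_vgpr < 256 && "out of VGPRs");
            it->second = next_vgpr++;
        }
        return it->second;
    };

    // A ds_write_b32 at offset:256 followed by a ds_read_b32 at offset:256 becomes a move into
    // and back out of the same scratch VGPR, with no shared-memory IR ever emitted.
    std::cout << "offset 0   -> v" << get_scratch_vgpr(0) << '\n';
    std::cout << "offset 256 -> v" << get_scratch_vgpr(256) << '\n';
    std::cout << "offset 256 -> v" << get_scratch_vgpr(256) << " (same register reused)\n";
}

The diff below does the same thing inside Translator::GetScratchVgpr and the DS_READ/DS_WRITE handlers.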

@@ -754,7 +754,6 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/ir/passes/identity_removal_pass.cpp
     src/shader_recompiler/ir/passes/ir_passes.h
     src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
-    src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
     src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
     src/shader_recompiler/ir/passes/ring_access_elimination.cpp
     src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp

@@ -176,6 +176,12 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64,
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     const IR::VectorReg data0{inst.src[1].code};
     const IR::VectorReg data1{inst.src[2].code};
+    if (info.stage == Stage::Fragment) {
+        const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(GetScratchVgpr(offset), ir.GetVectorReg(data0));
+        return;
+    }
     if (is_pair) {
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
@@ -223,6 +229,12 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64,
                          const GcnInst& inst) {
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     IR::VectorReg dst_reg{inst.dst[0].code};
+    if (info.stage == Stage::Fragment) {
+        const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset)));
+        return;
+    }
     if (is_pair) {
         // Pair loads are either 32 or 64-bit
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);

@@ -4,7 +4,6 @@
 #include "common/config.h"
 #include "common/io_file.h"
 #include "common/path_util.h"
-#include "shader_recompiler/exception.h"
 #include "shader_recompiler/frontend/fetch_shader.h"
 #include "shader_recompiler/frontend/translate/translate.h"
 #include "shader_recompiler/info.h"
@@ -21,9 +20,14 @@
 namespace Shader::Gcn {
 
+static u32 next_vgpr_num;
+static std::unordered_map<u32, IR::VectorReg> vgpr_map;
+
 Translator::Translator(IR::Block* block_, Info& info_, const RuntimeInfo& runtime_info_,
                        const Profile& profile_)
-    : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {}
+    : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {
+    next_vgpr_num = vgpr_map.empty() ? runtime_info.num_allocated_vgprs : next_vgpr_num;
+}
 
 void Translator::EmitPrologue() {
     ir.Prologue();
@@ -179,8 +183,21 @@ void Translator::EmitPrologue() {
     default:
         UNREACHABLE_MSG("Unknown shader stage");
     }
+
+    // Clear any scratch vgpr mappings for next shader.
+    vgpr_map.clear();
 }
 
+IR::VectorReg Translator::GetScratchVgpr(u32 offset) {
+    const auto [it, is_new] = vgpr_map.try_emplace(offset);
+    if (is_new) {
+        ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
+        const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
+        it->second = new_vgpr;
+    }
+    return it->second;
+};
+
 template <typename T>
 T Translator::GetSrc(const InstOperand& operand) {
     constexpr bool is_float = std::is_same_v<T, IR::F32>;

@@ -317,6 +317,8 @@ private:
     void LogMissingOpcode(const GcnInst& inst);
 
+    IR::VectorReg GetScratchVgpr(u32 offset);
+
 private:
     IR::IREmitter ir;
     Info& info;

@@ -20,7 +20,6 @@ void FlattenExtendedUserdataPass(IR::Program& program);
 void ResourceTrackingPass(IR::Program& program);
 void CollectShaderInfoPass(IR::Program& program);
 void LowerBufferFormatToRaw(IR::Program& program);
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info);
 void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
                            Stage stage);
 void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);

@@ -1,81 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <unordered_map>
-
-#include "shader_recompiler/ir/ir_emitter.h"
-#include "shader_recompiler/ir/program.h"
-
-namespace Shader::Optimization {
-
-static bool IsSharedMemoryInst(const IR::Inst& inst) {
-    const auto opcode = inst.GetOpcode();
-    return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 ||
-           opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64;
-}
-
-static u32 GetSharedMemImmOffset(const IR::Inst& inst) {
-    const auto* address = inst.Arg(0).InstRecursive();
-    ASSERT(address->GetOpcode() == IR::Opcode::IAdd32);
-    const auto ir_offset = address->Arg(1);
-    ASSERT_MSG(ir_offset.IsImmediate());
-    const auto offset = ir_offset.U32();
-    // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between
-    // each register to account for 4 bytes per register times 64 threads per group. Ensure that
-    // this assumption holds, as if it does not this approach may need to be revised.
-    ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
-    return offset;
-}
-
-static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) {
-    switch (inst.GetOpcode()) {
-    case IR::Opcode::LoadSharedU32:
-        inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr));
-        break;
-    case IR::Opcode::LoadSharedU64:
-        inst.ReplaceUsesWithAndRemove(
-            ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1)));
-        break;
-    case IR::Opcode::WriteSharedU32:
-        ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)});
-        inst.Invalidate();
-        break;
-    case IR::Opcode::WriteSharedU64: {
-        const auto value = inst.Arg(1);
-        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)});
-        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 1)});
-        inst.Invalidate();
-        break;
-    }
-    default:
-        UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode());
-    }
-}
-
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) {
-    u32 next_vgpr_num = runtime_info.num_allocated_vgprs;
-    std::unordered_map<u32, IR::VectorReg> vgpr_map;
-    const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) {
-        const auto [it, is_new] = vgpr_map.try_emplace(offset);
-        if (is_new) {
-            ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
-            const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
-            it->second = new_vgpr;
-        }
-        return it->second;
-    };
-
-    for (IR::Block* const block : program.blocks) {
-        for (IR::Inst& inst : block->Instructions()) {
-            if (!IsSharedMemoryInst(inst)) {
-                continue;
-            }
-            const auto offset = GetSharedMemImmOffset(inst);
-            const auto vgpr = get_vgpr(offset);
-            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-            ConvertSharedMemToVgpr(ir, inst, vgpr);
-        }
-    }
-}
-
-} // namespace Shader::Optimization

@@ -65,10 +65,6 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     // Run optimization passes
     const auto stage = program.info.stage;
-    if (stage == Stage::Fragment) {
-        // Before SSA pass, as it will rewrite to VGPR load/store.
-        Shader::Optimization::LowerSharedMemToRegisters(program, runtime_info);
-    }
     Shader::Optimization::SsaRewritePass(program.post_order_blocks);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
     if (info.l_stage == LogicalStage::TessellationControl) {