Mirror of https://github.com/shadps4-emu/shadPS4.git, synced 2025-08-04 16:32:39 +00:00
shader_recompiler: Move shared mem lowering into emitter
* The IR can be quite verbose during the first stages of translation, before the SSA and constant-propagation passes have run to drastically simplify it. This lowering can also be done during emission, so why not do it then and save some compilation time.
This commit is contained in:
parent 455b23c6f1
commit dd8fb9dd4b
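Before the diffs, a note on the mechanism both the removed pass and the new translator-side code share: every LDS access at a 256-byte-aligned immediate offset is redirected to a dedicated scratch VGPR allocated past the shader's declared register count. Below is a minimal, self-contained sketch of that mapping with hypothetical standalone names (the real implementation is Translator::GetScratchVgpr in the diff further down, keyed by the DS immediate offset and returning IR::VectorReg):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <unordered_map>

    // Hypothetical standalone model of the offset -> scratch VGPR mapping.
    // GCN exposes 256 VGPRs per lane, hence the hard upper bound.
    struct ScratchVgprAllocator {
        uint32_t next_vgpr;                              // first free VGPR index
        std::unordered_map<uint32_t, uint32_t> vgpr_map; // LDS offset -> VGPR

        explicit ScratchVgprAllocator(uint32_t num_allocated_vgprs)
            : next_vgpr{num_allocated_vgprs} {}

        uint32_t Get(uint32_t offset) {
            // try_emplace only inserts on first sight of this offset.
            const auto [it, is_new] = vgpr_map.try_emplace(offset);
            if (is_new) {
                assert(next_vgpr < 256 && "Out of VGPRs");
                it->second = next_vgpr++;
            }
            return it->second;
        }
    };

    int main() {
        ScratchVgprAllocator alloc{8};                      // shader already uses v0-v7
        std::printf("offset 0   -> v%u\n", alloc.Get(0));   // v8
        std::printf("offset 256 -> v%u\n", alloc.Get(256)); // v9
        std::printf("offset 0   -> v%u\n", alloc.Get(0));   // v8 again
    }

The important property is that repeated accesses to the same offset always resolve to the same register, so later loads observe earlier stores.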
CMakeLists.txt
@@ -754,7 +754,6 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/ir/passes/identity_removal_pass.cpp
     src/shader_recompiler/ir/passes/ir_passes.h
     src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp
-    src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
     src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
     src/shader_recompiler/ir/passes/ring_access_elimination.cpp
     src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
src/shader_recompiler/frontend/translate/data_share.cpp
@@ -176,6 +176,12 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64,
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     const IR::VectorReg data0{inst.src[1].code};
     const IR::VectorReg data1{inst.src[2].code};
+    if (info.stage == Stage::Fragment) {
+        const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
+                   "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(GetScratchVgpr(offset), ir.GetVectorReg(data0));
+        return;
+    }
     if (is_pair) {
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
@@ -223,6 +229,12 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64,
                          const GcnInst& inst) {
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     IR::VectorReg dst_reg{inst.dst[0].code};
+    if (info.stage == Stage::Fragment) {
+        const u64 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
+                   "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset)));
+        return;
+    }
     if (is_pair) {
         // Pair loads are either 32 or 64-bit
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
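For reference, the DS instruction encodes its immediate offset in two 8-bit fields, with offset1 holding the high byte; the 256-byte stride assumed by the asserts comes from the GCN compiler spilling one register per 64-thread group (4 bytes per lane × 64 lanes). A small worked example with hypothetical field values:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical DS_WRITE_B32 with offset1 = 0x02, offset0 = 0x00:
        const uint32_t offset1 = 0x02, offset0 = 0x00;
        const uint64_t offset = (offset1 << 8u) + offset0; // 512
        // The translator asserts 256-byte alignment before mapping the
        // offset to a scratch VGPR (4 bytes/lane * 64 lanes = 256 bytes).
        if (offset % 256 != 0) {
            std::printf("unexpected alignment: %llu\n", (unsigned long long)offset);
            return 1;
        }
        std::printf("offset %llu -> scratch slot %llu\n",
                    (unsigned long long)offset,
                    (unsigned long long)(offset / 256)); // slot 2
    }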
src/shader_recompiler/frontend/translate/translate.cpp
@@ -4,7 +4,6 @@
 #include "common/config.h"
 #include "common/io_file.h"
 #include "common/path_util.h"
-#include "shader_recompiler/exception.h"
 #include "shader_recompiler/frontend/fetch_shader.h"
 #include "shader_recompiler/frontend/translate/translate.h"
 #include "shader_recompiler/info.h"
@@ -21,9 +20,14 @@

 namespace Shader::Gcn {

+static u32 next_vgpr_num;
+static std::unordered_map<u32, IR::VectorReg> vgpr_map;
+
 Translator::Translator(IR::Block* block_, Info& info_, const RuntimeInfo& runtime_info_,
                        const Profile& profile_)
-    : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {}
+    : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} {
+    next_vgpr_num = vgpr_map.empty() ? runtime_info.num_allocated_vgprs : next_vgpr_num;
+}

 void Translator::EmitPrologue() {
     ir.Prologue();
@@ -179,8 +183,21 @@ void Translator::EmitPrologue() {
     default:
         UNREACHABLE_MSG("Unknown shader stage");
     }
+
+    // Clear any scratch vgpr mappings for next shader.
+    vgpr_map.clear();
 }

+IR::VectorReg Translator::GetScratchVgpr(u32 offset) {
+    const auto [it, is_new] = vgpr_map.try_emplace(offset);
+    if (is_new) {
+        ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
+        const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
+        it->second = new_vgpr;
+    }
+    return it->second;
+}
+
 template <typename T>
 T Translator::GetSrc(const InstOperand& operand) {
     constexpr bool is_float = std::is_same_v<T, IR::F32>;
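A note on the design choice visible above: a Translator is constructed per IR block, but scratch assignments must stay stable across all blocks of one shader, so the counter and map live as translation-unit statics. The constructor re-seeds the counter only when the map is empty, and EmitPrologue clears the map so the next shader starts fresh. A rough standalone model of that lifetime, with hypothetical names:

    #include <unordered_map>

    static unsigned next_vgpr_num;
    static std::unordered_map<unsigned, unsigned> vgpr_map;

    // Runs for every block: only re-seed when no mappings exist yet,
    // i.e. before the first scratch-using point of a new shader.
    void OnTranslatorConstructed(unsigned num_allocated_vgprs) {
        next_vgpr_num = vgpr_map.empty() ? num_allocated_vgprs : next_vgpr_num;
    }

    // Runs once per shader: drop mappings so the next shader re-seeds.
    void OnEmitPrologue() {
        vgpr_map.clear();
    }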
src/shader_recompiler/frontend/translate/translate.h
@@ -317,6 +317,8 @@ private:

     void LogMissingOpcode(const GcnInst& inst);

+    IR::VectorReg GetScratchVgpr(u32 offset);
+
 private:
     IR::IREmitter ir;
     Info& info;
src/shader_recompiler/ir/passes/ir_passes.h
@@ -20,7 +20,6 @@ void FlattenExtendedUserdataPass(IR::Program& program);
 void ResourceTrackingPass(IR::Program& program);
 void CollectShaderInfoPass(IR::Program& program);
 void LowerBufferFormatToRaw(IR::Program& program);
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info);
 void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info,
                            Stage stage);
 void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info);
src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp (deleted)
@@ -1,81 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <unordered_map>
-
-#include "shader_recompiler/ir/ir_emitter.h"
-#include "shader_recompiler/ir/program.h"
-
-namespace Shader::Optimization {
-
-static bool IsSharedMemoryInst(const IR::Inst& inst) {
-    const auto opcode = inst.GetOpcode();
-    return opcode == IR::Opcode::LoadSharedU32 || opcode == IR::Opcode::LoadSharedU64 ||
-           opcode == IR::Opcode::WriteSharedU32 || opcode == IR::Opcode::WriteSharedU64;
-}
-
-static u32 GetSharedMemImmOffset(const IR::Inst& inst) {
-    const auto* address = inst.Arg(0).InstRecursive();
-    ASSERT(address->GetOpcode() == IR::Opcode::IAdd32);
-    const auto ir_offset = address->Arg(1);
-    ASSERT(ir_offset.IsImmediate());
-    const auto offset = ir_offset.U32();
-    // Typical usage is the compiler spilling registers into shared memory, with 256 bytes between
-    // each register to account for 4 bytes per register times 64 threads per group. Ensure that
-    // this assumption holds, as if it does not this approach may need to be revised.
-    ASSERT_MSG(offset % 256 == 0, "Unexpected shared memory offset alignment: {}", offset);
-    return offset;
-}
-
-static void ConvertSharedMemToVgpr(IR::IREmitter& ir, IR::Inst& inst, const IR::VectorReg vgpr) {
-    switch (inst.GetOpcode()) {
-    case IR::Opcode::LoadSharedU32:
-        inst.ReplaceUsesWithAndRemove(ir.GetVectorReg(vgpr));
-        break;
-    case IR::Opcode::LoadSharedU64:
-        inst.ReplaceUsesWithAndRemove(
-            ir.CompositeConstruct(ir.GetVectorReg(vgpr), ir.GetVectorReg(vgpr + 1)));
-        break;
-    case IR::Opcode::WriteSharedU32:
-        ir.SetVectorReg(vgpr, IR::U32{inst.Arg(1)});
-        inst.Invalidate();
-        break;
-    case IR::Opcode::WriteSharedU64: {
-        const auto value = inst.Arg(1);
-        ir.SetVectorReg(vgpr, IR::U32{ir.CompositeExtract(value, 0)});
-        ir.SetVectorReg(vgpr + 1, IR::U32{ir.CompositeExtract(value, 1)});
-        inst.Invalidate();
-        break;
-    }
-    default:
-        UNREACHABLE_MSG("Unknown shared memory opcode: {}", inst.GetOpcode());
-    }
-}
-
-void LowerSharedMemToRegisters(IR::Program& program, const RuntimeInfo& runtime_info) {
-    u32 next_vgpr_num = runtime_info.num_allocated_vgprs;
-    std::unordered_map<u32, IR::VectorReg> vgpr_map;
-    const auto get_vgpr = [&next_vgpr_num, &vgpr_map](const u32 offset) {
-        const auto [it, is_new] = vgpr_map.try_emplace(offset);
-        if (is_new) {
-            ASSERT_MSG(next_vgpr_num < 256, "Out of VGPRs");
-            const auto new_vgpr = static_cast<IR::VectorReg>(next_vgpr_num++);
-            it->second = new_vgpr;
-        }
-        return it->second;
-    };
-
-    for (IR::Block* const block : program.blocks) {
-        for (IR::Inst& inst : block->Instructions()) {
-            if (!IsSharedMemoryInst(inst)) {
-                continue;
-            }
-            const auto offset = GetSharedMemImmOffset(inst);
-            const auto vgpr = get_vgpr(offset);
-            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
-            ConvertSharedMemToVgpr(ir, inst, vgpr);
-        }
-    }
-}
-
-} // namespace Shader::Optimization
src/shader_recompiler/recompiler.cpp
@@ -65,10 +65,6 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     // Run optimization passes
     const auto stage = program.info.stage;

-    if (stage == Stage::Fragment) {
-        // Before SSA pass, as it will rewrite to VGPR load/store.
-        Shader::Optimization::LowerSharedMemToRegisters(program, runtime_info);
-    }
     Shader::Optimization::SsaRewritePass(program.post_order_blocks);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
     if (info.l_stage == LogicalStage::TessellationControl) {