shader_recompiler: Rework sharp tracking for robustness (#3327)

* shader_recompiler: Remove remnants of old discard

Also constant-propagate conditional discards when the condition is constant
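
For illustration, the fold in pseudo-IR (hypothetical listing, not the recompiler's exact dump format):

    DiscardCond (Imm1 true)   ->  Discard
    DiscardCond (Imm1 false)  ->  (instruction removed)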

* resource_tracking_pass: Rework sharp tracking for robustness

* resource_tracking_pass: Add source dominance analysis

When reachability alone is not enough to prune the source list, check whether a source dominates all other sources
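
A minimal standalone sketch of that pruning test on a toy CFG (illustrative types, not the pass's Gcn::Block structures): a candidate source can be dropped when every path from its block to the consuming block passes through another source's block, since the later definition always overwrites it.

    #include <queue>
    #include <vector>

    // Toy CFG node: up to two successors, -1 meaning none.
    struct ToyBlock {
        int succ[2]{-1, -1};
    };

    // True iff every path from `from` to `use` passes through `dom`.
    // BFS from `from` refuses to expand past `dom`; if `use` is still
    // reachable, some path avoids `dom`.
    bool CoversAllPaths(const std::vector<ToyBlock>& cfg, int dom, int from, int use) {
        if (from == dom) {
            return true;
        }
        std::vector<bool> visited(cfg.size(), false);
        std::queue<int> queue;
        queue.push(from);
        visited[from] = true;
        while (!queue.empty()) {
            const int block = queue.front();
            queue.pop();
            if (block == use) {
                return false; // reached the use without crossing dom
            }
            if (block == dom) {
                continue; // cut off paths that cross dom
            }
            for (const int succ : cfg[block].succ) {
                if (succ >= 0 && !visited[succ]) {
                    visited[succ] = true;
                    queue.push(succ);
                }
            }
        }
        return true; // every path to the use runs through dom
    }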

* resource_tracking_pass: Fix immediate check

How did this work before?

* resource_tracking_pass: Remove unused template type

* readlane_elimination_pass: Don't add phi when all args are the same

New sharp tracking exposed some bad sources on sampler sharps using the aniso-disable pattern that were also part of a readlane pattern; fix tracking by removing the unnecessary phis in between
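
A minimal sketch of the collapse rule (hypothetical value type, not the recompiler's IR::Value): a phi whose incoming values all resolve to the same definition is replaced with that definition instead of being materialized.

    #include <algorithm>
    #include <vector>

    using ValueId = int; // stand-in for a resolved IR value

    // Returns the unique incoming value when the phi is trivial,
    // or -1 when a real phi is required.
    ValueId TryCollapsePhi(const std::vector<ValueId>& incoming) {
        if (incoming.empty()) {
            return -1;
        }
        const ValueId first = incoming.front();
        const bool all_same = std::all_of(incoming.begin(), incoming.end(),
                                          [&](const ValueId v) { return v == first; });
        return all_same ? first : -1;
    }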

* resource_tracking_pass: Allow phi in disable aniso pattern

* resource_tracking_pass: Handle invalid buffer sharps and more phis in the aniso pattern
TheTurtle 2025-07-28 23:32:16 +03:00 committed by GitHub
parent d286631798
commit 93767ae31b
20 changed files with 304 additions and 243 deletions


@@ -893,6 +893,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/ir/attribute.h
src/shader_recompiler/ir/basic_block.cpp
src/shader_recompiler/ir/basic_block.h
src/shader_recompiler/ir/breadth_first_search.h
src/shader_recompiler/ir/condition.h
src/shader_recompiler/ir/ir_emitter.cpp
src/shader_recompiler/ir/ir_emitter.h


@@ -945,11 +945,11 @@ void EmitContext::DefineImagesAndSamplers() {
const Id id{AddGlobalVariable(sampler_pointer_type, spv::StorageClass::UniformConstant)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
auto sharp_desc = std::holds_alternative<u32>(samp_desc.sampler)
? fmt::format("sgpr:{}", std::get<u32>(samp_desc.sampler))
: fmt::format("inline:{:#x}:{:#x}",
std::get<AmdGpu::Sampler>(samp_desc.sampler).raw0,
std::get<AmdGpu::Sampler>(samp_desc.sampler).raw1);
const auto sharp_desc =
samp_desc.is_inline_sampler
? fmt::format("inline:{:#x}:{:#x}", samp_desc.inline_sampler.raw0,
samp_desc.inline_sampler.raw1)
: fmt::format("sgpr:{}", samp_desc.sharp_idx);
Name(id, fmt::format("{}_{}{}", stage, "samp", sharp_desc));
samplers.push_back(id);
interfaces.push_back(id);


@@ -2,7 +2,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <unordered_map>
#include "common/assert.h"
#include "common/logging/log.h"
#include "shader_recompiler/frontend/control_flow_graph.h"
@@ -350,19 +349,7 @@ void CFG::LinkBlocks() {
block.branch_false = end_block;
block.end_class = EndClass::Branch;
} else if (end_inst.opcode == Opcode::S_ENDPGM) {
const auto& prev_inst = inst_list[block.end_index - 1];
if (prev_inst.opcode == Opcode::EXP && prev_inst.control.exp.en == 0) {
if (prev_inst.control.exp.target != 9) {
block.end_class = EndClass::Kill;
} else if (const auto& exec_mask = inst_list[block.end_index - 2];
exec_mask.src[0].field == OperandField::ConstZero) {
block.end_class = EndClass::Kill;
} else {
block.end_class = EndClass::Exit;
}
} else {
block.end_class = EndClass::Exit;
}
block.end_class = EndClass::Exit;
} else {
UNREACHABLE();
}
@@ -403,12 +390,6 @@ std::string CFG::Dot() const {
fmt::format("\t\tN{} [label=\"Exit\"][shape=square][style=stripped];\n", node_uid);
++node_uid;
break;
case EndClass::Kill:
dot += fmt::format("\t\t{}->N{};\n", name, node_uid);
dot +=
fmt::format("\t\tN{} [label=\"Kill\"][shape=square][style=stripped];\n", node_uid);
++node_uid;
break;
}
}
dot += "\t\tlabel = \"main\";\n\t}\n";


@@ -23,7 +23,6 @@ using Hook =
enum class EndClass {
Branch, ///< Block ends with a (un)conditional branch.
Exit, ///< Block ends with an exit instruction.
Kill, ///< Block ends with a discard instruction.
};
/// A block represents a linear range of instructions.


@@ -39,7 +39,6 @@ enum class StatementType {
Loop,
Break,
Return,
Kill,
Unreachable,
Function,
Identity,
@@ -88,7 +87,6 @@ struct Statement : ListBaseHook {
Statement(Break, Statement* cond_, Statement* up_)
: cond{cond_}, up{up_}, type{StatementType::Break} {}
Statement(Return, Statement* up_) : up{up_}, type{StatementType::Return} {}
Statement(Kill, Statement* up_) : up{up_}, type{StatementType::Kill} {}
Statement(Unreachable, Statement* up_) : up{up_}, type{StatementType::Unreachable} {}
Statement(FunctionTag) : children{}, type{StatementType::Function} {}
Statement(Identity, IR::Condition cond_, Statement* up_)
@@ -174,9 +172,6 @@ std::string DumpExpr(const Statement* stmt) {
case StatementType::Return:
ret += fmt::format("{} return;\n", indent);
break;
case StatementType::Kill:
ret += fmt::format("{} kill;\n", indent);
break;
case StatementType::Unreachable:
ret += fmt::format("{} unreachable;\n", indent);
break;
@@ -335,9 +330,9 @@ private:
}
}
// Expensive operation:
if (!AreSiblings(goto_stmt, label_stmt)) {
UNREACHABLE_MSG("Goto is not a sibling with the label");
}
// if (!AreSiblings(goto_stmt, label_stmt)) {
// UNREACHABLE_MSG("Goto is not a sibling with the label");
//}
// goto_stmt and label_stmt are guaranteed to be siblings, eliminate
if (std::next(goto_stmt) == label_stmt) {
// Simply eliminate the goto if the label is next to it
@@ -410,9 +405,6 @@ private:
case EndClass::Exit:
root.insert(ip, *pool.Create(Return{}, &root_stmt));
break;
case EndClass::Kill:
root.insert(ip, *pool.Create(Kill{}, &root_stmt));
break;
}
}
}
@@ -637,6 +629,7 @@ private:
if (!stmt.block->is_dummy) {
const u32 start = stmt.block->begin_index;
const u32 size = stmt.block->end_index - start + 1;
current_block->cfg_block = stmt.block;
translator.Translate(current_block, stmt.block->begin,
inst_list.subspan(start, size));
}
@@ -770,18 +763,6 @@ private:
syntax_list.emplace_back().type = IR::AbstractSyntaxNode::Type::Return;
break;
}
case StatementType::Kill: {
ensure_block();
IR::Block* demote_block{MergeBlock(parent, stmt)};
IR::IREmitter{*current_block}.Discard();
current_block->AddBranch(demote_block);
current_block = demote_block;
auto& merge{syntax_list.emplace_back()};
merge.type = IR::AbstractSyntaxNode::Type::Block;
merge.data.block = demote_block;
break;
}
case StatementType::Unreachable: {
ensure_block();
current_block = nullptr;
@@ -789,7 +770,7 @@ private:
break;
}
default:
throw NotImplementedException("Statement type {}", u32(stmt.type));
UNREACHABLE_MSG("Statement type {}", u32(stmt.type));
}
}
if (current_block) {


@@ -6,7 +6,7 @@
namespace Shader::Gcn {
void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) {
void Translator::EmitFlowControl(const GcnInst& inst) {
switch (inst.opcode) {
case Opcode::S_BARRIER:
return S_BARRIER();
@@ -20,7 +20,7 @@ void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) {
LOG_WARNING(Render_Vulkan, "S_TRAP instruction!");
return;
case Opcode::S_GETPC_B64:
return S_GETPC_B64(pc, inst);
return S_GETPC_B64(inst);
case Opcode::S_SETPC_B64:
case Opcode::S_WAITCNT:
case Opcode::S_NOP:
@@ -45,9 +45,7 @@ void Translator::S_BARRIER() {
ir.Barrier();
}
void Translator::S_GETPC_B64(u32 pc, const GcnInst& inst) {
// This only really exists to let resource tracking pass know
// there is an inline cbuf.
void Translator::S_GETPC_B64(const GcnInst& inst) {
const IR::ScalarReg dst{inst.dst[0].code};
ir.SetScalarReg(dst, ir.Imm32(pc));
ir.SetScalarReg(dst + 1, ir.Imm32(0));


@@ -520,14 +520,13 @@ void Translator::EmitFetch(const GcnInst& inst) {
GcnDecodeContext decoder;
// Decode and save instructions
u32 sub_pc = 0;
while (!slice.atEnd()) {
const auto sub_inst = decoder.decodeInstruction(slice);
if (sub_inst.opcode == Opcode::S_SETPC_B64) {
// Assume we're swapping back to the main shader.
break;
}
TranslateInstruction(sub_inst, sub_pc++);
TranslateInstruction(sub_inst);
}
return;
}
@@ -574,11 +573,12 @@ void Translator::LogMissingOpcode(const GcnInst& inst) {
info.translation_failed = true;
}
void Translator::Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list) {
void Translator::Translate(IR::Block* block, u32 start_pc, std::span<const GcnInst> inst_list) {
if (inst_list.empty()) {
return;
}
ir = IR::IREmitter{*block, block->begin()};
pc = start_pc;
for (const auto& inst : inst_list) {
pc += inst.length;
@@ -590,11 +590,11 @@ void Translator::Translate(IR::Block* block, u32 pc, std::span<const GcnInst> in
continue;
}
TranslateInstruction(inst, pc);
TranslateInstruction(inst);
}
}
void Translator::TranslateInstruction(const GcnInst& inst, const u32 pc) {
void Translator::TranslateInstruction(const GcnInst& inst) {
// Emit instructions for each category.
switch (inst.category) {
case InstCategory::DataShare:
@@ -613,7 +613,7 @@ void Translator::TranslateInstruction(const GcnInst& inst, const u32 pc) {
EmitExport(inst);
break;
case InstCategory::FlowControl:
EmitFlowControl(pc, inst);
EmitFlowControl(inst);
break;
case InstCategory::ScalarALU:
EmitScalarAlu(inst);


@@ -61,13 +61,13 @@ public:
explicit Translator(Info& info, const RuntimeInfo& runtime_info, const Profile& profile);
void Translate(IR::Block* block, u32 pc, std::span<const GcnInst> inst_list);
void TranslateInstruction(const GcnInst& inst, u32 pc);
void TranslateInstruction(const GcnInst& inst);
// Instruction categories
void EmitPrologue(IR::Block* first_block);
void EmitFetch(const GcnInst& inst);
void EmitExport(const GcnInst& inst);
void EmitFlowControl(u32 pc, const GcnInst& inst);
void EmitFlowControl(const GcnInst& inst);
void EmitScalarAlu(const GcnInst& inst);
void EmitScalarMemory(const GcnInst& inst);
void EmitVectorAlu(const GcnInst& inst);
@@ -126,7 +126,7 @@ public:
void S_FLBIT_I32_B32(const GcnInst& inst);
void S_FLBIT_I32_B64(const GcnInst& inst);
void S_BITSET_B32(const GcnInst& inst, u32 bit_value);
void S_GETPC_B64(u32 pc, const GcnInst& inst);
void S_GETPC_B64(const GcnInst& inst);
void S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst);
void S_ABS_I32(const GcnInst& inst);
@@ -337,6 +337,7 @@ private:
std::unordered_map<u32, IR::VectorReg> vgpr_map;
std::array<IR::Attribute, MaxInterpVgpr> vgpr_to_interp{};
bool opcode_missing = false;
u32 pc{};
};
} // namespace Shader::Gcn


@@ -588,7 +588,7 @@ void Translator::IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst) {
IR::Value EmitImageSample(IR::IREmitter& ir, const GcnInst& inst, const IR::ScalarReg tsharp_reg,
const IR::ScalarReg sampler_reg, const IR::VectorReg addr_reg,
bool gather) {
bool gather, u32 pc) {
const auto& mimg = inst.control.mimg;
const auto flags = MimgModifierFlags(mimg.mod);
@@ -602,6 +602,7 @@ IR::Value EmitImageSample(IR::IREmitter& ir, const GcnInst& inst, const IR::Scal
info.is_array.Assign(mimg.da);
info.is_unnormalized.Assign(mimg.unrm);
info.is_r128.Assign(mimg.r128);
info.pc.Assign(pc);
if (gather) {
info.gather_comp.Assign(std::bit_width(mimg.dmask) - 1);
@@ -610,11 +611,11 @@ IR::Value EmitImageSample(IR::IREmitter& ir, const GcnInst& inst, const IR::Scal
info.has_derivatives.Assign(flags.test(MimgModifier::Derivative));
}
// Load first dword of T# and S#. We will use them as the handle that will guide resource
// tracking pass where to read the sharps. This will later also get patched to the SPIRV texture
// binding index.
const IR::Value handle = ir.GetScalarReg(tsharp_reg);
const IR::Value inline_sampler =
// Load first dword of T# and the full S#. We will use them as the handle that will guide
// resource tracking pass where to read the sharps. This will later also get patched to the
// backend texture binding index.
const IR::Value image_handle = ir.GetScalarReg(tsharp_reg);
const IR::Value sampler_handle =
ir.CompositeConstruct(ir.GetScalarReg(sampler_reg), ir.GetScalarReg(sampler_reg + 1),
ir.GetScalarReg(sampler_reg + 2), ir.GetScalarReg(sampler_reg + 3));
@@ -652,8 +653,8 @@ IR::Value EmitImageSample(IR::IREmitter& ir, const GcnInst& inst, const IR::Scal
const IR::Value address4 = get_addr_reg(12);
// Issue the placeholder IR instruction.
IR::Value texel =
ir.ImageSampleRaw(handle, address1, address2, address3, address4, inline_sampler, info);
IR::Value texel = ir.ImageSampleRaw(image_handle, sampler_handle, address1, address2, address3,
address4, info);
if (info.is_depth && !gather) {
// For non-gather depth sampling, only return a single value.
texel = ir.CompositeExtract(texel, 0);
@@ -669,7 +670,7 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) {
const IR::ScalarReg sampler_reg{inst.src[3].code * 4};
const auto flags = MimgModifierFlags(mimg.mod);
const IR::Value texel = EmitImageSample(ir, inst, tsharp_reg, sampler_reg, addr_reg, false);
const IR::Value texel = EmitImageSample(ir, inst, tsharp_reg, sampler_reg, addr_reg, false, pc);
for (u32 i = 0; i < 4; i++) {
if (((mimg.dmask >> i) & 1) == 0) {
continue;
@@ -698,7 +699,7 @@ void Translator::IMAGE_GATHER(const GcnInst& inst) {
// should be always 1st (R) component for depth
ASSERT(!flags.test(MimgModifier::Pcf) || mimg.dmask & 1);
const IR::Value texel = EmitImageSample(ir, inst, tsharp_reg, sampler_reg, addr_reg, true);
const IR::Value texel = EmitImageSample(ir, inst, tsharp_reg, sampler_reg, addr_reg, true, pc);
for (u32 i = 0; i < 4; i++) {
const IR::F32 value = IR::F32{ir.CompositeExtract(texel, i)};
ir.SetVectorReg(dest_reg++, value);


@@ -4,7 +4,6 @@
#pragma once
#include <span>
#include <variant>
#include <vector>
#include <boost/container/small_vector.hpp>
#include <boost/container/static_vector.hpp>
@@ -93,15 +92,12 @@ struct ImageResource {
using ImageResourceList = boost::container::small_vector<ImageResource, NumImages>;
struct SamplerResource {
std::variant<u32, AmdGpu::Sampler> sampler;
u32 sharp_idx;
AmdGpu::Sampler inline_sampler;
u32 is_inline_sampler : 1;
u32 associated_image : 4;
u32 disable_aniso : 1;
SamplerResource(u32 sharp_idx, u32 associated_image_, bool disable_aniso_)
: sampler{sharp_idx}, associated_image{associated_image_}, disable_aniso{disable_aniso_} {}
SamplerResource(AmdGpu::Sampler sampler_)
: sampler{sampler_}, associated_image{0}, disable_aniso(0) {}
constexpr AmdGpu::Sampler GetSharp(const Info& info) const noexcept;
};
using SamplerResourceList = boost::container::small_vector<SamplerResource, NumSamplers>;
@@ -312,20 +308,24 @@ struct Info {
DECLARE_ENUM_FLAG_OPERATORS(Info::ReadConstType);
constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept {
return inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
const auto buffer = inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
if (!buffer.Valid()) {
LOG_DEBUG(Render, "Encountered invalid buffer sharp");
return AmdGpu::Buffer::Null();
}
return buffer;
}
constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept {
AmdGpu::Image image{0};
AmdGpu::Image image{};
if (!is_r128) {
image = info.ReadUdSharp<AmdGpu::Image>(sharp_idx);
} else {
const auto buf = info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
memcpy(&image, &buf, sizeof(buf));
const auto raw = info.ReadUdSharp<u128>(sharp_idx);
std::memcpy(&image, &raw, sizeof(raw));
}
if (!image.Valid()) {
// Fall back to null image if unbound.
LOG_DEBUG(Render_Vulkan, "Encountered unbound image!");
LOG_DEBUG(Render_Vulkan, "Encountered invalid image sharp");
image = is_depth ? AmdGpu::Image::NullDepth() : AmdGpu::Image::Null();
} else if (is_depth) {
const auto data_fmt = image.GetDataFmt();
@@ -338,9 +338,7 @@ constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept
}
constexpr AmdGpu::Sampler SamplerResource::GetSharp(const Info& info) const noexcept {
return std::holds_alternative<AmdGpu::Sampler>(sampler)
? std::get<AmdGpu::Sampler>(sampler)
: info.ReadUdSharp<AmdGpu::Sampler>(std::get<u32>(sampler));
return is_inline_sampler ? inline_sampler : info.ReadUdSharp<AmdGpu::Sampler>(sharp_idx);
}
constexpr AmdGpu::Image FMaskResource::GetSharp(const Info& info) const noexcept {


@@ -123,8 +123,8 @@ std::string DumpBlock(const Block& block, const std::map<const Block*, size_t>&
ret += fmt::format(" {}", op); // '%00000 = ' -> 1 + 5 + 3 = 9 spaces
}
if (op == Opcode::ReadConst) {
ret += fmt::format(" (flags={}) ", inst.Flags<u32>());
if (op == Opcode::ReadConst || op == Opcode::ImageSampleRaw) {
ret += fmt::format(" (flags={:#x}) ", inst.Flags<u32>());
}
const size_t arg_count{inst.NumArgs()};
for (size_t arg_index = 0; arg_index < arg_count; ++arg_index) {


@@ -14,6 +14,10 @@
#include "shader_recompiler/ir/reg.h"
#include "shader_recompiler/ir/value.h"
namespace Shader::Gcn {
struct Block;
}
namespace Shader::IR {
class Block {
@@ -150,6 +154,10 @@ public:
std::array<Value, NumScalarRegs> ssa_sbit_values;
std::array<Value, NumVectorRegs> ssa_vreg_values;
/// Block of the CFG that corresponds to this IR block.
/// It can be null as IR has additional control flow blocks.
const Shader::Gcn::Block* cfg_block{};
private:
/// Memory pool for instruction list
Common::ObjectPool<Inst>* inst_pool;


@@ -2105,11 +2105,11 @@ Value IREmitter::ImageAtomicExchange(const Value& handle, const Value& coords, c
return Inst(Opcode::ImageAtomicExchange32, Flags{info}, handle, coords, value);
}
Value IREmitter::ImageSampleRaw(const Value& handle, const Value& address1, const Value& address2,
const Value& address3, const Value& address4,
const Value& inline_sampler, TextureInstInfo info) {
return Inst(Opcode::ImageSampleRaw, Flags{info}, handle, address1, address2, address3, address4,
inline_sampler);
Value IREmitter::ImageSampleRaw(const Value& image_handle, const Value& sampler_handle,
const Value& address1, const Value& address2, const Value& address3,
const Value& address4, TextureInstInfo info) {
return Inst(Opcode::ImageSampleRaw, Flags{info}, image_handle, sampler_handle, address1,
address2, address3, address4);
}
Value IREmitter::ImageSampleImplicitLod(const Value& handle, const Value& coords, const F32& bias,


@@ -359,9 +359,9 @@ public:
[[nodiscard]] Value ImageAtomicExchange(const Value& handle, const Value& coords,
const Value& value, TextureInstInfo info);
[[nodiscard]] Value ImageSampleRaw(const Value& handle, const Value& address1,
const Value& address2, const Value& address3,
const Value& address4, const Value& inline_sampler,
[[nodiscard]] Value ImageSampleRaw(const Value& image_handle, const Value& sampler_handle,
const Value& address1, const Value& address2,
const Value& address3, const Value& address4,
TextureInstInfo info);
[[nodiscard]] Value ImageSampleImplicitLod(const Value& handle, const Value& body,


@@ -436,7 +436,7 @@ OPCODE(ConvertS32S8, U32, U8,
OPCODE(ConvertS32S16, U32, U16, )
// Image operations
OPCODE(ImageSampleRaw, F32x4, Opaque, F32x4, F32x4, F32x4, F32, Opaque, )
OPCODE(ImageSampleRaw, F32x4, Opaque, Opaque, F32x4, F32x4, F32x4, F32, )
OPCODE(ImageSampleImplicitLod, F32x4, Opaque, F32x4, F32, Opaque, )
OPCODE(ImageSampleExplicitLod, F32x4, Opaque, Opaque, F32, Opaque, )
OPCODE(ImageSampleDrefImplicitLod, F32x4, Opaque, Opaque, F32, F32, Opaque, )
@@ -445,7 +445,7 @@ OPCODE(ImageGather, F32x4, Opaq
OPCODE(ImageGatherDref, F32x4, Opaque, Opaque, Opaque, F32, )
OPCODE(ImageQueryDimensions, U32x4, Opaque, U32, U1, )
OPCODE(ImageQueryLod, F32x4, Opaque, Opaque, )
OPCODE(ImageGradient, F32x4, Opaque, Opaque, Opaque, Opaque, Opaque, F32, )
OPCODE(ImageGradient, F32x4, Opaque, Opaque, Opaque, Opaque, Opaque, F32, )
OPCODE(ImageRead, F32x4, Opaque, Opaque, U32, U32, )
OPCODE(ImageWrite, Void, Opaque, Opaque, U32, U32, F32x4, )


@@ -204,6 +204,18 @@ void FoldInverseFunc(IR::Inst& inst, IR::Opcode reverse) {
}
}
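// Fold DiscardCond when its condition is a compile-time constant: an
// always-true condition becomes an unconditional Discard, while an
// always-false condition removes the discard entirely.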
void FoldDiscardCond(IR::Inst& inst) {
const IR::U1 cond{inst.Arg(0)};
if (!cond.IsImmediate()) {
return;
}
if (cond.U1()) {
inst.ReplaceOpcode(IR::Opcode::Discard);
} else {
inst.Invalidate();
}
}
template <typename T>
void FoldAdd(IR::Block& block, IR::Inst& inst) {
if (!FoldCommutative<T>(inst, [](T a, T b) { return a + b; })) {
@@ -505,6 +517,8 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
return FoldConvert(inst, IR::Opcode::ConvertF16F32);
case IR::Opcode::ConvertF16F32:
return FoldConvert(inst, IR::Opcode::ConvertF32F16);
case IR::Opcode::DiscardCond:
return FoldDiscardCond(inst);
default:
break;
}


@@ -78,10 +78,20 @@ static IR::Value GetRealValue(PhiMap& phi_map, IR::Inst* inst, u32 lane) {
it->second = new_phi;
// Gather all arguments.
boost::container::static_vector<IR::Value, 5> phi_args;
for (size_t arg_index = 0; arg_index < inst->NumArgs(); arg_index++) {
IR::Inst* arg_prod = inst->Arg(arg_index).InstRecursive();
const IR::Value arg = GetRealValue(phi_map, arg_prod, lane);
new_phi->AddPhiOperand(inst->PhiBlock(arg_index), arg);
phi_args.push_back(arg);
}
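// If every incoming value resolves to the same definition, the phi is
// redundant; forward that value instead of keeping the phi.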
const IR::Value arg0 = phi_args[0].Resolve();
if (std::ranges::all_of(phi_args,
[&](const IR::Value& arg) { return arg.Resolve() == arg0; })) {
new_phi->ReplaceUsesWith(arg0);
} else {
for (size_t arg_index = 0; arg_index < inst->NumArgs(); arg_index++) {
new_phi->AddPhiOperand(inst->PhiBlock(arg_index), phi_args[arg_index]);
}
}
return IR::Value{new_phi};
}


@@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/frontend/control_flow_graph.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/breadth_first_search.h"
@@ -259,7 +260,9 @@ public:
u32 Add(const SamplerResource& desc) {
const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) {
return desc.sampler == existing.sampler;
return desc.sharp_idx == existing.sharp_idx &&
desc.is_inline_sampler == existing.is_inline_sampler &&
desc.inline_sampler == existing.inline_sampler;
})};
return index;
}
@@ -313,11 +316,24 @@ std::pair<const IR::Inst*, bool> TryDisableAnisoLod0(const IR::Inst* inst) {
return not_found;
}
// The bitfield extract might sometimes be hidden behind a phi
auto* prod0_arg0 = prod0->Arg(0).InstRecursive();
if (prod0_arg0->GetOpcode() == IR::Opcode::Phi) {
auto arg0 = prod0_arg0->Arg(0);
auto arg1 = prod0_arg0->Arg(1);
if (!arg0.IsImmediate() &&
arg0.InstRecursive()->GetOpcode() == IR::Opcode::BitFieldUExtract) {
prod0_arg0 = arg0.InstRecursive();
} else if (!arg1.IsImmediate() &&
arg1.InstRecursive()->GetOpcode() == IR::Opcode::BitFieldUExtract) {
prod0_arg0 = arg1.InstRecursive();
}
}
// The bit range is for lods (note that constants may have changed after the constant propagation pass)
const auto* prod0_arg0 = prod0->Arg(0).InstRecursive();
if (prod0_arg0->GetOpcode() != IR::Opcode::BitFieldUExtract ||
!(prod0_arg0->Arg(1).IsIdentity() && prod0_arg0->Arg(1).U32() == 12) ||
!(prod0_arg0->Arg(2).IsIdentity() && prod0_arg0->Arg(2).U32() == 8)) {
!(prod0_arg0->Arg(1).IsImmediate() && prod0_arg0->Arg(1).U32() == 12) ||
!(prod0_arg0->Arg(2).IsImmediate() && prod0_arg0->Arg(2).U32() == 8)) {
return not_found;
}
@@ -330,102 +346,170 @@ std::pair<const IR::Inst*, bool> TryDisableAnisoLod0(const IR::Inst* inst) {
// We're working on the first dword of s#
const auto* prod2 = inst->Arg(2).InstRecursive();
if (prod2->GetOpcode() != IR::Opcode::GetUserData &&
prod2->GetOpcode() != IR::Opcode::ReadConst) {
prod2->GetOpcode() != IR::Opcode::ReadConst && prod2->GetOpcode() != IR::Opcode::Phi) {
return not_found;
}
return {prod2, true};
}
SharpLocation AttemptTrackSharp(const IR::Inst* inst, auto& visited_insts) {
// Search until we find a potential sharp source.
const auto pred = [&visited_insts](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
if (std::ranges::find(visited_insts, inst) != visited_insts.end()) {
return std::nullopt;
using SharpSources = boost::container::small_vector<const IR::Inst*, 4>;
bool IsSharpSource(const IR::Inst* inst) {
return inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::ReadConst;
}
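// Breadth-first walk over the operand/phi graph of the handle, collecting
// every GetUserData/ReadConst instruction that can produce the sharp.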
SharpSources FindSharpSources(const IR::Inst* handle, u32 pc) {
SharpSources sources;
if (IsSharpSource(handle)) {
sources.push_back(handle);
return sources;
}
bool found_read_const_buffer = false;
boost::container::small_vector<const IR::Inst*, 8> visited;
std::queue<const IR::Inst*> queue;
queue.push(handle);
while (!queue.empty()) {
const IR::Inst* inst{queue.front()};
queue.pop();
if (IsSharpSource(inst)) {
sources.push_back(inst);
continue;
}
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
found_read_const_buffer |= inst->GetOpcode() == IR::Opcode::ReadConstBuffer;
if (inst->GetOpcode() != IR::Opcode::Phi) {
continue;
}
return std::nullopt;
};
const auto result = IR::BreadthFirstSearch(inst, pred);
ASSERT_MSG(result, "Unable to track sharp source");
inst = result.value();
visited_insts.emplace_back(inst);
for (size_t arg = inst->NumArgs(); arg--;) {
const IR::Value arg_value = inst->Arg(arg);
if (arg_value.IsImmediate()) {
continue;
}
const IR::Inst* arg_inst = arg_value.InstRecursive();
if (std::ranges::find(visited, arg_inst) == visited.end()) {
visited.push_back(arg_inst);
queue.push(arg_inst);
}
}
}
if (sources.empty()) {
if (found_read_const_buffer) {
UNREACHABLE_MSG("Bindless sharp access detected pc={:#x}", pc);
} else {
UNREACHABLE_MSG("Unable to find sharp sources pc={:#x}", pc);
}
}
return sources;
}
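// Returns true if every CFG path from `block` to `dest_block` passes through
// `maybe_dominator`: the search refuses to expand past `maybe_dominator`, so
// reaching `dest_block` anyway means some path avoids it.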
bool IsCfgBlockDominatedBy(const Shader::Gcn::Block* maybe_dominator,
const Shader::Gcn::Block* block, const Shader::Gcn::Block* dest_block) {
if (block == maybe_dominator) {
return true;
}
boost::container::small_vector<const Shader::Gcn::Block*, 8> visited;
std::queue<const Shader::Gcn::Block*> queue;
queue.push(block);
while (!queue.empty()) {
const Shader::Gcn::Block* block{queue.front()};
queue.pop();
if (block == dest_block) {
return false;
}
if (block == maybe_dominator) {
continue;
}
if (block->branch_false && !std::ranges::contains(visited, block->branch_false)) {
visited.push_back(block->branch_false);
queue.push(block->branch_false);
}
if (block->branch_true && !std::ranges::contains(visited, block->branch_true)) {
visited.push_back(block->branch_true);
queue.push(block->branch_true);
}
}
return true;
}
SharpLocation SharpLocationFromSource(const IR::Inst* inst) {
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return static_cast<u32>(inst->Arg(0).ScalarReg());
return static_cast<SharpLocation>(inst->Arg(0).ScalarReg());
} else {
ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst,
"Sharp load not from constant memory");
return inst->Flags<u32>();
}
}
/// Tracks a sharp with validation of the chosen data type.
template <typename DataType>
std::pair<SharpLocation, DataType> TrackSharp(const IR::Inst* inst, const Info& info) {
boost::container::small_vector<const IR::Inst*, 4> visited_insts{};
while (true) {
const auto prev_size = visited_insts.size();
const auto sharp = AttemptTrackSharp(inst, visited_insts);
if (const auto data = info.ReadUdSharp<DataType>(sharp); data.Valid()) {
return std::make_pair(sharp, data);
SharpLocation TrackSharp(const IR::Inst* inst, const IR::Block& current_parent, u32 pc = 0) {
auto sources = FindSharpSources(inst, pc);
size_t num_sources = sources.size();
ASSERT(current_parent.cfg_block);
// Perform dominance analysis on the found sources and eliminate redundant ones.
// If every path from one source to the consuming block passes through another
// source, the former is always overwritten and can be dropped.
for (s32 i = 0; i < num_sources;) {
const IR::Block* block = sources[i]->GetParent();
ASSERT(block->cfg_block);
bool was_removed = false;
for (s32 j = 0; j < num_sources;) {
const IR::Block* dominator = sources[j]->GetParent();
ASSERT(dominator->cfg_block);
if (i != j && IsCfgBlockDominatedBy(dominator->cfg_block, block->cfg_block,
current_parent.cfg_block)) {
std::swap(sources[i], sources[num_sources - 1]);
--num_sources;
sources.pop_back();
was_removed = true;
break;
} else {
++j;
}
}
if (prev_size == visited_insts.size()) {
// No change in visited instructions, we've run out of paths.
UNREACHABLE_MSG("Unable to find valid sharp.");
if (!was_removed) {
++i;
}
}
}
/// Tracks a sharp without data validation.
SharpLocation TrackSharp(const IR::Inst* inst, const Info& info) {
boost::container::static_vector<const IR::Inst*, 1> visited_insts{};
return AttemptTrackSharp(inst, visited_insts);
}
s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
AmdGpu::Buffer& cbuf) {
// Assuming V# is in UD s[32:35]
// The next pattern:
// s_getpc_b64 s[32:33]
// s_add_u32 s32, <const>, s32
// s_addc_u32 s33, 0, s33
// s_mov_b32 s35, <const>
// s_movk_i32 s34, <const>
// buffer_load_format_xyz v[8:10], v1, s[32:35], 0 ...
// is used to define an inline constant buffer
IR::Inst* handle = inst.Arg(0).InstRecursive();
if (!handle->AreAllArgsImmediates()) {
return -1;
}
// We have found this pattern. Build the sharp.
std::array<u64, 2> buffer;
buffer[0] = info.pgm_base + (handle->Arg(0).U32() | u64(handle->Arg(1).U32()) << 32);
buffer[1] = handle->Arg(2).U32() | u64(handle->Arg(3).U32()) << 32;
cbuf = std::bit_cast<AmdGpu::Buffer>(buffer);
// Assign a binding to this sharp.
return descriptors.Add(BufferResource{
.sharp_idx = std::numeric_limits<u32>::max(),
.used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
.inline_cbuf = cbuf,
.buffer_type = BufferType::Guest,
});
ASSERT_MSG(sources.size() == 1, "Unable to deduce sharp source");
return SharpLocationFromSource(sources[0]);
}
void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
s32 binding{};
AmdGpu::Buffer buffer;
if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) {
IR::Inst* handle = inst.Arg(0).InstRecursive();
IR::Inst* producer = handle->Arg(0).InstRecursive();
SharpLocation sharp;
std::tie(sharp, buffer) = TrackSharp<AmdGpu::Buffer>(producer, info);
binding = descriptors.Add(BufferResource{
.sharp_idx = sharp,
IR::Inst* handle = inst.Arg(0).InstRecursive();
u32 buffer_binding = 0;
if (handle->AreAllArgsImmediates()) {
// Assuming V# is in UD s[32:35]
// The next pattern:
// s_getpc_b64 s[32:33]
// s_add_u32 s32, <const>, s32
// s_addc_u32 s33, 0, s33
// s_mov_b32 s35, <const>
// s_movk_i32 s34, <const>
// buffer_load_format_xyz v[8:10], v1, s[32:35], 0 ...
// is used to define an inline buffer resource
std::array<u64, 2> raw;
raw[0] = info.pgm_base + (handle->Arg(0).U32() | u64(handle->Arg(1).U32()) << 32);
raw[1] = handle->Arg(2).U32() | u64(handle->Arg(3).U32()) << 32;
const auto buffer = std::bit_cast<AmdGpu::Buffer>(raw);
buffer_binding = descriptors.Add(BufferResource{
.sharp_idx = std::numeric_limits<u32>::max(),
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
.inline_cbuf = buffer,
.buffer_type = BufferType::Guest,
});
} else {
// Normal buffer resource.
IR::Inst* buffer_handle = handle->Arg(0).InstRecursive();
const auto sharp_idx = TrackSharp(buffer_handle, block);
const auto buffer = info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
buffer_binding = descriptors.Add(BufferResource{
.sharp_idx = sharp_idx,
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
.buffer_type = BufferType::Guest,
.is_written = IsBufferStore(inst),
@@ -436,25 +520,14 @@ void PatchBufferSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
// Replace handle with binding index in buffer resource list.
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
inst.SetArg(0, ir.Imm32(binding));
inst.SetArg(0, ir.Imm32(buffer_binding));
}
void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
const auto opcode = inst->GetOpcode();
if (opcode == IR::Opcode::ReadConst || // IMAGE_LOAD (image only)
opcode == IR::Opcode::GetUserData) {
return inst;
}
return std::nullopt;
};
const auto result = IR::BreadthFirstSearch(&inst, pred);
ASSERT_MSG(result, "Unable to find image sharp source");
const IR::Inst* tsharp_handle = result.value();
// Read image sharp.
const auto tsharp = TrackSharp(tsharp_handle, info);
const auto inst_info = inst.Flags<IR::TextureInstInfo>();
const IR::Inst* image_handle = inst.Arg(0).InstRecursive();
const auto tsharp = TrackSharp(image_handle, block, inst_info.pc);
const bool is_atomic = IsImageAtomicInstruction(inst);
const bool is_written = inst.GetOpcode() == IR::Opcode::ImageWrite || is_atomic;
const ImageResource image_res = {
@@ -506,38 +579,34 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors&
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
if (inst.GetOpcode() == IR::Opcode::ImageSampleRaw) {
// Read sampler sharp.
const auto sampler_binding = [&] -> u32 {
const auto sampler = inst.Arg(5).InstRecursive();
ASSERT(sampler && sampler->GetOpcode() == IR::Opcode::CompositeConstructU32x4);
const auto handle = sampler->Arg(0);
// Inline sampler resource.
if (handle.IsImmediate()) {
LOG_DEBUG(Render_Vulkan, "Inline sampler detected");
const auto [s1, s2, s3, s4] =
std::tuple{sampler->Arg(0), sampler->Arg(1), sampler->Arg(2), sampler->Arg(3)};
ASSERT(s1.IsImmediate() && s2.IsImmediate() && s3.IsImmediate() &&
s4.IsImmediate());
const auto inline_sampler = AmdGpu::Sampler{
.raw0 = u64(s2.U32()) << 32 | u64(s1.U32()),
.raw1 = u64(s4.U32()) << 32 | u64(s3.U32()),
};
const auto binding = descriptors.Add(SamplerResource{inline_sampler});
return binding;
} else {
// Normal sampler resource.
const auto ssharp_handle = handle.InstRecursive();
const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle);
const auto ssharp = TrackSharp(ssharp_ud, info);
const auto binding =
descriptors.Add(SamplerResource{ssharp, image_binding, disable_aniso});
return binding;
}
}();
// Patch image and sampler handle.
u32 sampler_binding = 0;
const IR::Inst* sampler = inst.Arg(1).InstRecursive();
ASSERT(sampler && sampler->GetOpcode() == IR::Opcode::CompositeConstructU32x4);
// Inline sampler resource.
if (sampler->AreAllArgsImmediates()) {
const auto inline_sampler = AmdGpu::Sampler{
.raw0 = u64(sampler->Arg(1).U32()) << 32 | u64(sampler->Arg(0).U32()),
.raw1 = u64(sampler->Arg(3).U32()) << 32 | u64(sampler->Arg(2).U32()),
};
sampler_binding = descriptors.Add(SamplerResource{
.sharp_idx = std::numeric_limits<u32>::max(),
.inline_sampler = inline_sampler,
.is_inline_sampler = true,
});
} else {
// Normal sampler resource.
const auto& [sampler_handle, disable_aniso] =
TryDisableAnisoLod0(sampler->Arg(0).InstRecursive());
const auto ssharp = TrackSharp(sampler_handle, block, inst_info.pc);
sampler_binding = descriptors.Add(SamplerResource{
.sharp_idx = ssharp,
.is_inline_sampler = false,
.associated_image = image_binding,
.disable_aniso = disable_aniso,
});
}
inst.SetArg(0, ir.Imm32(image_binding | sampler_binding << 16));
} else {
// Patch image handle.
inst.SetArg(0, ir.Imm32(image_binding));
}
}
@@ -768,10 +837,10 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info,
const auto inst_info = inst.Flags<IR::TextureInstInfo>();
const auto view_type = image.GetViewType(image_res.is_array);
IR::Inst* body1 = inst.Arg(1).InstRecursive();
IR::Inst* body2 = inst.Arg(2).InstRecursive();
IR::Inst* body3 = inst.Arg(3).InstRecursive();
IR::F32 body4 = IR::F32{inst.Arg(4)};
IR::Inst* body1 = inst.Arg(2).InstRecursive();
IR::Inst* body2 = inst.Arg(3).InstRecursive();
IR::Inst* body3 = inst.Arg(4).InstRecursive();
IR::F32 body4 = IR::F32{inst.Arg(5)};
const auto get_addr_reg = [&](u32 index) -> IR::F32 {
if (index <= 3) {
return IR::F32{body1->Arg(index)};
@@ -942,14 +1011,13 @@ void PatchImageArgs(IR::Block& block, IR::Inst& inst, Info& info) {
return;
}
const auto handle = inst.Arg(0);
const auto image_res = info.images[handle.U32() & 0xFFFF];
const auto image_handle = inst.Arg(0);
const auto& image_res = info.images[image_handle.U32() & 0xFFFF];
auto image = image_res.GetSharp(info);
// Sample instructions must be handled separately using address register data.
if (inst.GetOpcode() == IR::Opcode::ImageSampleRaw) {
PatchImageSampleArgs(block, inst, info, image_res, image);
return;
return PatchImageSampleArgs(block, inst, info, image_res, image);
}
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
@@ -963,17 +1031,13 @@ void PatchImageArgs(IR::Block& block, IR::Inst& inst, Info& info) {
case AmdGpu::ImageType::Color1D: // x, [lod]
return {body->Arg(0), body->Arg(1)};
case AmdGpu::ImageType::Color1DArray: // x, slice, [lod]
[[fallthrough]];
case AmdGpu::ImageType::Color2D: // x, y, [lod]
[[fallthrough]];
case AmdGpu::ImageType::Color2DMsaa: // x, y. (sample is passed on different argument)
case AmdGpu::ImageType::Color2D: // x, y, [lod]
case AmdGpu::ImageType::Color2DMsaa: // x, y. (sample is passed on different argument)
return {ir.CompositeConstruct(body->Arg(0), body->Arg(1)), body->Arg(2)};
case AmdGpu::ImageType::Color2DArray: // x, y, slice, [lod]
[[fallthrough]];
case AmdGpu::ImageType::Color2DArray: // x, y, slice, [lod]
case AmdGpu::ImageType::Color2DMsaaArray: // x, y, slice. (sample is passed on different
// argument)
[[fallthrough]];
case AmdGpu::ImageType::Color3D: // x, y, z, [lod]
case AmdGpu::ImageType::Color3D: // x, y, z, [lod]
return {ir.CompositeConstruct(body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)};
default:
UNREACHABLE_MSG("Unknown image type {}", view_type);
@@ -988,7 +1052,7 @@ void PatchImageArgs(IR::Block& block, IR::Inst& inst, Info& info) {
const auto is_storage = image_res.is_written;
if (inst.GetOpcode() == IR::Opcode::ImageRead) {
auto texel = ir.ImageRead(handle, coords, lod, ms, inst_info);
auto texel = ir.ImageRead(image_handle, coords, lod, ms, inst_info);
if (is_storage) {
// Storage image requires shader swizzle.
texel = ApplySwizzle(ir, texel, image.DstSelect());


@@ -45,6 +45,7 @@ union TextureInstInfo {
BitField<10, 1, u32> is_unnormalized;
BitField<11, 1, u32> is_gather;
BitField<12, 1, u32> is_r128;
BitField<16, 16, u32> pc;
};
union BufferInstInfo {


@@ -486,6 +486,10 @@ struct Sampler {
return raw0 != 0 || raw1 != 0;
}
bool Valid() const {
return true;
}
bool operator==(const Sampler& other) const noexcept {
return std::memcmp(this, &other, sizeof(Sampler)) == 0;
}