diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92a30808c..aa1712d5e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -926,6 +926,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
                       src/shader_recompiler/ir/opcodes.cpp
                       src/shader_recompiler/ir/opcodes.h
                       src/shader_recompiler/ir/opcodes.inc
+                      src/shader_recompiler/ir/operand_helper.h
                       src/shader_recompiler/ir/patch.cpp
                       src/shader_recompiler/ir/patch.h
                       src/shader_recompiler/ir/position.h
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index 4b72b81ab..e7a7b3be6 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -179,6 +179,14 @@ void Translator::EmitPrologue(IR::Block* first_block) {
         // [8:12]: output control point id
         ir.SetVectorReg(IR::VectorReg::V1,
                         ir.GetAttributeU32(IR::Attribute::PackedHullInvocationInfo));
+
+        if (runtime_info.hs_info.offchip_lds_enable) {
+            // No off-chip tessellation has been observed yet. If this survives dead code
+            // elimination, revisit it.
+            ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::OffChipLdsBase));
+        }
+        ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::TessFactorsBufferBase));
+
         break;
     }
     case LogicalStage::TessellationEval:
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index 668821201..e0c64ff4a 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -202,39 +202,18 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
 void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
                              const GcnInst& inst, u32 scalar_width, bool is_signed) {
     const auto& mubuf = inst.control.mubuf;
-    const bool is_ring = mubuf.glc && mubuf.slc && info.l_stage != LogicalStage::Vertex &&
-                         info.l_stage != LogicalStage::Fragment;
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
-    const IR::Value soffset{GetSrc(inst.src[3])};
-    const bool has_soffset = !soffset.IsImmediate() || soffset.U32() != 0;
-    if (info.stage != Stage::Geometry) {
-        ASSERT_MSG(!has_soffset || !mubuf.offen,
-                   "Having both scalar and vector offsets is not supported");
-    }
-    const IR::Value address = [&] -> IR::Value {
-        if (is_ring) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
-        }
-        if (mubuf.idxen && mubuf.offen) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
-        }
-        if (mubuf.idxen && has_soffset) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
-        }
-        if (mubuf.idxen || mubuf.offen) {
-            return ir.GetVectorReg(vaddr);
-        }
-        if (has_soffset) {
-            return soffset;
-        }
-        return {};
-    }();
+    const IR::U32 index = mubuf.idxen ? ir.GetVectorReg(vaddr) : ir.Imm32(0);
+    const IR::VectorReg voffset_vgpr = mubuf.idxen ? vaddr + 1 : vaddr;
+    const IR::U32 voffset = mubuf.offen ? ir.GetVectorReg(voffset_vgpr) : ir.Imm32(0);
+    const IR::U32 soffset{GetSrc(inst.src[3])};
+    const IR::Value address = ir.CompositeConstruct(index, voffset, soffset);
 
     IR::BufferInstInfo buffer_info{};
     buffer_info.index_enable.Assign(mubuf.idxen);
-    buffer_info.offset_enable.Assign(mubuf.offen || has_soffset);
+    buffer_info.voffset_enable.Assign(mubuf.offen);
     buffer_info.inst_offset.Assign(mubuf.offset);
     buffer_info.globally_coherent.Assign(mubuf.glc);
     buffer_info.system_coherent.Assign(mubuf.slc);
@@ -290,35 +269,18 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_
 void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
                               const GcnInst& inst, u32 scalar_width) {
     const auto& mubuf = inst.control.mubuf;
-    const bool is_ring =
-        mubuf.glc && mubuf.slc && info.l_stage != LogicalStage::Fragment &&
-        info.stage !=
-            Stage::Vertex; // VS passes attributes down with EXPORT, VS HW stage is always present
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::ScalarReg sharp{inst.src[2].code * 4};
-    const IR::Value soffset{GetSrc(inst.src[3])};
-    if (info.stage != Stage::Export && info.stage != Stage::Hull && info.stage != Stage::Geometry) {
-        ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0,
-                   "Non immediate offset not supported");
-    }
-
-    IR::Value address = [&] -> IR::Value {
-        if (is_ring) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
-        }
-        if (mubuf.idxen && mubuf.offen) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
-        }
-        if (mubuf.idxen || mubuf.offen) {
-            return ir.GetVectorReg(vaddr);
-        }
-        return {};
-    }();
+    const IR::U32 index = mubuf.idxen ? ir.GetVectorReg(vaddr) : ir.Imm32(0);
+    const IR::VectorReg voffset_vgpr = mubuf.idxen ? vaddr + 1 : vaddr;
+    const IR::U32 voffset = mubuf.offen ? ir.GetVectorReg(voffset_vgpr) : ir.Imm32(0);
+    const IR::U32 soffset{GetSrc(inst.src[3])};
+    const IR::Value address = ir.CompositeConstruct(index, voffset, soffset);
 
     IR::BufferInstInfo buffer_info{};
     buffer_info.index_enable.Assign(mubuf.idxen);
-    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.voffset_enable.Assign(mubuf.offen);
     buffer_info.inst_offset.Assign(mubuf.offset);
     buffer_info.globally_coherent.Assign(mubuf.glc);
     buffer_info.system_coherent.Assign(mubuf.slc);
@@ -377,21 +339,15 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     const IR::VectorReg vaddr{inst.src[0].code};
     const IR::VectorReg vdata{inst.src[1].code};
     const IR::ScalarReg srsrc{inst.src[2].code * 4};
-    const IR::Value address = [&] -> IR::Value {
-        if (mubuf.idxen && mubuf.offen) {
-            return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
-        }
-        if (mubuf.idxen || mubuf.offen) {
-            return ir.GetVectorReg(vaddr);
-        }
-        return {};
-    }();
+    const IR::U32 index = mubuf.idxen ? ir.GetVectorReg(vaddr) : ir.Imm32(0);
+    const IR::VectorReg voffset_vgpr = mubuf.idxen ? vaddr + 1 : vaddr;
+    const IR::U32 voffset = mubuf.offen ? ir.GetVectorReg(voffset_vgpr) : ir.Imm32(0);
     const IR::U32 soffset{GetSrc(inst.src[3])};
-    ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
+    const IR::Value address = ir.CompositeConstruct(index, voffset, soffset);
 
     IR::BufferInstInfo buffer_info{};
     buffer_info.index_enable.Assign(mubuf.idxen);
-    buffer_info.offset_enable.Assign(mubuf.offen);
+    buffer_info.voffset_enable.Assign(mubuf.offen);
     buffer_info.inst_offset.Assign(mubuf.offset);
     buffer_info.globally_coherent.Assign(mubuf.glc);
     buffer_info.system_coherent.Assign(mubuf.slc);
diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp
index 094c34ee8..1572ef615 100644
--- a/src/shader_recompiler/ir/attribute.cpp
+++ b/src/shader_recompiler/ir/attribute.cpp
@@ -153,7 +153,11 @@ std::string NameOf(Attribute attribute) {
     case Attribute::TessellationEvaluationPointV:
         return "TessellationEvaluationPointV";
     case Attribute::PackedHullInvocationInfo:
         return "PackedHullInvocationInfo";
+    case Attribute::OffChipLdsBase:
+        return "OffChipLdsBase";
+    case Attribute::TessFactorsBufferBase:
+        return "TessFactorsBufferBase";
     default:
         break;
     }
diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h
index 58f28fb81..42466e5bb 100644
--- a/src/shader_recompiler/ir/attribute.h
+++ b/src/shader_recompiler/ir/attribute.h
@@ -85,6 +85,8 @@ enum class Attribute : u64 {
    TessellationEvaluationPointU = 88,
    TessellationEvaluationPointV = 89,
    PackedHullInvocationInfo = 90, // contains patch id within the VGT and invocation ID
+   OffChipLdsBase = 91,
+   TessFactorsBufferBase = 92,
    Max,
 };
 
diff --git a/src/shader_recompiler/ir/operand_helper.h b/src/shader_recompiler/ir/operand_helper.h
new file mode 100644
index 000000000..cce2f081d
--- /dev/null
+++ b/src/shader_recompiler/ir/operand_helper.h
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+// Helpers that name the operand indices of instructions, to make accessing them a bit safer.
+// Just a start; not widely used yet.
+
+#include "shader_recompiler/ir/value.h"
+
+namespace Shader::IR {
+
+// Namespaces are used here. Enums would be a better choice, but they annoyingly need casting
+// to size_t before they can be used as indices.
+
+namespace LoadBufferArgs {
+static const size_t Handle = 0;
+static const size_t Address = 1;
+}; // namespace LoadBufferArgs
+
+namespace StoreBufferArgs {
+static const size_t Handle = 0;
+static const size_t Address = 1;
+static const size_t Data = 2;
+}; // namespace StoreBufferArgs
+
+static_assert(LoadBufferArgs::Handle == StoreBufferArgs::Handle);
+static_assert(LoadBufferArgs::Address == StoreBufferArgs::Address);
+
+// Get certain components of the buffer address argument used in the Load/StoreBuffer variants.
+// The components are kept separate as a u32x3 until they are combined after sharp tracking.
+static inline IR::U32 GetBufferAddressComponent(const Inst* buffer_inst, u32 comp) {
+    Inst* address = buffer_inst->Arg(1).InstRecursive();
+    ASSERT(address->GetOpcode() == IR::Opcode::CompositeConstructU32x3);
+    return IR::U32{address->Arg(comp).Resolve()};
+}
+
+static inline U32 GetBufferIndexArg(const Inst* buffer_inst) {
+    return GetBufferAddressComponent(buffer_inst, 0);
+}
+
+static inline U32 GetBufferVOffsetArg(const Inst* buffer_inst) {
+    return GetBufferAddressComponent(buffer_inst, 1);
+}
+
+static inline U32 GetBufferSOffsetArg(const Inst* buffer_inst) {
+    return GetBufferAddressComponent(buffer_inst, 2);
+}
+
+} // namespace Shader::IR
\ No newline at end of file
diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp
index 156cb6628..2f8e1d7b1 100644
--- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp
+++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp
@@ -7,6 +7,7 @@
 #include "shader_recompiler/ir/breadth_first_search.h"
 #include "shader_recompiler/ir/ir_emitter.h"
 #include "shader_recompiler/ir/opcodes.h"
+#include "shader_recompiler/ir/operand_helper.h"
 #include "shader_recompiler/ir/passes/ir_passes.h"
 #include "shader_recompiler/ir/pattern_matching.h"
 #include "shader_recompiler/ir/program.h"
@@ -373,11 +374,27 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
         case IR::Opcode::StoreBufferU32x2:
         case IR::Opcode::StoreBufferU32x3:
         case IR::Opcode::StoreBufferU32x4: {
-            const auto info = inst.Flags<IR::BufferInstInfo>();
-            if (!info.globally_coherent) {
+            IR::Value soffset = IR::GetBufferSOffsetArg(&inst);
+            if (!M_GETATTRIBUTEU32(MatchAttribute(IR::Attribute::TessFactorsBufferBase),
+                                   MatchIgnore())
+                     .Match(soffset)) {
                 break;
             }
+
+            const auto info = inst.Flags<IR::BufferInstInfo>();
             IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+
+            IR::Value voffset;
+            bool success =
+                M_COMPOSITECONSTRUCTU32X3(MatchU32(0), MatchImm(voffset), MatchIgnore())
+                    .Match(inst.Arg(IR::StoreBufferArgs::Address));
+            ASSERT_MSG(success, "Unhandled pattern in tess factor store");
+
+            const u32 gcn_factor_idx = (info.inst_offset.Value() + voffset.U32()) >> 2;
+            const IR::Value data = inst.Arg(IR::StoreBufferArgs::Data);
+
+            const u32 num_dwords = u32(opcode) - u32(IR::Opcode::StoreBufferU32) + 1;
+
             const auto GetValue = [&](IR::Value data) -> IR::F32 {
                 if (auto* inst = data.TryInstRecursive();
                     inst && inst->GetOpcode() == IR::Opcode::BitCastU32F32) {
@@ -385,12 +402,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
                 }
                 return ir.BitCast(IR::U32{data});
             };
 
-            const u32 num_dwords = u32(opcode) - u32(IR::Opcode::StoreBufferU32) + 1;
-            IR::U32 index = IR::U32{inst.Arg(1)};
-            ASSERT(index.IsImmediate());
-            const u32 gcn_factor_idx = (info.inst_offset.Value() + index.U32()) >> 2;
-            const IR::Value data = inst.Arg(2);
             auto get_factor_attr = [&](u32 gcn_factor_idx) -> IR::Patch {
                 // The hull outputs tess factors in different formats depending on the shader.
                 // For triangle domains, it seems to pack the entries into 4 consecutive floats,
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 12d8d0e02..625c8676e 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -6,6 +6,7 @@
 #include "shader_recompiler/ir/basic_block.h"
 #include "shader_recompiler/ir/breadth_first_search.h"
 #include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/operand_helper.h"
 #include "shader_recompiler/ir/program.h"
 #include "shader_recompiler/ir/reinterpret.h"
 #include "video_core/amdgpu/resource.h"
@@ -740,22 +741,25 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
                                : buffer.GetDataFmt();
     const u32 shift = BufferAddressShift(inst, data_format);
     const u32 mask = (1 << shift) - 1;
+    const IR::U32 soffset = IR::GetBufferSOffsetArg(&inst);
 
     // If address calculation is of the form "index * const_stride + offset" with offset constant
     // and both const_stride and offset are divisible with the element size, apply shift directly.
-    if (inst_info.index_enable && !inst_info.offset_enable && !buffer.swizzle_enable &&
-        !buffer.add_tid_enable && (stride & mask) == 0 && (inst_offset & mask) == 0) {
-        // buffer_offset = index * (const_stride >> shift) + (inst_offset >> shift)
-        const IR::U32 index = IR::U32{inst.Arg(1)};
-        return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)), ir.Imm32(inst_offset >> shift));
+    if (inst_info.index_enable && !inst_info.voffset_enable && soffset.IsImmediate() &&
+        !buffer.swizzle_enable && !buffer.add_tid_enable && (stride & mask) == 0) {
+        const u32 total_offset = soffset.U32() + inst_offset;
+        if ((total_offset & mask) == 0) {
+            // buffer_offset = index * (const_stride >> shift) + (offset >> shift)
+            const IR::U32 index = IR::GetBufferIndexArg(&inst);
+            return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)),
+                           ir.Imm32(total_offset >> shift));
+        }
     }
 
     // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
     IR::U32 index = ir.Imm32(0U);
     if (inst_info.index_enable) {
-        const IR::U32 vgpr_index{inst_info.offset_enable
-                                     ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
-                                     : IR::U32{inst.Arg(1)}};
+        const IR::U32 vgpr_index = IR::GetBufferIndexArg(&inst);
         index = ir.IAdd(index, vgpr_index);
     }
     if (buffer.add_tid_enable) {
@@ -766,11 +770,10 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
     }
     // offset = (inst_offen ? vgpr_offset : 0) + inst_offset
     IR::U32 offset = ir.Imm32(inst_offset);
-    if (inst_info.offset_enable) {
-        const IR::U32 vgpr_offset = inst_info.index_enable
-                                        ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
-                                        : IR::U32{inst.Arg(1)};
-        offset = ir.IAdd(offset, vgpr_offset);
+    offset = ir.IAdd(offset, soffset);
+    if (inst_info.voffset_enable) {
+        const IR::U32 voffset = IR::GetBufferVOffsetArg(&inst);
+        offset = ir.IAdd(offset, voffset);
     }
     const IR::U32 const_stride = ir.Imm32(stride);
     IR::U32 buffer_offset;
@@ -815,7 +818,8 @@ void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
     }
 
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
-    inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
+    inst.SetArg(IR::LoadBufferArgs::Address,
+                CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
 }
 
 IR::Value FixCubeCoords(IR::IREmitter& ir, const AmdGpu::Image& image, const IR::Value& x,
diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
index 382031710..83416bfb8 100644
--- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
+++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
@@ -4,6 +4,7 @@
 #include "common/assert.h"
 #include "shader_recompiler/ir/ir_emitter.h"
 #include "shader_recompiler/ir/opcodes.h"
+#include "shader_recompiler/ir/operand_helper.h"
 #include "shader_recompiler/ir/position.h"
 #include "shader_recompiler/ir/program.h"
 #include "shader_recompiler/ir/reg.h"
@@ -113,10 +114,12 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim
                 break;
             }
 
-            const auto shl_inst = inst.Arg(1).TryInstRecursive();
-            const auto vertex_id = shl_inst->Arg(0).Resolve().U32() >> 2;
-            const auto offset = inst.Arg(1).TryInstRecursive()->Arg(1);
-            const auto bucket = offset.Resolve().U32() / 256u;
+            const auto vertex_id = (info.index_enable ? IR::GetBufferIndexArg(&inst)
+                                                      : IR::GetBufferVOffsetArg(&inst))
+                                       .U32() >>
+                                   2;
+            const auto soffset = IR::GetBufferSOffsetArg(&inst);
+            const auto bucket = soffset.Resolve().U32() / 256u;
             const auto attrib = bucket < 4 ? IR::Attribute::Position0
                                            : IR::Attribute::Param0 + (bucket / 4 - 1);
             const auto comp = bucket % 4;
diff --git a/src/shader_recompiler/ir/pattern_matching.h b/src/shader_recompiler/ir/pattern_matching.h
index 1279f14c3..de5b1cd8a 100644
--- a/src/shader_recompiler/ir/pattern_matching.h
+++ b/src/shader_recompiler/ir/pattern_matching.h
@@ -121,6 +121,8 @@ inline auto MakeInstPattern(Args&&... args) {
     MakeInstPattern(__VA_ARGS__)
 #define M_COMPOSITECONSTRUCTU32X2(...)                                                             \
     MakeInstPattern<IR::Opcode::CompositeConstructU32x2>(__VA_ARGS__)
+#define M_COMPOSITECONSTRUCTU32X3(...)                                                             \
+    MakeInstPattern<IR::Opcode::CompositeConstructU32x3>(__VA_ARGS__)
 #define M_COMPOSITECONSTRUCTU32X4(...)                                                             \
     MakeInstPattern<IR::Opcode::CompositeConstructU32x4>(__VA_ARGS__)
 
diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h
index 96c5b2dc7..ee71cfc4e 100644
--- a/src/shader_recompiler/ir/reg.h
+++ b/src/shader_recompiler/ir/reg.h
@@ -51,7 +51,7 @@ union TextureInstInfo {
 union BufferInstInfo {
     u32 raw;
     BitField<0, 1, u32> index_enable;
-    BitField<1, 1, u32> offset_enable;
+    BitField<1, 1, u32> voffset_enable;
     BitField<2, 12, u32> inst_offset;
     BitField<14, 1, u32> system_coherent;
     BitField<15, 1, u32> globally_coherent;
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 6e138888a..a204afd36 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -114,6 +114,7 @@ struct HullRuntimeInfo {
     u32 num_input_control_points;
     u32 num_threads;
     AmdGpu::TessellationType tess_type;
+    bool offchip_lds_enable;
 
     // from tess constants buffer
     u32 ls_stride;
diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h
index aaff3e31f..ad4801d84 100644
--- a/src/video_core/amdgpu/liverpool.h
+++ b/src/video_core/amdgpu/liverpool.h
@@ -118,6 +118,7 @@ struct Liverpool {
         u32 address_lo;
         BitField<0, 8, u32> address_hi;
         union {
+            // SPI_SHADER_PGM_RSRC1_XX
            BitField<0, 6, u64> num_vgprs;
            BitField<6, 4, u64> num_sgprs;
            BitField<10, 2, u64> priority;
@@ -127,7 +128,12 @@ struct Liverpool {
            BitField<18, 2, FpDenormMode> fp_denorm_mode64;
            BitField<12, 8, u64> float_mode;
            BitField<24, 2, u64> vgpr_comp_cnt; // SPI provided per-thread inputs
+            // SPI_SHADER_PGM_RSRC2_XX
+            BitField<32, 1, u64> scratch_en;
            BitField<33, 5, u64> num_user_regs;
+            union {
+                BitField<39, 1, u64> oc_lds_en;
+            } rsrc2_hs;
         } settings;
         UserData user_data;
 
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 1345efae7..bcb5062bb 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -112,6 +112,7 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
         info.hs_info.num_input_control_points = regs.ls_hs_config.hs_input_control_points.Value();
         info.hs_info.num_threads = regs.ls_hs_config.hs_output_control_points.Value();
         info.hs_info.tess_type = regs.tess_config.type;
+        info.hs_info.offchip_lds_enable = regs.hs_program.settings.rsrc2_hs.oc_lds_en.Value();
 
         // We need to initialize most hs_info fields after finding the V# with tess constants
         break;
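
Reviewer note, not part of the patch: a minimal sketch of how a pass is expected to consume the
new three-component buffer address after this change. It only uses the helpers introduced in
operand_helper.h above; the function name CheckImmediateSOffset and the surrounding file are
hypothetical and shown purely for illustration.

    #include "common/types.h"
    #include "shader_recompiler/ir/operand_helper.h"

    namespace Shader::Optimization {

    // Every BufferLoad/BufferStore/BufferAtomic address is now CompositeConstructU32x3(index,
    // voffset, soffset), so a pass can read a component by position instead of branching on
    // idxen/offen to figure out what inst.Arg(1) holds.
    // Returns true if the instruction's scalar offset resolves to the given immediate.
    inline bool CheckImmediateSOffset(const IR::Inst& inst, u32 expected) {
        const IR::U32 soffset = IR::GetBufferSOffsetArg(&inst);
        return soffset.IsImmediate() && soffset.U32() == expected;
    }

    } // namespace Shader::Optimization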