From c0878db548dd7a687f0882de04f44228f33ea2c1 Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Sun, 13 Jul 2025 13:16:27 +0300 Subject: [PATCH] shader_recompiler: Replace buffer pulling with attribute divisor for instance step rates --- .../spirv/emit_spirv_context_get_set.cpp | 102 +++++++----------- .../backend/spirv/emit_spirv_instructions.h | 2 +- .../backend/spirv/spirv_emit_context.cpp | 62 ++++------- .../backend/spirv/spirv_emit_context.h | 1 - src/shader_recompiler/frontend/fetch_shader.h | 12 --- .../frontend/translate/translate.cpp | 16 +-- src/shader_recompiler/info.h | 14 +-- src/shader_recompiler/ir/ir_emitter.cpp | 4 +- src/shader_recompiler/ir/ir_emitter.h | 3 +- .../ir/passes/ring_access_elimination.cpp | 2 +- src/shader_recompiler/runtime_info.h | 2 + src/shader_recompiler/specialization.h | 20 ++-- src/video_core/buffer_cache/buffer_cache.cpp | 5 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 43 +++++--- .../renderer_vulkan/vk_graphics_pipeline.h | 4 +- .../renderer_vulkan/vk_instance.cpp | 7 ++ src/video_core/renderer_vulkan/vk_instance.h | 6 ++ .../renderer_vulkan/vk_pipeline_cache.cpp | 6 +- .../renderer_vulkan/vk_rasterizer.cpp | 5 +- 19 files changed, 140 insertions(+), 176 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index f3a8c518c..c939f3524 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -120,6 +120,9 @@ std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr } } // Anonymous namespace +using PointerType = EmitContext::PointerType; +using PointerSize = EmitContext::PointerSize; + Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) { const u32 index = ctx.binding.user_data + ctx.info.ud_mask.Index(reg); const u32 half = PushData::UdRegsIndex + (index >> 2); @@ -131,41 +134,6 @@ Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) { return ud_reg; } -void EmitGetThreadBitScalarReg(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetThreadBitScalarReg(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetScalarRegister(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetScalarRegister(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetVectorRegister(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetVectorRegister(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetGotoVariable(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetGotoVariable(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -using PointerType = EmitContext::PointerType; -using PointerSize = EmitContext::PointerSize; - Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { const u32 flatbuf_off_dw = inst->Flags(); if (!Config::directMemoryAccess()) { @@ -201,18 +169,12 @@ Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { return ReadConstBuffer(ctx, handle, index); } -Id EmitReadStepRate(EmitContext& ctx, int rate_idx) { - const auto index{rate_idx == 0 ? PushData::Step0Index : PushData::Step1Index}; - return ctx.OpLoad( - ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]), - ctx.push_data_block, ctx.ConstU32(index))); -} - -static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { +static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { if (IR::IsPosition(attr)) { ASSERT(attr == IR::Attribute::Position0); const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, index, ctx.ConstU32(0u))}; + const auto pointer{ + ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, ctx.ConstU32(index), ctx.ConstU32(0u))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -222,7 +184,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)}; const auto param = ctx.input_params.at(param_id).id; const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)}; + const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, ctx.ConstU32(index))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -230,7 +192,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 UNREACHABLE(); } -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { if (ctx.info.l_stage == LogicalStage::Geometry) { return EmitGetAttributeForGeometry(ctx, attr, comp, index); } else if (ctx.info.l_stage == LogicalStage::TessellationControl || @@ -248,18 +210,6 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { if (IR::IsParam(attr)) { const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)}; const auto& param{ctx.input_params.at(param_index)}; - if (param.buffer_handle >= 0) { - const auto step_rate = EmitReadStepRate(ctx, param.id.value); - const auto offset = ctx.OpIAdd( - ctx.U32[1], - ctx.OpIMul( - ctx.U32[1], - ctx.OpUDiv(ctx.U32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id), step_rate), - ctx.ConstU32(param.num_components)), - ctx.ConstU32(comp)); - return ReadConstBuffer(ctx, param.buffer_handle, offset); - } - Id result; if (param.is_loaded) { // Attribute is either default or manually interpolated. The id points to an already @@ -305,10 +255,6 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { return ctx.OpLoad(ctx.U32[1], ctx.vertex_index); case IR::Attribute::InstanceId: return ctx.OpLoad(ctx.U32[1], ctx.instance_id); - case IR::Attribute::InstanceId0: - return EmitReadStepRate(ctx, 0); - case IR::Attribute::InstanceId1: - return EmitReadStepRate(ctx, 1); case IR::Attribute::WorkgroupIndex: return ctx.workgroup_index_id; case IR::Attribute::WorkgroupId: @@ -640,4 +586,36 @@ void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a UNREACHABLE_MSG("SPIR-V instruction"); } +void EmitGetThreadBitScalarReg(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetThreadBitScalarReg(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetScalarRegister(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetScalarRegister(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetVectorRegister(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetVectorRegister(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetGotoVariable(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetGotoVariable(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 74c94754d..37d5d84c9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -108,7 +108,7 @@ Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addres Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id cmp_value); -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index); +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp); Id EmitGetTessGenericAttribute(EmitContext& ctx, Id vertex_index, Id attr_index, Id comp_index); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 6a731d05c..852920ade 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -377,35 +377,13 @@ void EmitContext::DefineInputs() { ASSERT(attrib.semantic < IR::NumParams); const auto sharp = attrib.GetSharp(info); const Id type{GetAttributeType(*this, sharp.GetNumberFmt())[4]}; - if (attrib.UsesStepRates()) { - const u32 rate_idx = - attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::OverStepRate0 ? 0 - : 1; - const u32 num_components = AmdGpu::NumComponents(sharp.GetDataFmt()); - const auto buffer = - std::ranges::find_if(info.buffers, [&attrib](const auto& buffer) { - return buffer.instance_attrib == attrib.semantic; - }); - // Note that we pass index rather than Id - input_params[attrib.semantic] = SpirvAttribute{ - .id = {rate_idx}, - .pointer_type = input_u32, - .component_type = U32[1], - .num_components = std::min(attrib.num_elements, num_components), - .is_integer = true, - .is_loaded = false, - .buffer_handle = int(buffer - info.buffers.begin()), - }; + Id id{DefineInput(type, attrib.semantic)}; + if (attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::Plain) { + Name(id, fmt::format("vs_instance_attr{}", attrib.semantic)); } else { - Id id{DefineInput(type, attrib.semantic)}; - if (attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::Plain) { - Name(id, fmt::format("vs_instance_attr{}", attrib.semantic)); - } else { - Name(id, fmt::format("vs_in_attr{}", attrib.semantic)); - } - input_params[attrib.semantic] = - GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false); + Name(id, fmt::format("vs_in_attr{}", attrib.semantic)); } + input_params[attrib.semantic] = GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false); } break; } @@ -700,12 +678,10 @@ void EmitContext::DefineOutputs() { void EmitContext::DefinePushDataBlock() { // Create push constants block for instance steps rates - const Id struct_type{Name(TypeStruct(U32[1], U32[1], F32[1], F32[1], F32[1], F32[1], U32[4], - U32[4], U32[4], U32[4], U32[4], U32[4], U32[2]), + const Id struct_type{Name(TypeStruct(F32[1], F32[1], F32[1], F32[1], U32[4], U32[4], U32[4], + U32[4], U32[4], U32[4], U32[2]), "AuxData")}; Decorate(struct_type, spv::Decoration::Block); - MemberName(struct_type, PushData::Step0Index, "sr0"); - MemberName(struct_type, PushData::Step1Index, "sr1"); MemberName(struct_type, PushData::XOffsetIndex, "xoffset"); MemberName(struct_type, PushData::YOffsetIndex, "yoffset"); MemberName(struct_type, PushData::XScaleIndex, "xscale"); @@ -717,19 +693,17 @@ void EmitContext::DefinePushDataBlock() { MemberName(struct_type, PushData::BufOffsetIndex + 0, "buf_offsets0"); MemberName(struct_type, PushData::BufOffsetIndex + 1, "buf_offsets1"); MemberName(struct_type, PushData::BufOffsetIndex + 2, "buf_offsets2"); - MemberDecorate(struct_type, PushData::Step0Index, spv::Decoration::Offset, 0U); - MemberDecorate(struct_type, PushData::Step1Index, spv::Decoration::Offset, 4U); - MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 8U); - MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 12U); - MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 16U); - MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 20U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 24U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 40U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 56U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 72U); - MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 88U); - MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 104U); - MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 120U); + MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 0U); + MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 4U); + MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 8U); + MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 12U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 16U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 32U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 48U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 64U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 80U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 96U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 112U); push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant); Name(push_data_block, "push_data"); interfaces.push_back(push_data_block); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 28e9099d8..186925706 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -361,7 +361,6 @@ public: u32 num_components; bool is_integer{}; bool is_loaded{}; - s32 buffer_handle{-1}; }; Id input_attr_array; Id output_attr_array; diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h index 837caafa0..e77925232 100644 --- a/src/shader_recompiler/frontend/fetch_shader.h +++ b/src/shader_recompiler/frontend/fetch_shader.h @@ -3,7 +3,6 @@ #pragma once -#include #include #include "common/types.h" #include "shader_recompiler/info.h" @@ -29,11 +28,6 @@ struct VertexAttribute { return static_cast(instance_data); } - [[nodiscard]] bool UsesStepRates() const { - const auto step_rate = GetStepRate(); - return step_rate == OverStepRate0 || step_rate == OverStepRate1; - } - [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Shader::Info& info) const noexcept { return info.ReadUdReg(sgpr_base, dword_offset); } @@ -52,12 +46,6 @@ struct FetchShaderData { s8 vertex_offset_sgpr = -1; ///< SGPR of vertex offset from VADDR s8 instance_offset_sgpr = -1; ///< SGPR of instance offset from VADDR - [[nodiscard]] bool UsesStepRates() const { - return std::ranges::find_if(attributes, [](const VertexAttribute& attribute) { - return attribute.UsesStepRates(); - }) != attributes.end(); - } - bool operator==(const FetchShaderData& other) const { return attributes == other.attributes && vertex_offset_sgpr == other.vertex_offset_sgpr && instance_offset_sgpr == other.instance_offset_sgpr; diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 5853f3e72..9c06dc6a5 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -481,11 +481,11 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra } void Translator::EmitFetch(const GcnInst& inst) { - // Read the pointer to the fetch shader assembly. const auto code_sgpr_base = inst.src[0].code; + + // The fetch shader must be inlined to access as regular buffers, so that + // bounds checks can be emitted to emulate robust buffer access. if (!profile.supports_robust_buffer_access) { - // The fetch shader must be inlined to access as regular buffers, so that - // bounds checks can be emitted to emulate robust buffer access. const auto* code = GetFetchShaderCode(info, code_sgpr_base); GcnCodeSlice slice(code, code + std::numeric_limits::max()); GcnDecodeContext decoder; @@ -535,16 +535,6 @@ void Translator::EmitFetch(const GcnInst& inst) { for (u32 i = 0; i < 4; i++) { ir.SetVectorReg(dst_reg++, IR::F32{ir.CompositeExtract(swizzled, i)}); } - - // In case of programmable step rates we need to fallback to instance data pulling in - // shader, so VBs should be bound as regular data buffers - if (attrib.UsesStepRates()) { - info.buffers.push_back({ - .sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4), - .used_types = IR::Type::F32, - .instance_attrib = attrib.semantic, - }); - } } } diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 9703643e8..6e12c6816 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -113,17 +113,13 @@ struct FMaskResource { using FMaskResourceList = boost::container::small_vector; struct PushData { - static constexpr u32 Step0Index = 0; - static constexpr u32 Step1Index = 1; - static constexpr u32 XOffsetIndex = 2; - static constexpr u32 YOffsetIndex = 3; - static constexpr u32 XScaleIndex = 4; - static constexpr u32 YScaleIndex = 5; - static constexpr u32 UdRegsIndex = 6; + static constexpr u32 XOffsetIndex = 0; + static constexpr u32 YOffsetIndex = 1; + static constexpr u32 XScaleIndex = 2; + static constexpr u32 YScaleIndex = 3; + static constexpr u32 UdRegsIndex = 4; static constexpr u32 BufOffsetIndex = UdRegsIndex + NumUserDataRegs / 4; - u32 step0; - u32 step1; float xoffset; float yoffset; float xscale; diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 4997145d7..6ca86b2c0 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -255,8 +255,8 @@ void IREmitter::SetM0(const U32& value) { Inst(Opcode::SetM0, value); } -F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, IR::Value index) { - return Inst(Opcode::GetAttribute, attribute, Imm32(comp), index); +F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, u32 index) { + return Inst(Opcode::GetAttribute, attribute, Imm32(comp), Imm32(index)); } U32 IREmitter::GetAttributeU32(IR::Attribute attribute, u32 comp) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 6055df565..a105b042d 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -81,8 +81,7 @@ public: [[nodiscard]] U1 Condition(IR::Condition cond); - [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, - IR::Value index = IR::Value(u32(0u))); + [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, u32 index = 0); [[nodiscard]] U32 GetAttributeU32(Attribute attribute, u32 comp = 0); void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0); diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index b292b41b9..ca72097e7 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -116,7 +116,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim } const auto shl_inst = inst.Arg(1).TryInstRecursive(); - const auto vertex_id = ir.Imm32(shl_inst->Arg(0).Resolve().U32() >> 2); + const auto vertex_id = shl_inst->Arg(0).Resolve().U32() >> 2; const auto offset = inst.Arg(1).TryInstRecursive()->Arg(1); const auto bucket = offset.Resolve().U32() / 256u; const auto attrib = bucket < 4 ? IR::Attribute::Position0 diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 5a0408e2c..ce0f7bf3a 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -85,6 +85,8 @@ struct VertexRuntimeInfo { std::array outputs; bool emulate_depth_negative_one_to_one{}; bool clip_disable{}; + u32 step_rate_0; + u32 step_rate_1; // Domain AmdGpu::TessellationType tess_type; AmdGpu::TessellationTopology tess_topology; diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index e40309aaf..8e6b0f01b 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -14,6 +14,7 @@ namespace Shader { struct VsAttribSpecialization { s32 num_components{}; + u32 divisor{}; AmdGpu::NumberClass num_class{}; AmdGpu::CompMapping dst_select{}; @@ -74,13 +75,13 @@ struct SamplerSpecialization { * after the first compilation of a module. */ struct StageSpecialization { - static constexpr size_t MaxStageResources = 64; + static constexpr size_t MaxStageResources = 128; const Shader::Info* info; RuntimeInfo runtime_info; + std::bitset bitset{}; std::optional fetch_shader_data{}; boost::container::small_vector vs_attribs; - std::bitset bitset{}; boost::container::small_vector buffers; boost::container::small_vector images; boost::container::small_vector fmasks; @@ -94,10 +95,17 @@ struct StageSpecialization { if (info_.stage == Stage::Vertex && fetch_shader_data) { // Specialize shader on VS input number types to follow spec. ForEachSharp(vs_attribs, fetch_shader_data->attributes, - [&profile_](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { - spec.num_components = desc.UsesStepRates() - ? AmdGpu::NumComponents(sharp.GetDataFmt()) - : 0; + [&profile_, this](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { + spec.num_components = AmdGpu::NumComponents(sharp.GetDataFmt()); + using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType; + if (const auto step_rate = desc.GetStepRate(); + step_rate != InstanceIdType::None) { + spec.divisor = step_rate == InstanceIdType::OverStepRate0 + ? runtime_info.vs_info.step_rate_0 + : (step_rate == InstanceIdType::OverStepRate1 + ? runtime_info.vs_info.step_rate_1 + : 1); + } spec.num_class = profile_.support_legacy_vertex_attributes ? AmdGpu::NumberClass{} : AmdGpu::GetNumberClass(sharp.GetNumberFmt()); diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 28444ac60..42e3c61a5 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -198,10 +198,13 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si } void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) { + const auto& regs = liverpool->regs; Vulkan::VertexInputs attributes; Vulkan::VertexInputs bindings; + Vulkan::VertexInputs divisors; Vulkan::VertexInputs guest_buffers; - pipeline.GetVertexInputs(attributes, bindings, guest_buffers); + pipeline.GetVertexInputs(attributes, bindings, divisors, guest_buffers, + regs.vgt_instance_step_rate_0, regs.vgt_instance_step_rate_1); if (instance.IsVertexInputDynamicState()) { // Update current vertex inputs. diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 7c020a012..e971a00fc 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -72,12 +72,21 @@ GraphicsPipeline::GraphicsPipeline( VertexInputs vertex_attributes; VertexInputs vertex_bindings; + VertexInputs divisors; VertexInputs guest_buffers; if (!instance.IsVertexInputDynamicState()) { - GetVertexInputs(vertex_attributes, vertex_bindings, guest_buffers); + const auto& vs_info = runtime_infos[u32(Shader::LogicalStage::Vertex)].vs_info; + GetVertexInputs(vertex_attributes, vertex_bindings, divisors, guest_buffers, + vs_info.step_rate_0, vs_info.step_rate_1); } + const vk::PipelineVertexInputDivisorStateCreateInfo divisor_state = { + .vertexBindingDivisorCount = static_cast(divisors.size()), + .pVertexBindingDivisors = divisors.data(), + }; + const vk::PipelineVertexInputStateCreateInfo vertex_input_info = { + .pNext = &divisor_state, .vertexBindingDescriptionCount = static_cast(vertex_bindings.size()), .pVertexBindingDescriptions = vertex_bindings.data(), .vertexAttributeDescriptionCount = static_cast(vertex_attributes.size()), @@ -304,19 +313,16 @@ GraphicsPipeline::GraphicsPipeline( GraphicsPipeline::~GraphicsPipeline() = default; template -void GraphicsPipeline::GetVertexInputs(VertexInputs& attributes, - VertexInputs& bindings, - VertexInputs& guest_buffers) const { +void GraphicsPipeline::GetVertexInputs( + VertexInputs& attributes, VertexInputs& bindings, + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, u32 step_rate_1) const { + using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType; if (!fetch_shader || fetch_shader->attributes.empty()) { return; } const auto& vs_info = GetStage(Shader::LogicalStage::Vertex); for (const auto& attrib : fetch_shader->attributes) { - if (attrib.UsesStepRates()) { - // Skip attribute binding as the data will be pulled by shader. - continue; - } - const auto& buffer = attrib.GetSharp(vs_info); attributes.push_back(Attribute{ .location = attrib.semantic, @@ -327,12 +333,21 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs& attributes, bindings.push_back(Binding{ .binding = attrib.semantic, .stride = buffer.GetStride(), - .inputRate = attrib.GetStepRate() == Shader::Gcn::VertexAttribute::InstanceIdType::None + .inputRate = attrib.GetStepRate() == InstanceIdType::None ? vk::VertexInputRate::eVertex : vk::VertexInputRate::eInstance, }); + const u32 divisor = + attrib.GetStepRate() == InstanceIdType::OverStepRate0 + ? step_rate_0 + : (attrib.GetStepRate() == InstanceIdType::OverStepRate1 ? step_rate_1 : 1); if constexpr (std::is_same_v) { - bindings.back().divisor = 1; + bindings.back().divisor = divisor; + } else { + divisors.push_back(vk::VertexInputBindingDivisorDescriptionEXT{ + .binding = attrib.semantic, + .divisor = divisor, + }); } guest_buffers.emplace_back(buffer); } @@ -342,11 +357,13 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs& attributes, template void GraphicsPipeline::GetVertexInputs( VertexInputs& attributes, VertexInputs& bindings, - VertexInputs& guest_buffers) const; + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, u32 step_rate_1) const; template void GraphicsPipeline::GetVertexInputs( VertexInputs& attributes, VertexInputs& bindings, - VertexInputs& guest_buffers) const; + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, u32 step_rate_1) const; void GraphicsPipeline::BuildDescSetLayout() { boost::container::small_vector bindings; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 59230ae46..ab67a52b4 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -81,7 +81,9 @@ public: /// Gets the attributes and bindings for vertex inputs. template void GetVertexInputs(VertexInputs& attributes, VertexInputs& bindings, - VertexInputs& guest_buffers) const; + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, + u32 step_rate_1) const; private: void BuildDescSetLayout(); diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 237fa202d..3adf0f2ec 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -277,6 +277,7 @@ bool Instance::CreateDevice() { image_load_store_lod = add_extension(VK_AMD_SHADER_IMAGE_LOAD_STORE_LOD_EXTENSION_NAME); amd_gcn_shader = add_extension(VK_AMD_GCN_SHADER_EXTENSION_NAME); amd_shader_trinary_minmax = add_extension(VK_AMD_SHADER_TRINARY_MINMAX_EXTENSION_NAME); + vertex_attribute_divisor = add_extension(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME); shader_atomic_float2 = add_extension(VK_EXT_SHADER_ATOMIC_FLOAT_2_EXTENSION_NAME); if (shader_atomic_float2) { shader_atomic_float2_features = @@ -436,6 +437,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT{ .legacyVertexAttributes = true, }, + vk::PhysicalDeviceVertexAttributeDivisorFeatures{ + .vertexAttributeInstanceRateDivisor = true, + }, vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{ .shaderBufferFloat32AtomicMinMax = shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax, @@ -498,6 +502,9 @@ bool Instance::CreateDevice() { if (!legacy_vertex_attributes) { device_chain.unlink(); } + if (!vertex_attribute_divisor) { + device_chain.unlink(); + } if (!shader_atomic_float2) { device_chain.unlink(); } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 830b1d5c2..edaba4fbb 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -150,6 +150,11 @@ public: return legacy_vertex_attributes; } + /// Returns true when VK_EXT_vertex_attribute_divisor is supported. + bool IsVertexAttributeDivisorSupported() const { + return vertex_attribute_divisor; + } + /// Returns true when VK_AMD_shader_image_load_store_lod is supported. bool IsImageLoadStoreLodSupported() const { return image_load_store_lod; @@ -398,6 +403,7 @@ private: u32 queue_family_index{0}; bool custom_border_color{}; bool fragment_shader_barycentric{}; + bool vertex_attribute_divisor{}; bool depth_clip_control{}; bool depth_range_unrestricted{}; bool dynamic_state_3{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 7dd468f9a..1e845de73 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -122,6 +122,8 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS case Stage::Vertex: { BuildCommon(regs.vs_program); GatherVertexOutputs(info.vs_info, regs.vs_output_control); + info.vs_info.step_rate_0 = regs.vgt_instance_step_rate_0; + info.vs_info.step_rate_1 = regs.vgt_instance_step_rate_1; info.vs_info.emulate_depth_negative_one_to_one = !instance.IsDepthClipControlSupported() && regs.clipper_control.clip_space == Liverpool::ClipSpace::MinusWToW; @@ -460,10 +462,6 @@ bool PipelineCache::RefreshGraphicsKey() { // Stride will still be handled outside the pipeline using dynamic state. u32 vertex_binding = 0; for (const auto& attrib : fetch_shader->attributes) { - if (attrib.UsesStepRates()) { - // Skip attribute binding as the data will be pulled by shader. - continue; - } const auto& buffer = attrib.GetSharp(*vs_info); ASSERT(vertex_binding < MaxVertexBufferCount); key.vertex_buffer_formats[vertex_binding++] = diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 5d0a14ce3..2a645f338 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -20,12 +20,9 @@ namespace Vulkan { static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) { - Shader::PushData push_data{}; - push_data.step0 = regs.vgt_instance_step_rate_0; - push_data.step1 = regs.vgt_instance_step_rate_1; - // TODO(roamic): Add support for multiple viewports and geometry shaders when ViewportIndex // is encountered and implemented in the recompiler. + Shader::PushData push_data{}; push_data.xoffset = regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f; push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f; push_data.yoffset = regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f;