From c0878db548dd7a687f0882de04f44228f33ea2c1 Mon Sep 17 00:00:00 2001
From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com>
Date: Sun, 13 Jul 2025 13:16:27 +0300
Subject: [PATCH] shader_recompiler: Replace buffer pulling with attribute
 divisor for instance step rates

---
 .../spirv/emit_spirv_context_get_set.cpp      | 102 +++++++-----------
 .../backend/spirv/emit_spirv_instructions.h   |   2 +-
 .../backend/spirv/spirv_emit_context.cpp      |  62 ++++-------
 .../backend/spirv/spirv_emit_context.h        |   1 -
 src/shader_recompiler/frontend/fetch_shader.h |  12 ---
 .../frontend/translate/translate.cpp          |  16 +--
 src/shader_recompiler/info.h                  |  14 +--
 src/shader_recompiler/ir/ir_emitter.cpp       |   4 +-
 src/shader_recompiler/ir/ir_emitter.h         |   3 +-
 .../ir/passes/ring_access_elimination.cpp     |   2 +-
 src/shader_recompiler/runtime_info.h          |   2 +
 src/shader_recompiler/specialization.h        |  20 ++--
 src/video_core/buffer_cache/buffer_cache.cpp  |   5 +-
 .../renderer_vulkan/vk_graphics_pipeline.cpp  |  43 +++++---
 .../renderer_vulkan/vk_graphics_pipeline.h    |   4 +-
 .../renderer_vulkan/vk_instance.cpp           |   7 ++
 src/video_core/renderer_vulkan/vk_instance.h  |   6 ++
 .../renderer_vulkan/vk_pipeline_cache.cpp     |   6 +-
 .../renderer_vulkan/vk_rasterizer.cpp         |   5 +-
 19 files changed, 140 insertions(+), 176 deletions(-)
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index f3a8c518c..c939f3524 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -120,6 +120,9 @@ std::pair<Id, bool> OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr
 }
 } // Anonymous namespace
 
+using PointerType = EmitContext::PointerType;
+using PointerSize = EmitContext::PointerSize;
+
 Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) {
     const u32 index = ctx.binding.user_data + ctx.info.ud_mask.Index(reg);
     const u32 half = PushData::UdRegsIndex + (index >> 2);
@@ -131,41 +134,6 @@ Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) {
     return ud_reg;
 }
 
-void EmitGetThreadBitScalarReg(EmitContext& ctx) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-void EmitSetThreadBitScalarReg(EmitContext& ctx) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-void EmitGetScalarRegister(EmitContext&) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-void EmitSetScalarRegister(EmitContext&) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-void EmitGetVectorRegister(EmitContext& ctx) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-void EmitSetVectorRegister(EmitContext& ctx) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-void EmitSetGotoVariable(EmitContext&) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-void EmitGetGotoVariable(EmitContext&) {
-    UNREACHABLE_MSG("Unreachable instruction");
-}
-
-using PointerType = EmitContext::PointerType;
-using PointerSize = EmitContext::PointerSize;
-
 Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
     const u32 flatbuf_off_dw = inst->Flags<u32>();
     if (!Config::directMemoryAccess()) {
@@ -201,18 +169,12 @@ Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
     return ReadConstBuffer<PointerType::U32>(ctx, handle, index);
 }
 
-Id EmitReadStepRate(EmitContext& ctx, int rate_idx) {
-    const auto index{rate_idx == 0 ? PushData::Step0Index : PushData::Step1Index};
-    return ctx.OpLoad(
-        ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]),
-                                      ctx.push_data_block, ctx.ConstU32(index)));
-}
-
-static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
+static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) {
     if (IR::IsPosition(attr)) {
         ASSERT(attr == IR::Attribute::Position0);
         const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]);
-        const auto pointer{ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, index, ctx.ConstU32(0u))};
+        const auto pointer{
+            ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, ctx.ConstU32(index), ctx.ConstU32(0u))};
         const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]);
         return ctx.OpLoad(ctx.F32[1],
                           ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp)));
@@ -222,7 +184,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32
         const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)};
         const auto param = ctx.input_params.at(param_id).id;
         const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]);
-        const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)};
+        const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, ctx.ConstU32(index))};
         const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]);
         return ctx.OpLoad(ctx.F32[1],
                           ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp)));
@@ -230,7 +192,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32
     UNREACHABLE();
 }
 
-Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
+Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) {
     if (ctx.info.l_stage == LogicalStage::Geometry) {
         return EmitGetAttributeForGeometry(ctx, attr, comp, index);
     } else if (ctx.info.l_stage == LogicalStage::TessellationControl ||
@@ -248,18 +210,6 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
     if (IR::IsParam(attr)) {
         const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)};
         const auto& param{ctx.input_params.at(param_index)};
-        if (param.buffer_handle >= 0) {
-            const auto step_rate = EmitReadStepRate(ctx, param.id.value);
-            const auto offset = ctx.OpIAdd(
-                ctx.U32[1],
-                ctx.OpIMul(
-                    ctx.U32[1],
-                    ctx.OpUDiv(ctx.U32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id), step_rate),
-                    ctx.ConstU32(param.num_components)),
-                ctx.ConstU32(comp));
-            return ReadConstBuffer<PointerType::F32>(ctx, param.buffer_handle, offset);
-        }
-
         Id result;
         if (param.is_loaded) {
             // Attribute is either default or manually interpolated. The id points to an already
@@ -305,10 +255,6 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
         return ctx.OpLoad(ctx.U32[1], ctx.vertex_index);
     case IR::Attribute::InstanceId:
         return ctx.OpLoad(ctx.U32[1], ctx.instance_id);
-    case IR::Attribute::InstanceId0:
-        return EmitReadStepRate(ctx, 0);
-    case IR::Attribute::InstanceId1:
-        return EmitReadStepRate(ctx, 1);
     case IR::Attribute::WorkgroupIndex:
         return ctx.workgroup_index_id;
     case IR::Attribute::WorkgroupId:
@@ -640,4 +586,36 @@ void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
     UNREACHABLE_MSG("SPIR-V instruction");
 }
 
+void EmitGetThreadBitScalarReg(EmitContext& ctx) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
+void EmitSetThreadBitScalarReg(EmitContext& ctx) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
+void EmitGetScalarRegister(EmitContext&) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
+void EmitSetScalarRegister(EmitContext&) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
+void EmitGetVectorRegister(EmitContext& ctx) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
+void EmitSetVectorRegister(EmitContext& ctx) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
+void EmitSetGotoVariable(EmitContext&) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
+void EmitGetGotoVariable(EmitContext&) {
+    UNREACHABLE_MSG("Unreachable instruction");
+}
+
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 74c94754d..37d5d84c9 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -108,7 +108,7 @@ Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addres
 Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
                              Id cmp_value);
-Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index);
+Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index);
 Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp);
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp);
 Id EmitGetTessGenericAttribute(EmitContext& ctx, Id vertex_index, Id attr_index, Id comp_index);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 6a731d05c..852920ade 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -377,35 +377,13 @@ void EmitContext::DefineInputs() {
             ASSERT(attrib.semantic < IR::NumParams);
             const auto sharp = attrib.GetSharp(info);
             const Id type{GetAttributeType(*this, sharp.GetNumberFmt())[4]};
-            if (attrib.UsesStepRates()) {
-                const u32 rate_idx =
-                    attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::OverStepRate0 ? 0
-                                                                                                : 1;
-                const u32 num_components = AmdGpu::NumComponents(sharp.GetDataFmt());
-                const auto buffer =
-                    std::ranges::find_if(info.buffers, [&attrib](const auto& buffer) {
-                        return buffer.instance_attrib == attrib.semantic;
-                    });
-                // Note that we pass index rather than Id
-                input_params[attrib.semantic] = SpirvAttribute{
-                    .id = {rate_idx},
-                    .pointer_type = input_u32,
-                    .component_type = U32[1],
-                    .num_components = std::min<u16>(attrib.num_elements, num_components),
-                    .is_integer = true,
-                    .is_loaded = false,
-                    .buffer_handle = int(buffer - info.buffers.begin()),
-                };
+            Id id{DefineInput(type, attrib.semantic)};
+            if (attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::Plain) {
+                Name(id, fmt::format("vs_instance_attr{}", attrib.semantic));
             } else {
-                Id id{DefineInput(type, attrib.semantic)};
-                if (attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::Plain) {
-                    Name(id, fmt::format("vs_instance_attr{}", attrib.semantic));
-                } else {
-                    Name(id, fmt::format("vs_in_attr{}", attrib.semantic));
-                }
-                input_params[attrib.semantic] =
-                    GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false);
+                Name(id, fmt::format("vs_in_attr{}", attrib.semantic));
             }
+            input_params[attrib.semantic] = GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false);
         }
         break;
     }
@@ -700,12 +678,10 @@ void EmitContext::DefineOutputs() {
 
 void EmitContext::DefinePushDataBlock() {
-    // Create push constants block for instance steps rates
+    // Create the AuxData push constants block
-    const Id struct_type{Name(TypeStruct(U32[1], U32[1], F32[1], F32[1], F32[1], F32[1], U32[4],
-                                         U32[4], U32[4], U32[4], U32[4], U32[4], U32[2]),
+    const Id struct_type{Name(TypeStruct(F32[1], F32[1], F32[1], F32[1], U32[4], U32[4], U32[4],
+                                         U32[4], U32[4], U32[4], U32[2]),
                               "AuxData")};
     Decorate(struct_type, spv::Decoration::Block);
-    MemberName(struct_type, PushData::Step0Index, "sr0");
-    MemberName(struct_type, PushData::Step1Index, "sr1");
     MemberName(struct_type, PushData::XOffsetIndex, "xoffset");
     MemberName(struct_type, PushData::YOffsetIndex, "yoffset");
     MemberName(struct_type, PushData::XScaleIndex, "xscale");
@@ -717,19 +693,17 @@ void EmitContext::DefinePushDataBlock() {
     MemberName(struct_type, PushData::BufOffsetIndex + 0, "buf_offsets0");
     MemberName(struct_type, PushData::BufOffsetIndex + 1, "buf_offsets1");
     MemberName(struct_type, PushData::BufOffsetIndex + 2, "buf_offsets2");
-    MemberDecorate(struct_type, PushData::Step0Index, spv::Decoration::Offset, 0U);
-    MemberDecorate(struct_type, PushData::Step1Index, spv::Decoration::Offset, 4U);
-    MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 8U);
-    MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 12U);
-    MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 16U);
-    MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 20U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 24U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 40U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 56U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 72U);
-    MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 88U);
-    MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 104U);
-    MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 120U);
+    MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 0U);
+    MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 4U);
+    MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 8U);
+    MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 12U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 16U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 32U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 48U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 64U);
+    MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 80U);
+    MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 96U);
+    MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 112U);
     push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant);
     Name(push_data_block, "push_data");
     interfaces.push_back(push_data_block);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 28e9099d8..186925706 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -361,7 +361,6 @@ public:
         u32 num_components;
         bool is_integer{};
         bool is_loaded{};
-        s32 buffer_handle{-1};
     };
     Id input_attr_array;
     Id output_attr_array;
diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h
index 837caafa0..e77925232 100644
--- a/src/shader_recompiler/frontend/fetch_shader.h
+++ b/src/shader_recompiler/frontend/fetch_shader.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include <ranges>
 #include <vector>
 #include "common/types.h"
 #include "shader_recompiler/info.h"
@@ -29,11 +28,6 @@ struct VertexAttribute {
         return static_cast<InstanceIdType>(instance_data);
     }
 
-    [[nodiscard]] bool UsesStepRates() const {
-        const auto step_rate = GetStepRate();
-        return step_rate == OverStepRate0 || step_rate == OverStepRate1;
-    }
-
     [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Shader::Info& info) const noexcept {
         return info.ReadUdReg<AmdGpu::Buffer>(sgpr_base, dword_offset);
     }
@@ -52,12 +46,6 @@ struct FetchShaderData {
     s8 vertex_offset_sgpr = -1;   ///< SGPR of vertex offset from VADDR
     s8 instance_offset_sgpr = -1; ///< SGPR of instance offset from VADDR
 
-    [[nodiscard]] bool UsesStepRates() const {
-        return std::ranges::find_if(attributes, [](const VertexAttribute& attribute) {
-                   return attribute.UsesStepRates();
-               }) != attributes.end();
-    }
-
     bool operator==(const FetchShaderData& other) const {
         return attributes == other.attributes && vertex_offset_sgpr == other.vertex_offset_sgpr &&
                instance_offset_sgpr == other.instance_offset_sgpr;
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index 5853f3e72..9c06dc6a5 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -481,11 +481,11 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra
 }
 
 void Translator::EmitFetch(const GcnInst& inst) {
-    // Read the pointer to the fetch shader assembly.
     const auto code_sgpr_base = inst.src[0].code;
+
+    // The fetch shader must be inlined to access as regular buffers, so that
+    // bounds checks can be emitted to emulate robust buffer access.
     if (!profile.supports_robust_buffer_access) {
-        // The fetch shader must be inlined to access as regular buffers, so that
-        // bounds checks can be emitted to emulate robust buffer access.
         const auto* code = GetFetchShaderCode(info, code_sgpr_base);
         GcnCodeSlice slice(code, code + std::numeric_limits<u32>::max());
         GcnDecodeContext decoder;
@@ -535,16 +535,6 @@ void Translator::EmitFetch(const GcnInst& inst) {
         for (u32 i = 0; i < 4; i++) {
             ir.SetVectorReg(dst_reg++, IR::F32{ir.CompositeExtract(swizzled, i)});
         }
-
-        // In case of programmable step rates we need to fallback to instance data pulling in
-        // shader, so VBs should be bound as regular data buffers
-        if (attrib.UsesStepRates()) {
-            info.buffers.push_back({
-                .sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4),
-                .used_types = IR::Type::F32,
-                .instance_attrib = attrib.semantic,
-            });
-        }
     }
 }
 
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index 9703643e8..6e12c6816 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -113,17 +113,13 @@ struct FMaskResource {
 using FMaskResourceList = boost::container::small_vector<FMaskResource, NumFMasks>;
 
 struct PushData {
-    static constexpr u32 Step0Index = 0;
-    static constexpr u32 Step1Index = 1;
-    static constexpr u32 XOffsetIndex = 2;
-    static constexpr u32 YOffsetIndex = 3;
-    static constexpr u32 XScaleIndex = 4;
-    static constexpr u32 YScaleIndex = 5;
-    static constexpr u32 UdRegsIndex = 6;
+    static constexpr u32 XOffsetIndex = 0;
+    static constexpr u32 YOffsetIndex = 1;
+    static constexpr u32 XScaleIndex = 2;
+    static constexpr u32 YScaleIndex = 3;
+    static constexpr u32 UdRegsIndex = 4;
     static constexpr u32 BufOffsetIndex = UdRegsIndex + NumUserDataRegs / 4;
 
-    u32 step0;
-    u32 step1;
     float xoffset;
     float yoffset;
     float xscale;
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 4997145d7..6ca86b2c0 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -255,8 +255,8 @@ void IREmitter::SetM0(const U32& value) {
     Inst(Opcode::SetM0, value);
 }
 
-F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, IR::Value index) {
-    return Inst<F32>(Opcode::GetAttribute, attribute, Imm32(comp), index);
+F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, u32 index) {
+    return Inst<F32>(Opcode::GetAttribute, attribute, Imm32(comp), Imm32(index));
 }
 
 U32 IREmitter::GetAttributeU32(IR::Attribute attribute, u32 comp) {
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index 6055df565..a105b042d 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -81,8 +81,7 @@ public:
 
     [[nodiscard]] U1 Condition(IR::Condition cond);
 
-    [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0,
-                                   IR::Value index = IR::Value(u32(0u)));
+    [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, u32 index = 0);
     [[nodiscard]] U32 GetAttributeU32(Attribute attribute, u32 comp = 0);
     void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0);
 
diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
index b292b41b9..ca72097e7 100644
--- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
+++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp
@@ -116,7 +116,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim
                 }
 
                 const auto shl_inst = inst.Arg(1).TryInstRecursive();
-                const auto vertex_id = ir.Imm32(shl_inst->Arg(0).Resolve().U32() >> 2);
+                const auto vertex_id = shl_inst->Arg(0).Resolve().U32() >> 2;
                 const auto offset = inst.Arg(1).TryInstRecursive()->Arg(1);
                 const auto bucket = offset.Resolve().U32() / 256u;
                 const auto attrib = bucket < 4 ? IR::Attribute::Position0
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index 5a0408e2c..ce0f7bf3a 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -85,6 +85,8 @@ struct VertexRuntimeInfo {
     std::array<VsOutputMap, 3> outputs;
     bool emulate_depth_negative_one_to_one{};
     bool clip_disable{};
+    u32 step_rate_0; ///< Instance divisor for attributes using InstanceIdType::OverStepRate0
+    u32 step_rate_1; ///< Instance divisor for attributes using InstanceIdType::OverStepRate1
     // Domain
     AmdGpu::TessellationType tess_type;
     AmdGpu::TessellationTopology tess_topology;
diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h
index e40309aaf..8e6b0f01b 100644
--- a/src/shader_recompiler/specialization.h
+++ b/src/shader_recompiler/specialization.h
@@ -14,6 +14,7 @@ namespace Shader {
 
 struct VsAttribSpecialization {
     s32 num_components{};
+    u32 divisor{}; ///< Instance-rate divisor; only set when the step rate is not None
     AmdGpu::NumberClass num_class{};
     AmdGpu::CompMapping dst_select{};
 
@@ -74,13 +75,13 @@ struct SamplerSpecialization {
  * after the first compilation of a module.
  */
 struct StageSpecialization {
-    static constexpr size_t MaxStageResources = 64;
+    static constexpr size_t MaxStageResources = 128;
 
     const Shader::Info* info;
     RuntimeInfo runtime_info;
+    std::bitset<MaxStageResources> bitset{};
     std::optional<Gcn::FetchShaderData> fetch_shader_data{};
     boost::container::small_vector<VsAttribSpecialization, 32> vs_attribs;
-    std::bitset<MaxStageResources> bitset{};
     boost::container::small_vector<BufferSpecialization, 16> buffers;
     boost::container::small_vector<ImageSpecialization, 16> images;
     boost::container::small_vector<FMaskSpecialization, 8> fmasks;
@@ -94,10 +95,17 @@ struct StageSpecialization {
         if (info_.stage == Stage::Vertex && fetch_shader_data) {
             // Specialize shader on VS input number types to follow spec.
             ForEachSharp(vs_attribs, fetch_shader_data->attributes,
-                         [&profile_](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
-                             spec.num_components = desc.UsesStepRates()
-                                                       ? AmdGpu::NumComponents(sharp.GetDataFmt())
-                                                       : 0;
+                         [&profile_, this](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
+                             spec.num_components = AmdGpu::NumComponents(sharp.GetDataFmt());
+                             using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType;
+                             if (const auto step_rate = desc.GetStepRate();
+                                 step_rate != InstanceIdType::None) {
+                                 spec.divisor = step_rate == InstanceIdType::OverStepRate0
+                                                    ? runtime_info.vs_info.step_rate_0
+                                                    : (step_rate == InstanceIdType::OverStepRate1
+                                                           ? runtime_info.vs_info.step_rate_1
+                                                           : 1);
+                             }
                              spec.num_class = profile_.support_legacy_vertex_attributes
                                                   ? AmdGpu::NumberClass{}
                                                   : AmdGpu::GetNumberClass(sharp.GetNumberFmt());
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 28444ac60..42e3c61a5 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -198,10 +198,13 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
 }
 
 void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
+    const auto& regs = liverpool->regs;
     Vulkan::VertexInputs<vk::VertexInputAttributeDescription2EXT> attributes;
     Vulkan::VertexInputs<vk::VertexInputBindingDescription2EXT> bindings;
+    Vulkan::VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT> divisors;
     Vulkan::VertexInputs<AmdGpu::Buffer> guest_buffers;
-    pipeline.GetVertexInputs(attributes, bindings, guest_buffers);
+    pipeline.GetVertexInputs(attributes, bindings, divisors, guest_buffers,
+                             regs.vgt_instance_step_rate_0, regs.vgt_instance_step_rate_1);
 
     if (instance.IsVertexInputDynamicState()) {
         // Update current vertex inputs.
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 7c020a012..e971a00fc 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -72,12 +72,21 @@ GraphicsPipeline::GraphicsPipeline(
 
     VertexInputs<vk::VertexInputAttributeDescription> vertex_attributes;
     VertexInputs<vk::VertexInputBindingDescription> vertex_bindings;
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT> divisors;
     VertexInputs<AmdGpu::Buffer> guest_buffers;
     if (!instance.IsVertexInputDynamicState()) {
-        GetVertexInputs(vertex_attributes, vertex_bindings, guest_buffers);
+        const auto& vs_info = runtime_infos[u32(Shader::LogicalStage::Vertex)].vs_info;
+        GetVertexInputs(vertex_attributes, vertex_bindings, divisors, guest_buffers,
+                        vs_info.step_rate_0, vs_info.step_rate_1);
     }
 
+    const vk::PipelineVertexInputDivisorStateCreateInfo divisor_state = {
+        .vertexBindingDivisorCount = static_cast<u32>(divisors.size()),
+        .pVertexBindingDivisors = divisors.data(),
+    };
+
     const vk::PipelineVertexInputStateCreateInfo vertex_input_info = {
+        .pNext = divisors.empty() ? nullptr : &divisor_state, // count must be > 0 when chained
         .vertexBindingDescriptionCount = static_cast<u32>(vertex_bindings.size()),
         .pVertexBindingDescriptions = vertex_bindings.data(),
         .vertexAttributeDescriptionCount = static_cast<u32>(vertex_attributes.size()),
@@ -304,19 +313,16 @@ GraphicsPipeline::GraphicsPipeline(
 GraphicsPipeline::~GraphicsPipeline() = default;
 
 template <typename Attribute, typename Binding>
-void GraphicsPipeline::GetVertexInputs(VertexInputs<Attribute>& attributes,
-                                       VertexInputs<Binding>& bindings,
-                                       VertexInputs<AmdGpu::Buffer>& guest_buffers) const {
+void GraphicsPipeline::GetVertexInputs(
+    VertexInputs<Attribute>& attributes, VertexInputs<Binding>& bindings,
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+    VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0, u32 step_rate_1) const {
+    using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType;
     if (!fetch_shader || fetch_shader->attributes.empty()) {
         return;
     }
     const auto& vs_info = GetStage(Shader::LogicalStage::Vertex);
     for (const auto& attrib : fetch_shader->attributes) {
-        if (attrib.UsesStepRates()) {
-            // Skip attribute binding as the data will be pulled by shader.
-            continue;
-        }
-
         const auto& buffer = attrib.GetSharp(vs_info);
         attributes.push_back(Attribute{
             .location = attrib.semantic,
@@ -327,12 +333,21 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs<Attribute>& attributes,
         bindings.push_back(Binding{
             .binding = attrib.semantic,
             .stride = buffer.GetStride(),
-            .inputRate = attrib.GetStepRate() == Shader::Gcn::VertexAttribute::InstanceIdType::None
+            .inputRate = attrib.GetStepRate() == InstanceIdType::None
                              ? vk::VertexInputRate::eVertex
                              : vk::VertexInputRate::eInstance,
         });
+        const u32 divisor =
+            attrib.GetStepRate() == InstanceIdType::OverStepRate0
+                ? step_rate_0
+                : (attrib.GetStepRate() == InstanceIdType::OverStepRate1 ? step_rate_1 : 1);
         if constexpr (std::is_same_v<Binding, vk::VertexInputBindingDescription2EXT>) {
-            bindings.back().divisor = 1;
+            bindings.back().divisor = divisor;
+        } else if (attrib.GetStepRate() != InstanceIdType::None) { // instance-rate only
+            divisors.push_back(vk::VertexInputBindingDivisorDescriptionEXT{
+                .binding = attrib.semantic,
+                .divisor = divisor,
+            });
         }
         guest_buffers.emplace_back(buffer);
     }
@@ -342,11 +357,13 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs<Attribute>& attributes,
 template void GraphicsPipeline::GetVertexInputs(
     VertexInputs<vk::VertexInputAttributeDescription>& attributes,
     VertexInputs<vk::VertexInputBindingDescription>& bindings,
-    VertexInputs<AmdGpu::Buffer>& guest_buffers) const;
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+    VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0, u32 step_rate_1) const;
 template void GraphicsPipeline::GetVertexInputs(
     VertexInputs<vk::VertexInputAttributeDescription2EXT>& attributes,
     VertexInputs<vk::VertexInputBindingDescription2EXT>& bindings,
-    VertexInputs<AmdGpu::Buffer>& guest_buffers) const;
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+    VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0, u32 step_rate_1) const;
 
 void GraphicsPipeline::BuildDescSetLayout() {
     boost::container::small_vector<vk::DescriptorSetLayoutBinding, 32> bindings;
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
index 59230ae46..ab67a52b4 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@@ -81,7 +81,9 @@ public:
     /// Gets the attributes and bindings for vertex inputs.
     template <typename Attribute, typename Binding>
     void GetVertexInputs(VertexInputs<Attribute>& attributes, VertexInputs<Binding>& bindings,
-                         VertexInputs<AmdGpu::Buffer>& guest_buffers) const;
+                         VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+                         VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0,
+                         u32 step_rate_1) const;
 
 private:
     void BuildDescSetLayout();
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index 237fa202d..3adf0f2ec 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -277,6 +277,7 @@ bool Instance::CreateDevice() {
     image_load_store_lod = add_extension(VK_AMD_SHADER_IMAGE_LOAD_STORE_LOD_EXTENSION_NAME);
     amd_gcn_shader = add_extension(VK_AMD_GCN_SHADER_EXTENSION_NAME);
     amd_shader_trinary_minmax = add_extension(VK_AMD_SHADER_TRINARY_MINMAX_EXTENSION_NAME);
+    vertex_attribute_divisor = add_extension(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME);
     shader_atomic_float2 = add_extension(VK_EXT_SHADER_ATOMIC_FLOAT_2_EXTENSION_NAME);
     if (shader_atomic_float2) {
         shader_atomic_float2_features =
@@ -436,6 +437,9 @@ bool Instance::CreateDevice() {
         vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT{
             .legacyVertexAttributes = true,
         },
+        vk::PhysicalDeviceVertexAttributeDivisorFeatures{
+            .vertexAttributeInstanceRateDivisor = true,
+        },
         vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{
             .shaderBufferFloat32AtomicMinMax =
                 shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax,
@@ -498,6 +502,9 @@ bool Instance::CreateDevice() {
     if (!legacy_vertex_attributes) {
         device_chain.unlink<vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT>();
     }
+    if (!vertex_attribute_divisor) {
+        device_chain.unlink<vk::PhysicalDeviceVertexAttributeDivisorFeatures>();
+    }
     if (!shader_atomic_float2) {
         device_chain.unlink<vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>();
     }
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index 830b1d5c2..edaba4fbb 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -150,6 +150,11 @@ public:
         return legacy_vertex_attributes;
     }
 
+    /// Returns true when VK_EXT_vertex_attribute_divisor is supported.
+    bool IsVertexAttributeDivisorSupported() const {
+        return vertex_attribute_divisor;
+    }
+
     /// Returns true when VK_AMD_shader_image_load_store_lod is supported.
     bool IsImageLoadStoreLodSupported() const {
         return image_load_store_lod;
@@ -398,6 +403,7 @@ private:
     u32 queue_family_index{0};
     bool custom_border_color{};
     bool fragment_shader_barycentric{};
+    bool vertex_attribute_divisor{};
     bool depth_clip_control{};
     bool depth_range_unrestricted{};
     bool dynamic_state_3{};
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 7dd468f9a..1e845de73 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -122,6 +122,8 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
     case Stage::Vertex: {
         BuildCommon(regs.vs_program);
         GatherVertexOutputs(info.vs_info, regs.vs_output_control);
+        info.vs_info.step_rate_0 = regs.vgt_instance_step_rate_0;
+        info.vs_info.step_rate_1 = regs.vgt_instance_step_rate_1;
         info.vs_info.emulate_depth_negative_one_to_one =
             !instance.IsDepthClipControlSupported() &&
             regs.clipper_control.clip_space == Liverpool::ClipSpace::MinusWToW;
@@ -460,10 +462,6 @@ bool PipelineCache::RefreshGraphicsKey() {
         // Stride will still be handled outside the pipeline using dynamic state.
         u32 vertex_binding = 0;
         for (const auto& attrib : fetch_shader->attributes) {
-            if (attrib.UsesStepRates()) {
-                // Skip attribute binding as the data will be pulled by shader.
-                continue;
-            }
             const auto& buffer = attrib.GetSharp(*vs_info);
             ASSERT(vertex_binding < MaxVertexBufferCount);
             key.vertex_buffer_formats[vertex_binding++] =
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 5d0a14ce3..2a645f338 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -20,12 +20,9 @@
 namespace Vulkan {
 
 static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
-    Shader::PushData push_data{};
-    push_data.step0 = regs.vgt_instance_step_rate_0;
-    push_data.step1 = regs.vgt_instance_step_rate_1;
-
     // TODO(roamic): Add support for multiple viewports and geometry shaders when ViewportIndex
     // is encountered and implemented in the recompiler.
+    Shader::PushData push_data{};
     push_data.xoffset = regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f;
     push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f;
     push_data.yoffset = regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f;