shader_recompiler: Specialize on vertex attribute number types.

2025-08-04 16:32:39 +00:00 · 2024-12-03 08:12:27 -08:00 · 2024-12-03 08:12:27 -08:00 · 028df5dfef
commit 028df5dfef
parent 063dc4afe3
15 changed files with 110 additions and 66 deletions
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@ -4,6 +4,7 @@
 #include "common/assert.h"
 #include "common/div_ceil.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"
+#include "shader_recompiler/frontend/fetch_shader.h"
 #include "shader_recompiler/ir/passes/srt.h"
 #include "video_core/amdgpu/types.h"

@ -155,18 +156,12 @@ void EmitContext::DefineInterfaces() {
 }

 const VectorIds& GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) {
-    switch (fmt) {
-    case AmdGpu::NumberFormat::Float:
-    case AmdGpu::NumberFormat::Unorm:
-    case AmdGpu::NumberFormat::Snorm:
-    case AmdGpu::NumberFormat::SnormNz:
-    case AmdGpu::NumberFormat::Sscaled:
-    case AmdGpu::NumberFormat::Uscaled:
-    case AmdGpu::NumberFormat::Srgb:
+    switch (GetNumberClass(fmt)) {
+    case AmdGpu::NumberClass::Float:
        return ctx.F32;
-    case AmdGpu::NumberFormat::Sint:
+    case AmdGpu::NumberClass::Sint:
        return ctx.S32;
-    case AmdGpu::NumberFormat::Uint:
+    case AmdGpu::NumberClass::Uint:
        return ctx.U32;
    default:
        break;
@ -176,18 +171,12 @@ const VectorIds& GetAttributeType(EmitContext& ctx, AmdGpu::NumberFormat fmt) {

 EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat fmt, Id id,
                                                          u32 num_components, bool output) {
-    switch (fmt) {
-    case AmdGpu::NumberFormat::Float:
-    case AmdGpu::NumberFormat::Unorm:
-    case AmdGpu::NumberFormat::Snorm:
-    case AmdGpu::NumberFormat::SnormNz:
-    case AmdGpu::NumberFormat::Sscaled:
-    case AmdGpu::NumberFormat::Uscaled:
-    case AmdGpu::NumberFormat::Srgb:
+    switch (GetNumberClass(fmt)) {
+    case AmdGpu::NumberClass::Float:
        return {id, output ? output_f32 : input_f32, F32[1], num_components, false};
-    case AmdGpu::NumberFormat::Uint:
+    case AmdGpu::NumberClass::Uint:
        return {id, output ? output_u32 : input_u32, U32[1], num_components, true};
-    case AmdGpu::NumberFormat::Sint:
+    case AmdGpu::NumberClass::Sint:
        return {id, output ? output_s32 : input_s32, S32[1], num_components, true};
    default:
        break;
@ -280,13 +269,13 @@ void EmitContext::DefineInputs() {
        base_vertex = DefineVariable(U32[1], spv::BuiltIn::BaseVertex, spv::StorageClass::Input);
        instance_id = DefineVariable(U32[1], spv::BuiltIn::InstanceIndex, spv::StorageClass::Input);

-        const auto fetch_shader = info.LoadFetchShader();
+        const auto fetch_shader = Gcn::ParseFetchShader(info);
        if (!fetch_shader) {
            break;
        }
        for (const auto& attrib : fetch_shader->attributes) {
            ASSERT(attrib.semantic < IR::NumParams);
-            const auto sharp = info.GetSharp(attrib);
+            const auto sharp = attrib.GetSharp(info);
            const Id type{GetAttributeType(*this, sharp.GetNumberFmt())[4]};
            if (attrib.UsesStepRates()) {
                const u32 rate_idx =
--- a/src/shader_recompiler/frontend/fetch_shader.cpp
+++ b/src/shader_recompiler/frontend/fetch_shader.cpp
@ -34,7 +34,13 @@ namespace Shader::Gcn {
 * We take the reverse way, extract the original input semantics from these instructions.
 **/

-FetchShaderData ParseFetchShader(const u32* code) {
+std::optional<FetchShaderData> ParseFetchShader(const Shader::Info& info) {
+    if (!info.has_fetch_shader) {
+        return std::nullopt;
+    }
+    const u32* code;
+    std::memcpy(&code, &info.user_data[info.fetch_shader_sgpr_base], sizeof(code));
+
    FetchShaderData data{.code = code};
    GcnCodeSlice code_slice(code, code + std::numeric_limits<u32>::max());
    GcnDecodeContext decoder;
--- a/src/shader_recompiler/frontend/fetch_shader.h
+++ b/src/shader_recompiler/frontend/fetch_shader.h
@ -6,6 +6,7 @@
 #include <ranges>
 #include <vector>
 #include "common/types.h"
+#include "shader_recompiler/info.h"

 namespace Shader::Gcn {

@ -33,6 +34,10 @@ struct VertexAttribute {
        return step_rate == OverStepRate0 || step_rate == OverStepRate1;
    }

+    /// Reads this attribute's AmdGpu::Buffer sharp through info.ReadUdReg, addressed by
+    /// this attribute's sgpr_base and dword_offset.
+    [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Shader::Info& info) const noexcept {
+        auto sharp = info.ReadUdReg<AmdGpu::Buffer>(sgpr_base, dword_offset);
+        return sharp;
+    }
+
    bool operator==(const VertexAttribute& other) const {
        return semantic == other.semantic && dest_vgpr == other.dest_vgpr &&
               num_elements == other.num_elements && sgpr_base == other.sgpr_base &&
@ -59,6 +64,6 @@ struct FetchShaderData {
    }
 };

-FetchShaderData ParseFetchShader(const u32* code);
+std::optional<FetchShaderData> ParseFetchShader(const Shader::Info& info);

 } // namespace Shader::Gcn
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@ -370,7 +370,9 @@ void Translator::EmitFetch(const GcnInst& inst) {
    // Read the pointer to the fetch shader assembly.
    info.has_fetch_shader = true;
    info.fetch_shader_sgpr_base = inst.src[0].code;
-    const auto fetch_data = info.LoadFetchShader();
+
+    const auto fetch_data = ParseFetchShader(info);
+    ASSERT(fetch_data.has_value());

    if (Config::dumpShaders()) {
        using namespace Common::FS;
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@ -9,7 +9,6 @@
 #include <boost/container/static_vector.hpp>
 #include "common/assert.h"
 #include "common/types.h"
-#include "frontend/fetch_shader.h"
 #include "shader_recompiler/backend/bindings.h"
 #include "shader_recompiler/frontend/copy_shader.h"
 #include "shader_recompiler/ir/attribute.h"
@ -231,22 +230,6 @@ struct Info {
        bnd.user_data += ud_mask.NumRegs();
    }

-    [[nodiscard]] std::pair<u32, u32> GetDrawOffsets(
-        const AmdGpu::Liverpool::Regs& regs,
-        const std::optional<Gcn::FetchShaderData>& fetch_shader) const {
-        u32 vertex_offset = regs.index_offset;
-        u32 instance_offset = 0;
-        if (fetch_shader) {
-            if (vertex_offset == 0 && fetch_shader->vertex_offset_sgpr != -1) {
-                vertex_offset = user_data[fetch_shader->vertex_offset_sgpr];
-            }
-            if (fetch_shader->instance_offset_sgpr != -1) {
-                instance_offset = user_data[fetch_shader->instance_offset_sgpr];
-            }
-        }
-        return {vertex_offset, instance_offset};
-    }
-
    void RefreshFlatBuf() {
        flattened_ud_buf.resize(srt_info.flattened_bufsize_dw);
        ASSERT(user_data.size() <= NumUserDataRegs);
@ -256,20 +239,6 @@ struct Info {
            srt_info.walker_func(user_data.data(), flattened_ud_buf.data());
        }
    }
-
-    [[nodiscard]] std::optional<Gcn::FetchShaderData> LoadFetchShader() const {
-        if (!has_fetch_shader) {
-            return std::nullopt;
-        }
-        const u32* code;
-        std::memcpy(&code, &user_data[fetch_shader_sgpr_base], sizeof(code));
-        return Gcn::ParseFetchShader(code);
-    }
-
-    [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(
-        const Gcn::VertexAttribute& attrib) const noexcept {
-        return ReadUdReg<AmdGpu::Buffer>(attrib.sgpr_base, attrib.dword_offset);
-    }
 };

 constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept {
--- a/src/shader_recompiler/profile.h
+++ b/src/shader_recompiler/profile.h
@ -22,6 +22,7 @@ struct Profile {
    bool support_fp32_denorm_preserve{};
    bool support_fp32_denorm_flush{};
    bool support_explicit_workgroup_layout{};
+    bool support_legacy_vertex_attributes{};
    bool has_broken_spirv_clamp{};
    bool lower_left_origin_mode{};
    bool needs_manual_interpolation{};
--- a/src/shader_recompiler/specialization.h
+++ b/src/shader_recompiler/specialization.h
@ -13,6 +13,12 @@

 namespace Shader {

+/// Specialization key for a single vertex shader input attribute.
+struct VsAttribSpecialization {
+    /// Number class derived from the number format of the attribute's buffer sharp.
+    AmdGpu::NumberClass num_class = AmdGpu::NumberClass{};
+
+    /// Member-wise comparison; two entries match only when their number classes match.
+    auto operator<=>(const VsAttribSpecialization&) const = default;
+};
+
 struct BufferSpecialization {
    u16 stride : 14;
    u16 is_storage : 1;
@ -52,6 +58,7 @@ struct StageSpecialization {
    const Shader::Info* info;
    RuntimeInfo runtime_info;
    Gcn::FetchShaderData fetch_shader_data{};
+    boost::container::small_vector<VsAttribSpecialization, 32> vs_attribs;
    std::bitset<MaxStageResources> bitset{};
    boost::container::small_vector<BufferSpecialization, 16> buffers;
    boost::container::small_vector<TextureBufferSpecialization, 8> tex_buffers;
@ -59,11 +66,18 @@ struct StageSpecialization {
    boost::container::small_vector<FMaskSpecialization, 8> fmasks;
    Backend::Bindings start{};

-    explicit StageSpecialization(const Shader::Info& info_, RuntimeInfo runtime_info_,
-                                 Backend::Bindings start_)
+    explicit StageSpecialization(const Info& info_, RuntimeInfo runtime_info_,
+                                 const Profile& profile_, Backend::Bindings start_)
        : info{&info_}, runtime_info{runtime_info_}, start{start_} {
-        if (const auto fetch_shader = info_.LoadFetchShader()) {
+        if (const auto fetch_shader = Gcn::ParseFetchShader(info_)) {
            fetch_shader_data = *fetch_shader;
+            if (info_.stage == Stage::Vertex && !profile_.support_legacy_vertex_attributes) {
+                // Specialize shader on VS input number types to follow spec.
+                ForEachSharp(vs_attribs, fetch_shader_data.attributes,
+                             [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
+                                 spec.num_class = AmdGpu::GetNumberClass(sharp.GetNumberFmt());
+                             });
+            }
        }
        u32 binding{};
        if (info->has_readconst) {
@ -90,6 +104,17 @@ struct StageSpecialization {
                     });
    }

+    /// Builds one specialization entry per descriptor in desc_list.
+    /// An entry is appended to spec_list for every descriptor, so both lists stay index-aligned;
+    /// func(entry, descriptor, sharp) is only invoked when the descriptor's sharp tests true.
+    void ForEachSharp(auto& spec_list, auto& desc_list, auto&& func) {
+        for (const auto& descriptor : desc_list) {
+            auto& entry = spec_list.emplace_back();
+            if (const auto sharp = descriptor.GetSharp(*info); sharp) {
+                func(entry, descriptor, sharp);
+            }
+        }
+    }
+
    void ForEachSharp(u32& binding, auto& spec_list, auto& desc_list, auto&& func) {
        for (const auto& desc : desc_list) {
            auto& spec = spec_list.emplace_back();
@ -113,6 +138,11 @@ struct StageSpecialization {
        if (fetch_shader_data != other.fetch_shader_data) {
            return false;
        }
+        for (u32 i = 0; i < vs_attribs.size(); i++) {
+            if (vs_attribs[i] != other.vs_attribs[i]) {
+                return false;
+            }
+        }
        u32 binding{};
        if (info->has_readconst != other.info->has_readconst) {
            return false;
--- a/src/video_core/amdgpu/pixel_format.h
+++ b/src/video_core/amdgpu/pixel_format.h
@ -10,7 +10,24 @@

 namespace AmdGpu {

-[[nodiscard]] constexpr bool IsInteger(NumberFormat nfmt) {
+/// Coarse classification of a NumberFormat into the numeric type used on the shader side.
+/// Scoped so that Float/Sint/Uint do not leak into the AmdGpu namespace and cannot be
+/// implicitly converted to integers.
+enum class NumberClass {
+    Float,
+    Sint,
+    Uint,
+};
+
+/// Maps a number format to its number class.
+/// Sint and Uint map to their integer classes; every other format is treated as Float.
+[[nodiscard]] constexpr NumberClass GetNumberClass(const NumberFormat nfmt) {
+    switch (nfmt) {
+    case NumberFormat::Sint:
+        return NumberClass::Sint;
+    case NumberFormat::Uint:
+        return NumberClass::Uint;
+    default:
+        return NumberClass::Float;
+    }
+}
+
+[[nodiscard]] constexpr bool IsInteger(const NumberFormat nfmt) {
    return nfmt == AmdGpu::NumberFormat::Sint || nfmt == AmdGpu::NumberFormat::Uint;
 }

--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@ -158,7 +158,7 @@ bool BufferCache::BindVertexBuffers(
            continue;
        }

-        const auto& buffer = vs_info.GetSharp(attrib);
+        const auto& buffer = attrib.GetSharp(vs_info);
        if (buffer.GetSize() == 0) {
            continue;
        }
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@ -59,7 +59,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
                continue;
            }

-            const auto buffer = vs_info.GetSharp(attrib);
+            const auto buffer = attrib.GetSharp(vs_info);
            if (buffer.GetSize() == 0) {
                continue;
            }
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@ -4,6 +4,7 @@
 #include <xxhash.h>

 #include "common/types.h"
+#include "shader_recompiler/frontend/fetch_shader.h"
 #include "video_core/renderer_vulkan/liverpool_to_vk.h"
 #include "video_core/renderer_vulkan/vk_common.h"
 #include "video_core/renderer_vulkan/vk_pipeline_common.h"
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@ -265,6 +265,7 @@ bool Instance::CreateDevice() {
    const bool robustness = add_extension(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME);
    list_restart = add_extension(VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME);
    maintenance5 = add_extension(VK_KHR_MAINTENANCE_5_EXTENSION_NAME);
+    legacy_vertex_attributes = add_extension(VK_EXT_LEGACY_VERTEX_ATTRIBUTES_EXTENSION_NAME);

    // These extensions are promoted by Vulkan 1.3, but for greater compatibility we use Vulkan 1.2
    // with extensions.
@ -403,6 +404,9 @@ bool Instance::CreateDevice() {
        vk::PhysicalDeviceFragmentShaderBarycentricFeaturesKHR{
            .fragmentShaderBarycentric = true,
        },
+        vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT{
+            .legacyVertexAttributes = true,
+        },
 #ifdef __APPLE__
        feature_chain.get<vk::PhysicalDevicePortabilitySubsetFeaturesKHR>(),
 #endif
@ -445,6 +449,9 @@ bool Instance::CreateDevice() {
    if (!fragment_shader_barycentric) {
        device_chain.unlink<vk::PhysicalDeviceFragmentShaderBarycentricFeaturesKHR>();
    }
+    if (!legacy_vertex_attributes) {
+        device_chain.unlink<vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT>();
+    }

    auto [device_result, dev] = physical_device.createDeviceUnique(device_chain.get());
    if (device_result != vk::Result::eSuccess) {
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@ -148,10 +148,16 @@ public:
        return fragment_shader_barycentric;
    }

+    /// Returns true when VK_EXT_primitive_topology_list_restart is supported.
    bool IsListRestartSupported() const {
        return list_restart;
    }

+    /// Returns true when the VK_EXT_legacy_vertex_attributes device extension is supported.
+    bool IsLegacyVertexAttributesSupported() const {
+        return legacy_vertex_attributes;
+    }
+
    /// Returns true when geometry shaders are supported by the device
    bool IsGeometryStageSupported() const {
        return features.geometryShader;
@ -320,6 +326,7 @@ private:
    bool null_descriptor{};
    bool maintenance5{};
    bool list_restart{};
+    bool legacy_vertex_attributes{};
    u64 min_imported_host_pointer_alignment{};
    u32 subgroup_size{};
    bool tooling_info{};
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@ -169,6 +169,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
        .support_fp32_denorm_preserve = bool(vk12_props.shaderDenormPreserveFloat32),
        .support_fp32_denorm_flush = bool(vk12_props.shaderDenormFlushToZeroFloat32),
        .support_explicit_workgroup_layout = true,
+        .support_legacy_vertex_attributes = instance_.IsLegacyVertexAttributesSupported(),
        .needs_manual_interpolation = instance.IsFragmentShaderBarycentricSupported() &&
                                      instance.GetDriverID() == vk::DriverId::eNvidiaProprietary,
    };
@ -352,7 +353,7 @@ bool PipelineCache::RefreshGraphicsKey() {
            if (attrib.UsesStepRates()) {
                continue;
            }
-            const auto& buffer = vs_info->GetSharp(attrib);
+            const auto& buffer = attrib.GetSharp(*vs_info);
            if (buffer.GetSize() == 0) {
                continue;
            }
@ -436,7 +437,7 @@ PipelineCache::GetProgram(Shader::Stage stage, Shader::ShaderParams params,
        Program* program = program_pool.Create(stage, params);
        auto start = binding;
        const auto module = CompileModule(program->info, runtime_info, params.code, 0, binding);
-        const auto spec = Shader::StageSpecialization(program->info, runtime_info, start);
+        const auto spec = Shader::StageSpecialization(program->info, runtime_info, profile, start);
        program->AddPermut(module, std::move(spec));
        it_pgm.value() = program;
        return std::make_tuple(&program->info, module, spec.fetch_shader_data,
@ -446,7 +447,7 @@ PipelineCache::GetProgram(Shader::Stage stage, Shader::ShaderParams params,
    Program* program = it_pgm->second;
    auto& info = program->info;
    info.RefreshFlatBuf();
-    const auto spec = Shader::StageSpecialization(info, runtime_info, binding);
+    const auto spec = Shader::StageSpecialization(info, runtime_info, profile, binding);
    size_t perm_idx = program->modules.size();
    vk::ShaderModule module{};

--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@ -194,7 +194,16 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
    BeginRendering(*pipeline, state);
    UpdateDynamicState(*pipeline);

-    const auto [vertex_offset, instance_offset] = vs_info.GetDrawOffsets(regs, fetch_shader);
+    u32 vertex_offset = regs.index_offset;
+    u32 instance_offset = 0;
+    if (fetch_shader) {
+        if (vertex_offset == 0 && fetch_shader->vertex_offset_sgpr != -1) {
+            vertex_offset = vs_info.user_data[fetch_shader->vertex_offset_sgpr];
+        }
+        if (fetch_shader->instance_offset_sgpr != -1) {
+            instance_offset = vs_info.user_data[fetch_shader->instance_offset_sgpr];
+        }
+    }

    const auto cmdbuf = scheduler.CommandBuffer();
    cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());