shader_recompiler: Remove existing shared memory emulation

* The current impl relies on a backend-side implementation that hooks into every shared memory access. It also doesn't handle atomics. It will be replaced by an IR pass that solves these issues.
IndecisiveTurtle 2025-02-14 11:52:28 +02:00
parent b0dd81a2b9
commit 174107b410
7 changed files with 22 additions and 127 deletions
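Note: the replacement IR pass mentioned in the commit message is not part of this commit. As a rough, hypothetical sketch of the idea (the Inst, Opcode, and LowerSharedMemoryPass names below are illustrative, not shadPS4 API), such a pass could rewrite every shared-memory instruction, atomics included, in one place instead of branching inside each backend emitter:

// Hypothetical sketch of the kind of IR pass the commit message describes.
// None of these types exist in the shadPS4 codebase; they only model the idea.
#include <cstdint>
#include <vector>

enum class Opcode {
    LoadSharedU32,
    WriteSharedU32,
    SharedAtomicIAdd32,
    LoadBufferU32,
    WriteBufferU32,
    BufferAtomicIAdd32,
    Other,
};

struct Inst {
    Opcode op{Opcode::Other};
};

// If the requested LDS size fits the device limit, keep real workgroup memory;
// otherwise rewrite shared ops into storage-buffer ops, so the backend needs no
// per-access hooks and atomics are covered by the same mapping.
void LowerSharedMemoryPass(std::vector<Inst>& program, uint32_t shared_mem_size,
                           uint32_t device_shared_mem_limit) {
    if (shared_mem_size <= device_shared_mem_limit) {
        return;
    }
    for (Inst& inst : program) {
        switch (inst.op) {
        case Opcode::LoadSharedU32:
            inst.op = Opcode::LoadBufferU32;
            break;
        case Opcode::WriteSharedU32:
            inst.op = Opcode::WriteBufferU32;
            break;
        case Opcode::SharedAtomicIAdd32:
            inst.op = Opcode::BufferAtomicIAdd32;
            break;
        default:
            break;
        }
    }
}

Doing the rewrite once at the IR level is what lets the diff below delete the duplicated has_emulated_shared_memory branches from the SPIR-V backend, the pipeline setup, and the rasterizer.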


@@ -9,65 +9,35 @@ namespace Shader::Backend::SPIRV {
Id EmitLoadSharedU32(EmitContext& ctx, Id offset) {
const Id shift_id{ctx.ConstU32(2U)};
const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
if (ctx.info.has_emulated_shared_memory) {
const Id pointer =
ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index);
return ctx.OpLoad(ctx.U32[1], pointer);
} else {
const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index);
return ctx.OpLoad(ctx.U32[1], pointer);
}
}
Id EmitLoadSharedU64(EmitContext& ctx, Id offset) {
const Id shift_id{ctx.ConstU32(2U)};
const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))};
if (ctx.info.has_emulated_shared_memory) {
const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
ctx.u32_zero_value, base_index)};
const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
ctx.u32_zero_value, next_index)};
return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer),
ctx.OpLoad(ctx.U32[1], rhs_pointer));
} else {
const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)};
const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)};
return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer),
ctx.OpLoad(ctx.U32[1], rhs_pointer));
}
}
void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) {
const Id shift{ctx.ConstU32(2U)};
const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
if (ctx.info.has_emulated_shared_memory) {
const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
ctx.u32_zero_value, word_offset);
ctx.OpStore(pointer, value);
} else {
const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset);
ctx.OpStore(pointer, value);
}
}
void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) {
const Id shift{ctx.ConstU32(2U)};
const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))};
if (ctx.info.has_emulated_shared_memory) {
const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
ctx.u32_zero_value, word_offset)};
const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32,
ctx.u32_zero_value, next_offset)};
ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U));
ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U));
} else {
const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)};
const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)};
ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U));
ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U));
}
}
} // namespace Shader::Backend::SPIRV
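Both the removed emulated path and the surviving path above encode the same word-indexed addressing; as a plain C++ restatement for readers unfamiliar with the SPIR-V emitter (illustrative only, not code from the repository, and the array length of 512 is an arbitrary example):

#include <array>
#include <cstdint>
#include <utility>

// Byte offsets are shifted right by 2 to index a u32-typed shared array, and a
// 64-bit access reads two consecutive words.
std::pair<uint32_t, uint32_t> LoadShared64(const std::array<uint32_t, 512>& shared_mem,
                                           uint32_t byte_offset) {
    const uint32_t base_index = byte_offset >> 2; // OpShiftRightArithmetic by ConstU32(2U)
    const uint32_t next_index = base_index + 1;   // OpIAdd for the upper word
    return {shared_mem[base_index], shared_mem[next_index]}; // OpCompositeConstruct of two OpLoads
}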


@@ -5,7 +5,6 @@
#include "common/div_ceil.h"
#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
#include "shader_recompiler/frontend/fetch_shader.h"
#include "shader_recompiler/ir/passes/srt.h"
#include "shader_recompiler/runtime_info.h"
#include "video_core/amdgpu/types.h"
@@ -805,51 +804,18 @@ void EmitContext::DefineImagesAndSamplers() {
}
void EmitContext::DefineSharedMemory() {
static constexpr size_t DefaultSharedMemSize = 2_KB;
if (!info.uses_shared) {
return;
}
ASSERT(info.stage == Stage::Compute);
const u32 max_shared_memory_size = profile.max_shared_memory_size;
u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
if (shared_memory_size == 0) {
shared_memory_size = DefaultSharedMemSize;
}
const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
const u32 num_elements{Common::DivCeil(shared_memory_size, 4U)};
const Id type{TypeArray(U32[1], ConstU32(num_elements))};
if (shared_memory_size <= max_shared_memory_size) {
shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type);
shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]);
shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup);
Name(shared_memory_u32, "shared_mem");
interfaces.push_back(shared_memory_u32);
} else {
shared_memory_u32_type = TypePointer(spv::StorageClass::StorageBuffer, type);
shared_u32 = TypePointer(spv::StorageClass::StorageBuffer, U32[1]);
Decorate(type, spv::Decoration::ArrayStride, 4);
const Id struct_type{TypeStruct(type)};
Name(struct_type, "shared_memory_buf");
Decorate(struct_type, spv::Decoration::Block);
MemberName(struct_type, 0, "data");
MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
const Id struct_pointer_type{TypePointer(spv::StorageClass::StorageBuffer, struct_type)};
const Id ssbo_id{AddGlobalVariable(struct_pointer_type, spv::StorageClass::StorageBuffer)};
Decorate(ssbo_id, spv::Decoration::Binding, binding.unified++);
Decorate(ssbo_id, spv::Decoration::DescriptorSet, 0U);
Name(ssbo_id, "shared_mem_ssbo");
shared_memory_u32 = ssbo_id;
info.has_emulated_shared_memory = true;
info.shared_memory_size = shared_memory_size;
interfaces.push_back(ssbo_id);
}
}
Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {
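As background for the removed size check (not part of this diff): the profile.max_shared_memory_size value the deleted branch compared against typically corresponds to the Vulkan device limit below. A minimal sketch, assuming physical_device is a valid VkPhysicalDevice obtained during instance setup:

#include <vulkan/vulkan.h>
#include <cstdint>

uint32_t QueryMaxComputeSharedMemory(VkPhysicalDevice physical_device) {
    VkPhysicalDeviceProperties props{};
    vkGetPhysicalDeviceProperties(physical_device, &props);
    // Workgroup (shared) memory limit in bytes; commonly 32768 or 65536 on desktop GPUs.
    return props.limits.maxComputeSharedMemorySize;
}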


@@ -507,7 +507,6 @@ void Translator::EmitFetch(const GcnInst& inst) {
info.buffers.push_back({
.sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4),
.used_types = IR::Type::F32,
.is_instance_data = true,
.instance_attrib = attrib.semantic,
});
}


@@ -2,7 +2,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <algorithm>
#include <span>
#include <vector>
#include <boost/container/small_vector.hpp>
@@ -19,7 +18,6 @@
#include "shader_recompiler/params.h"
#include "shader_recompiler/profile.h"
#include "shader_recompiler/runtime_info.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/amdgpu/resource.h"
namespace Shader {
@@ -51,7 +49,6 @@ struct BufferResource {
IR::Type used_types;
AmdGpu::Buffer inline_cbuf;
BufferType buffer_type;
bool is_instance_data{};
u8 instance_attrib{};
bool is_written{};
bool is_formatted{};
@@ -203,10 +200,8 @@ struct Info {
bool uses_unpack_10_11_11{};
bool stores_tess_level_outer{};
bool stores_tess_level_inner{};
bool translation_failed{}; // indicates that shader has unsupported instructions
bool has_emulated_shared_memory{};
bool translation_failed{};
bool has_readconst{};
u32 shared_memory_size{};
u8 mrt_mask{0u};
bool has_fetch_shader{false};
u32 fetch_shader_sgpr_base{0u};
@@ -243,10 +238,8 @@ struct Info {
}
void AddBindings(Backend::Bindings& bnd) const {
const auto total_buffers =
buffers.size() + (has_emulated_shared_memory ? 1 : 0);
bnd.buffer += total_buffers;
bnd.unified += total_buffers + images.size() + samplers.size();
bnd.buffer += buffers.size();
bnd.unified += buffers.size() + images.size() + samplers.size();
bnd.user_data += ud_mask.NumRegs();
}


@@ -98,9 +98,6 @@ struct StageSpecialization {
});
}
u32 binding{};
if (info->has_emulated_shared_memory) {
binding++;
}
ForEachSharp(binding, buffers, info->buffers,
[profile_](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
spec.stride = sharp.GetStride();
@@ -192,12 +189,6 @@
}
}
u32 binding{};
if (info->has_emulated_shared_memory != other.info->has_emulated_shared_memory) {
return false;
}
if (info->has_emulated_shared_memory) {
binding++;
}
for (u32 i = 0; i < buffers.size(); i++) {
if (other.bitset[binding++] && buffers[i] != other.buffers[i]) {
return false;


@@ -27,15 +27,6 @@ ComputePipeline::ComputePipeline(const Instance& instance, Scheduler& scheduler,
u32 binding{};
boost::container::small_vector<vk::DescriptorSetLayoutBinding, 32> bindings;
if (info->has_emulated_shared_memory) {
bindings.push_back({
.binding = binding++,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
});
}
for (const auto& buffer : info->buffers) {
const auto sharp = buffer.GetSharp(*info);
bindings.push_back({


@@ -519,27 +519,12 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
}
}
// Bind a SSBO to act as shared memory in case of not being able to use a workgroup buffer
// (e.g. when the compute shared memory is bigger than the GPU's shared memory)
if (stage.has_emulated_shared_memory) {
const auto* lds_buf = buffer_cache.GetLdsBuffer();
buffer_infos.emplace_back(lds_buf->Handle(), 0, lds_buf->SizeBytes());
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding.unified++,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eStorageBuffer,
.pBufferInfo = &buffer_infos.back(),
});
++binding.buffer;
}
// Second pass to re-bind buffers that were updated after binding
for (u32 i = 0; i < buffer_bindings.size(); i++) {
const auto& [buffer_id, vsharp] = buffer_bindings[i];
const auto& desc = stage.buffers[i];
const bool is_storage = desc.IsStorage(vsharp, pipeline_cache.GetProfile());
// Buffer is not from the cache, either a special buffer or unbound.
if (!buffer_id) {
if (desc.buffer_type == Shader::BufferType::GdsBuffer) {
const auto* gds_buf = buffer_cache.GetGdsBuffer();