diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index f2e6279f4..37d7eea35 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -303,6 +303,11 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
         ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
         ctx.AddExtension("SPV_KHR_physical_storage_buffer");
     }
+    if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) {
+        ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout");
+        ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR);
+        ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR);
+    }
 }

 void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) {
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index a342b47b6..13fd8e180 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -1,6 +1,8 @@
 // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

+#include "common/div_ceil.h"
+#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h"
 #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"

@@ -15,42 +17,40 @@ std::pair<Id, Id> AtomicArgs(EmitContext& ctx) {
 Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value,
                    Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const Id shift_id{ctx.ConstU32(2U)};
-    const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+    const Id pointer{
+        ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)};
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value);
+    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value);
+    });
+}
+
+Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value,
+                   Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
+    const Id shift_id{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+    const Id pointer{
+        ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)};
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics, value);
+    });
 }

 Id SharedAtomicU32_IncDec(EmitContext& ctx, Id offset,
                           Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) {
     const Id shift_id{ctx.ConstU32(2U)};
-    const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+    const Id pointer{
+        ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)};
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics);
-}
-
-Id BufferAtomicU32BoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) {
-    if (Sirit::ValidId(buffer_size)) {
-        // Bounds checking enabled, wrap in a conditional branch to make sure that
-        // the atomic is not mistakenly executed when the index is out of bounds.
-        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size);
-        const Id ib_label = ctx.OpLabel();
-        const Id oob_label = ctx.OpLabel();
-        const Id end_label = ctx.OpLabel();
-        ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone);
-        ctx.OpBranchConditional(in_bounds, ib_label, oob_label);
-        ctx.AddLabel(ib_label);
-        const Id ib_result = emit_func();
-        ctx.OpBranch(end_label);
-        ctx.AddLabel(oob_label);
-        const Id oob_result = ctx.u32_zero_value;
-        ctx.OpBranch(end_label);
-        ctx.AddLabel(end_label);
-        return ctx.OpPhi(ctx.U32[1], ib_result, ib_label, oob_result, oob_label);
-    }
-    // Bounds checking not enabled, just perform the atomic operation.
-    return emit_func();
+    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics);
+    });
 }

 Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
@@ -63,7 +63,7 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id
     const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
     const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] {
+    return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] {
         return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, value);
     });
 }
@@ -79,11 +79,26 @@ Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
     const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
     const auto [scope, semantics]{AtomicArgs(ctx)};
-    return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] {
+    return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] {
         return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, semantics, value, cmp_value);
     });
 }

+Id BufferAtomicU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
+                   Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
+    const auto& buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
+    }
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u));
+    const auto [id, pointer_type] = buffer[EmitContext::PointerType::U64];
+    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<64>(ctx, index, buffer.size_qwords, [&] {
+        return (ctx.*atomic_func)(ctx.U64, ptr, scope, semantics, value);
+    });
+}
+
 Id ImageAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value,
                   Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const auto& texture = ctx.images[handle & 0xFFFF];
@@ -105,6 +120,10 @@ Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicIAdd);
 }

+Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicIAdd);
+}
+
 Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax);
 }
@@ -149,6 +168,10 @@ Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd);
 }

+Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd);
+}
+
 Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin);
 }
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h
new file mode 100644
index 000000000..3cd13d9cc
--- /dev/null
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h
@@ -0,0 +1,46 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
+#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
+
+namespace Shader::Backend::SPIRV {
+
+template <u32 bit_size>
+auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) {
+    Id zero_value{};
+    Id result_type{};
+    if constexpr (bit_size == 64) {
+        zero_value = ctx.u64_zero_value;
+        result_type = ctx.U64;
+    } else if constexpr (bit_size == 32) {
+        zero_value = ctx.u32_zero_value;
+        result_type = ctx.U32[1];
+    } else if constexpr (bit_size == 16) {
+        zero_value = ctx.u16_zero_value;
+        result_type = ctx.U16;
+    } else {
+        static_assert("type not supported");
+    }
+    if (Sirit::ValidId(buffer_size)) {
+        // Bounds checking enabled, wrap in a conditional branch to make sure that
+        // the atomic is not mistakenly executed when the index is out of bounds.
+        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size);
+        const Id ib_label = ctx.OpLabel();
+        const Id oob_label = ctx.OpLabel();
+        const Id end_label = ctx.OpLabel();
+        ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone);
+        ctx.OpBranchConditional(in_bounds, ib_label, oob_label);
+        ctx.AddLabel(ib_label);
+        const auto ib_result = emit_func();
+        ctx.OpBranch(end_label);
+        ctx.AddLabel(oob_label);
+        ctx.OpBranch(end_label);
+        ctx.AddLabel(end_label);
+        return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, oob_label);
+    }
+    // Bounds checking not enabled, just perform the atomic operation.
+    return emit_func();
+}
+
+} // namespace Shader::Backend::SPIRV
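Note (illustrative, not part of the patch): AccessBoundsCheck<N> generalizes the old
BufferAtomicU32BoundsCheck helper. When a valid size Id is supplied it emits an OpULessThan
test and an OpSelectionMerge/OpBranchConditional pair, runs emit_func only on the in-bounds
path, and joins the two paths with an OpPhi so out-of-bounds lanes observe the zero value of
the selected bit width. A hypothetical caller written in the same style as the emitters above;
EmitSharedAtomicUMax64 does not exist in this change and is shown only as a usage sketch:

    Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value) {
        // Same addressing as SharedAtomicU64: byte offset to qword index, then a
        // bounds-checked atomic against the 64-bit shared memory view.
        const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, ctx.ConstU32(3U))};
        const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
        const Id pointer{
            ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)};
        const auto [scope, semantics]{AtomicArgs(ctx)};
        return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
            return ctx.OpAtomicUMax(ctx.U64, pointer, scope, semantics, value);
        });
    }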
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 28d3d2626..3441c5a23 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -86,6 +86,7 @@ void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
 void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
@@ -127,6 +128,7 @@ void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value);
 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value);
 void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value);
+Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
index a75986ee0..d8ea20b8c 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp
@@ -1,6 +1,8 @@
 // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

+#include "common/div_ceil.h"
+#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h"
 #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"

@@ -8,50 +10,71 @@ namespace Shader::Backend::SPIRV {

 Id EmitLoadSharedU16(EmitContext& ctx, Id offset) {
     const Id shift_id{ctx.ConstU32(1U)};
-    const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id pointer = ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, index);
-    return ctx.OpLoad(ctx.U16, pointer);
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)};
+
+    return AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer = ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index);
+        return ctx.OpLoad(ctx.U16, pointer);
+    });
 }

 Id EmitLoadSharedU32(EmitContext& ctx, Id offset) {
     const Id shift_id{ctx.ConstU32(2U)};
-    const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index);
-    return ctx.OpLoad(ctx.U32[1], pointer);
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+
+    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index);
+        return ctx.OpLoad(ctx.U32[1], pointer);
+    });
 }

 Id EmitLoadSharedU64(EmitContext& ctx, Id offset) {
-    const Id shift_id{ctx.ConstU32(2U)};
-    const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)};
-    const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))};
-    const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)};
-    const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)};
-    return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer),
-                                    ctx.OpLoad(ctx.U32[1], rhs_pointer));
+    const Id shift_id{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+
+    return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer{ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)};
+        return ctx.OpLoad(ctx.U64, pointer);
+    });
 }

 void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) {
     const Id shift{ctx.ConstU32(1U)};
-    const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    const Id pointer = ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, word_offset);
-    ctx.OpStore(pointer, value);
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)};
+
+    AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer = ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index);
+        ctx.OpStore(pointer, value);
+        return ctx.OpUndef(ctx.U16);
+    });
 }

 void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) {
     const Id shift{ctx.ConstU32(2U)};
-    const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset);
-    ctx.OpStore(pointer, value);
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+
+    AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index);
+        ctx.OpStore(pointer, value);
+        return ctx.OpUndef(ctx.U32[1]);
+    });
 }

 void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) {
-    const Id shift{ctx.ConstU32(2U)};
-    const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)};
-    const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))};
-    const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)};
-    const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)};
-    ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U));
-    ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U));
+    const Id shift{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+
+    AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        const Id pointer{ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)};
+        ctx.OpStore(pointer, value);
+        return ctx.OpUndef(ctx.U64);
+    });
 }

 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 245ac7eb2..672856397 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -146,6 +146,7 @@ void EmitContext::DefineArithmeticTypes() {
     false_value = ConstantFalse(U1[1]);
     u8_one_value = Constant(U8, 1U);
     u8_zero_value = Constant(U8, 0U);
+    u16_zero_value = Constant(U16, 0U);
     u32_one_value = ConstU32(1U);
     u32_zero_value = ConstU32(0U);
     f32_zero_value = ConstF32(0.0f);
@@ -285,6 +286,8 @@ void EmitContext::DefineBufferProperties() {
         Name(buffer.size_shorts, fmt::format("buf{}_short_size", binding));
         buffer.size_dwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(2U));
         Name(buffer.size_dwords, fmt::format("buf{}_dword_size", binding));
+        buffer.size_qwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(3U));
+        Name(buffer.size_qwords, fmt::format("buf{}_qword_size", binding));
     }
 }

@@ -979,18 +982,27 @@ void EmitContext::DefineSharedMemory() {
     }
     ASSERT(info.stage == Stage::Compute);
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    const Id type16{TypeArray(U16, ConstU32(Common::DivCeil(shared_memory_size, 2U)))};
-    const Id type32{TypeArray(U32[1], ConstU32(Common::DivCeil(shared_memory_size, 4U)))};
-    shared_memory_u16_type = TypePointer(spv::StorageClass::Workgroup, type16);
-    shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type32);
-    shared_u16 = TypePointer(spv::StorageClass::Workgroup, U16);
-    shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]);
-    shared_memory_u16 = AddGlobalVariable(shared_memory_u16_type, spv::StorageClass::Workgroup);
-    shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup);
-    Name(shared_memory_u16, "shared_mem_u16");
-    Name(shared_memory_u32, "shared_mem_u32");
-    interfaces.push_back(shared_memory_u16);
-    interfaces.push_back(shared_memory_u32);
+
+    const auto make_type = [&](Id element_type, u32 element_size) {
+        const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)};
+        const Id array_type{TypeArray(element_type, ConstU32(num_elements))};
+        Decorate(array_type, spv::Decoration::ArrayStride, element_size);
+
+        const Id struct_type{TypeStruct(array_type)};
+        MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u);
+        Decorate(struct_type, spv::Decoration::Block);
+
+        const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type);
+        const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type);
+        const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup);
+        Decorate(variable, spv::Decoration::Aliased);
+        interfaces.push_back(variable);
+
+        return std::make_tuple(variable, element_pointer, pointer);
+    };
+    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u);
+    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u);
+    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u);
 }

 Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {
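Note (illustrative, not part of the change): under SPV_KHR_workgroup_memory_explicit_layout,
Workgroup variables that share storage must be Block-decorated structs with explicit offsets
and strides, and each must carry the Aliased decoration. That is why make_type wraps every
element array in a single-member struct and why each OpAccessChain above gained a
ctx.u32_zero_value index: it selects member 0 of that wrapper. Spelled out for the 32-bit
view, the lambda is equivalent to the following sketch:

    // Equivalent of make_type(U32[1], 4u); each decoration maps to a requirement of the
    // explicit-layout extension. Sketch only; the patch does this generically in the lambda.
    const u32 num_elements = Common::DivCeil(shared_memory_size, 4u);
    const Id array_type = TypeArray(U32[1], ConstU32(num_elements));
    Decorate(array_type, spv::Decoration::ArrayStride, 4u);       // explicit element stride
    const Id struct_type = TypeStruct(array_type);
    MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u); // explicit member offset
    Decorate(struct_type, spv::Decoration::Block);                // explicit-layout block
    shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, struct_type);
    shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]);
    shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup);
    Decorate(shared_memory_u32, spv::Decoration::Aliased);        // overlaps the u16/u64 views
    interfaces.push_back(shared_memory_u32);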
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 358481ed9..93c4ed265 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -235,17 +235,16 @@ public:
     Id false_value{};
     Id u8_one_value{};
     Id u8_zero_value{};
+    Id u16_zero_value{};
     Id u32_one_value{};
     Id u32_zero_value{};
     Id f32_zero_value{};
     Id u64_one_value{};
     Id u64_zero_value{};

-    Id shared_u8{};
     Id shared_u16{};
     Id shared_u32{};
-    Id shared_u32x2{};
-    Id shared_u32x4{};
+    Id shared_u64{};

     Id input_u32{};
     Id input_f32{};
@@ -285,14 +284,13 @@ public:
     Id image_u32{};
     Id image_f32{};

-    Id shared_memory_u8{};
     Id shared_memory_u16{};
     Id shared_memory_u32{};
-    Id shared_memory_u32x2{};
-    Id shared_memory_u32x4{};
+    Id shared_memory_u64{};

     Id shared_memory_u16_type{};
     Id shared_memory_u32_type{};
+    Id shared_memory_u64_type{};

     Id bary_coord_persp_id{};
     Id bary_coord_linear_id{};
@@ -321,6 +319,7 @@ public:
         Id size;
         Id size_shorts;
         Id size_dwords;
+        Id size_qwords;
        std::array aliases;

         const BufferSpv& operator[](PointerType alias) const {
diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index ee90ca611..3aa78ffaa 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -206,7 +206,7 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
             ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
         } else {
             ir.WriteShared(
-                64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)),
+                64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1))),
                 addr0);
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
@@ -214,14 +214,17 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
             ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
         } else {
             ir.WriteShared(
-                64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)),
+                64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1))),
                 addr1);
         }
     } else if (bit_size == 64) {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::Value data =
             ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
-        ir.WriteShared(bit_size, data, addr0);
+        ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0);
+    } else if (bit_size == 16) {
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
+        ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
     } else {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
@@ -293,22 +296,25 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride
         if (bit_size == 32) {
             ir.SetVectorReg(dst_reg++, IR::U32{data0});
         } else {
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)});
+            const auto vector = ir.UnpackUint2x32(IR::U64{data0});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
         const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1);
         if (bit_size == 32) {
             ir.SetVectorReg(dst_reg++, IR::U32{data1});
         } else {
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)});
-            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)});
+            const auto vector = ir.UnpackUint2x32(IR::U64{data1});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
         }
     } else if (bit_size == 64) {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
-        ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)});
-        ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)});
+        const auto vector = ir.UnpackUint2x32(IR::U64{data});
+        ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
+        ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
     } else if (bit_size == 16) {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         const IR::U16 data = IR::U16{ir.LoadShared(bit_size, is_signed, addr0)};
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 33a67b033..2c37c8099 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -298,7 +298,7 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) {
     case 32:
         return Inst(Opcode::LoadSharedU32, offset);
     case 64:
-        return Inst(Opcode::LoadSharedU64, offset);
+        return Inst(Opcode::LoadSharedU64, offset);
     default:
         UNREACHABLE_MSG("Invalid bit size {}", bit_size);
     }
@@ -320,10 +320,12 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset)
     }
 }

-U32F32 IREmitter::SharedAtomicIAdd(const U32& address, const U32F32& data) {
+U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) {
     switch (data.Type()) {
     case Type::U32:
         return Inst(Opcode::SharedAtomicIAdd32, address, data);
+    case Type::U64:
+        return Inst(Opcode::SharedAtomicIAdd64, address, data);
     default:
         ThrowInvalidType(data.Type());
     }
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index 0e41f4b2d..eae44ed04 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -99,7 +99,7 @@ public:
     [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset);
     void WriteShared(int bit_size, const Value& value, const U32& offset);

-    [[nodiscard]] U32F32 SharedAtomicIAdd(const U32& address, const U32F32& data);
+    [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data);
     [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed);
     [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed);
     [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data);
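Note (illustrative, not part of the diff): with SharedAtomicIAdd widened to U32U64 and the
SharedAtomicIAdd64 opcode below, a frontend path that needs a 64-bit shared atomic add would
follow the same PackUint2x32/UnpackUint2x32 pattern that DS_READ and DS_WRITE use above. A
sketch, assuming operand decoding has already produced addr0, data0 and dst_reg; the helper
itself is hypothetical:

    // Hypothetical helper; the IREmitter calls it uses are the ones shown in this patch.
    void EmitDsAddU64(IR::IREmitter& ir, const IR::U32& addr0, IR::VectorReg data0,
                      IR::VectorReg dst_reg) {
        // Pack the 32-bit register pair into a U64 operand for the shared atomic.
        const IR::U64 value{ir.PackUint2x32(
            ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)))};
        const IR::U64 prev{ir.SharedAtomicIAdd(addr0, value)};
        // Unpack the previous value back into the destination register pair.
        const auto vector = ir.UnpackUint2x32(prev);
        ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
        ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
    }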
U32, U32, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) @@ -118,6 +119,7 @@ OPCODE(StoreBufferFormatF32, Void, Opaq // Buffer atomic operations OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 ) +OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 ) OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 ) diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index d4759b32e..ba8d1cca6 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -34,8 +34,10 @@ void Visit(Info& info, const IR::Inst& inst) { info.uses_patches |= 1U << IR::GenericPatchIndex(patch); break; } + case IR::Opcode::LoadSharedU16: case IR::Opcode::LoadSharedU32: case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: info.uses_shared = true; diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index 25aaf257c..90e756e61 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -16,6 +16,7 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicSMax32: case IR::Opcode::SharedAtomicUMax32: @@ -33,9 +34,10 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ if (program.info.stage != Stage::Compute) { return; } - // Only perform the transform if the host shared memory is insufficient. + // Only perform the transform if the host shared memory is insufficient + // or the device does not support VK_KHR_workgroup_memory_explicit_layout const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size; - if (shared_memory_size <= profile.max_shared_memory_size) { + if (shared_memory_size <= profile.max_shared_memory_size && profile.supports_workgroup_explicit_memory_layout) { return; } // Add buffer binding for shared memory storage buffer. 
@@ -60,6 +62,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {})); continue; @@ -93,12 +96,19 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.Imm32(shared_memory_size)); const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset); switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU16: + inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {})); + break; case IR::Opcode::LoadSharedU32: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {})); break; case IR::Opcode::LoadSharedU64: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {})); break; + case IR::Opcode::WriteSharedU16: + ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {}); + inst.Invalidate(); + break; case IR::Opcode::WriteSharedU32: ir.StoreBufferU32(1, handle, address, inst.Arg(1), {}); inst.Invalidate(); diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 853e4854d..7d313180f 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -23,13 +23,13 @@ struct Profile { bool support_fp32_denorm_preserve{}; bool support_fp32_denorm_flush{}; bool support_fp32_round_to_zero{}; - bool support_explicit_workgroup_layout{}; bool support_legacy_vertex_attributes{}; bool supports_image_load_store_lod{}; bool supports_native_cube_calc{}; bool supports_trinary_minmax{}; bool supports_robust_buffer_access{}; bool supports_image_fp32_atomic_min_max{}; + bool supports_workgroup_explicit_memory_layout{}; bool has_broken_spirv_clamp{}; bool lower_left_origin_mode{}; bool needs_manual_interpolation{}; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 9584329f0..0135b667f 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -212,7 +212,8 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT, vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT, vk::PhysicalDevicePortabilitySubsetFeaturesKHR, - vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>(); + vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT, + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>(); features = feature_chain.get().features; const vk::StructureChain properties_chain = physical_device.getProperties2< @@ -283,6 +284,17 @@ bool Instance::CreateDevice() { LOG_INFO(Render_Vulkan, "- shaderImageFloat32AtomicMinMax: {}", shader_atomic_float2_features.shaderImageFloat32AtomicMinMax); } + workgroup_memory_explicit_layout = add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); + if (workgroup_memory_explicit_layout) { + workgroup_memory_explicit_layout_features = + feature_chain.get(); + LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayout: {}", + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout); + LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayoutScalarBlockLayout: {}", + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayoutScalarBlockLayout); + LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayout16BitAccess: {}", + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess); + } const bool 
calibrated_timestamps = TRACY_GPU_ENABLED ? add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) : false; @@ -420,6 +432,14 @@ bool Instance::CreateDevice() { .shaderImageFloat32AtomicMinMax = shader_atomic_float2_features.shaderImageFloat32AtomicMinMax, }, + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR{ + .workgroupMemoryExplicitLayout = + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout, + .workgroupMemoryExplicitLayoutScalarBlockLayout = + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayoutScalarBlockLayout, + .workgroupMemoryExplicitLayout16BitAccess = + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess, + }, #ifdef __APPLE__ portability_features, #endif @@ -452,6 +472,9 @@ bool Instance::CreateDevice() { if (!shader_atomic_float2) { device_chain.unlink(); } + if (!workgroup_memory_explicit_layout) { + device_chain.unlink(); + } auto [device_result, dev] = physical_device.createDeviceUnique(device_chain.get()); if (device_result != vk::Result::eSuccess) { diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 30848e8b7..9b9adb768 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -171,6 +171,11 @@ public: return shader_atomic_float2 && shader_atomic_float2_features.shaderImageFloat32AtomicMinMax; } + /// Returns true when VK_KHR_workgroup_memory_explicit_layout is supported. + bool IsWorkgroupMemoryExplicitLayoutSupported() const { + return workgroup_memory_explicit_layout && workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess; + } + /// Returns true when geometry shaders are supported by the device bool IsGeometryStageSupported() const { return features.geometryShader; @@ -349,6 +354,7 @@ private: vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT dynamic_state_3_features; vk::PhysicalDeviceRobustness2FeaturesEXT robustness2_features; vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT shader_atomic_float2_features; + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR workgroup_memory_explicit_layout_features; vk::DriverIdKHR driver_id; vk::UniqueDebugUtilsMessengerEXT debug_callback{}; std::string vendor_name; @@ -374,6 +380,7 @@ private: bool amd_gcn_shader{}; bool amd_shader_trinary_minmax{}; bool shader_atomic_float2{}; + bool workgroup_memory_explicit_layout{}; bool portability_subset{}; }; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index b72f77e55..55f1d3d22 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -209,7 +209,6 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, .support_fp32_denorm_preserve = bool(vk12_props.shaderDenormPreserveFloat32), .support_fp32_denorm_flush = bool(vk12_props.shaderDenormFlushToZeroFloat32), .support_fp32_round_to_zero = bool(vk12_props.shaderRoundingModeRTZFloat32), - .support_explicit_workgroup_layout = true, .support_legacy_vertex_attributes = instance_.IsLegacyVertexAttributesSupported(), .supports_image_load_store_lod = instance_.IsImageLoadStoreLodSupported(), .supports_native_cube_calc = instance_.IsAmdGcnShaderSupported(), @@ -217,6 +216,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, // TODO: Emitted bounds checks cause problems with phi control flow; needs to be fixed. 
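Note (illustrative, not part of the diff): IsWorkgroupMemoryExplicitLayoutSupported() also
requires workgroupMemoryExplicitLayout16BitAccess because SetupCapabilities above declares the
16-bit access capability whenever the explicit-layout path is taken; a device that exposes the
extension without that bit therefore keeps the storage-buffer fallback. Condensed, the
conditions that keep LDS in real workgroup memory look like this (assumed helper, not real code
in the repository):

    // Sketch: profile.supports_workgroup_explicit_memory_layout is filled from the Instance
    // query in PipelineCache below; SharedMemoryToStoragePass lowers LDS to a storage buffer
    // whenever either condition fails.
    bool KeepLdsInWorkgroupMemory(const Vulkan::Instance& instance, const Shader::Profile& profile,
                                  u32 shared_memory_size) {
        return instance.IsWorkgroupMemoryExplicitLayoutSupported() &&
               shared_memory_size <= profile.max_shared_memory_size;
    }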
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index b72f77e55..55f1d3d22 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -209,7 +209,6 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
         .support_fp32_denorm_preserve = bool(vk12_props.shaderDenormPreserveFloat32),
         .support_fp32_denorm_flush = bool(vk12_props.shaderDenormFlushToZeroFloat32),
         .support_fp32_round_to_zero = bool(vk12_props.shaderRoundingModeRTZFloat32),
-        .support_explicit_workgroup_layout = true,
         .support_legacy_vertex_attributes = instance_.IsLegacyVertexAttributesSupported(),
         .supports_image_load_store_lod = instance_.IsImageLoadStoreLodSupported(),
         .supports_native_cube_calc = instance_.IsAmdGcnShaderSupported(),
@@ -217,6 +216,7 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
         // TODO: Emitted bounds checks cause problems with phi control flow; needs to be fixed.
         .supports_robust_buffer_access = true, // instance_.IsRobustBufferAccess2Supported(),
         .supports_image_fp32_atomic_min_max = instance_.IsShaderAtomicFloatImage32MinMaxSupported(),
+        .supports_workgroup_explicit_memory_layout = instance_.IsWorkgroupMemoryExplicitLayoutSupported(),
         .needs_manual_interpolation = instance.IsFragmentShaderBarycentricSupported() &&
                                       instance.GetDriverID() == vk::DriverId::eNvidiaProprietary,
         .needs_lds_barriers = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary ||