From 2d298ef2370c637ab66f17a0859aedd080ccff5b Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Tue, 10 Jun 2025 04:45:42 -0700 Subject: [PATCH] shader_recompiler: Re-type non-32bit load/stores. --- .../backend/spirv/emit_spirv_bounds.h | 66 ++++++++-- .../spirv/emit_spirv_context_get_set.cpp | 118 +++++++++--------- .../backend/spirv/emit_spirv_convert.cpp | 8 ++ .../backend/spirv/emit_spirv_instructions.h | 4 + .../spirv/emit_spirv_shared_memory.cpp | 10 +- .../frontend/translate/data_share.cpp | 72 +++++++---- src/shader_recompiler/ir/ir_emitter.cpp | 36 ++++-- src/shader_recompiler/ir/ir_emitter.h | 13 +- src/shader_recompiler/ir/microinstruction.cpp | 1 + src/shader_recompiler/ir/opcodes.inc | 20 +-- .../ir/passes/hull_shader_transform.cpp | 12 +- .../ir/passes/lower_buffer_format_to_raw.cpp | 16 +-- .../ir/passes/resource_tracking_pass.cpp | 6 + .../ir/passes/ring_access_elimination.cpp | 6 +- .../passes/shared_memory_to_storage_pass.cpp | 40 ++++-- src/shader_recompiler/ir/value.h | 1 + 16 files changed, 276 insertions(+), 153 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h index 41e70c8c3..e66467c6b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h @@ -1,31 +1,54 @@ // SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#pragma once + #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { -template -auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - Id zero_value{}; +template +std::tuple ResolveTypeAndZero(EmitContext& ctx) { Id result_type{}; - if constexpr (bit_size == 64) { - zero_value = ctx.u64_zero_value; + Id zero_value{}; + if constexpr (bit_size == 64 && num_components == 1 && !is_float) { result_type = ctx.U64; + zero_value = ctx.u64_zero_value; } else if constexpr (bit_size == 32) { - zero_value = ctx.u32_zero_value; - result_type = ctx.U32[1]; - } else if constexpr (bit_size == 16) { - zero_value = ctx.u16_zero_value; + if (is_float) { + result_type = ctx.F32[num_components]; + zero_value = ctx.f32_zero_value; + } else { + result_type = ctx.U32[num_components]; + zero_value = ctx.u32_zero_value; + } + } else if constexpr (bit_size == 16 && num_components == 1 && !is_float) { result_type = ctx.U16; + zero_value = ctx.u16_zero_value; + } else if constexpr (bit_size == 8 && num_components == 1 && !is_float) { + result_type = ctx.U8; + zero_value = ctx.u8_zero_value; } else { - static_assert(false, "type not supported"); + static_assert(false, "Type not supported."); } + if (num_components > 1) { + std::array zero_ids; + zero_ids.fill(zero_value); + zero_value = ctx.ConstantComposite(result_type, zero_ids); + } + return {result_type, zero_value}; +} + +template +auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { if (Sirit::ValidId(buffer_size)) { // Bounds checking enabled, wrap in a conditional branch to make sure that // the atomic is not mistakenly executed when the index is out of bounds. - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size); + auto compare_index = index; + if (num_components > 1) { + compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1)); + } + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); const Id ib_label = ctx.OpLabel(); const Id end_label = ctx.OpLabel(); ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); @@ -36,6 +59,8 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun ctx.OpBranch(end_label); ctx.AddLabel(end_label); if (Sirit::ValidId(ib_result)) { + const auto [result_type, zero_value] = + ResolveTypeAndZero(ctx); return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, last_label); } else { return Id{0}; @@ -45,4 +70,21 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun return emit_func(); } +template +static Id LoadAccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result) { + if (Sirit::ValidId(buffer_size)) { + // Bounds checking enabled, wrap in a select. + auto compare_index = index; + if (num_components > 1) { + compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1)); + } + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); + const auto [result_type, zero_value] = + ResolveTypeAndZero(ctx); + return ctx.OpSelect(result_type, in_bounds, result, zero_value); + } + // Bounds checking not enabled, just return the plain value. + return result; +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 658d4759f..ccbe54d0a 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -11,6 +11,8 @@ #include +#include "emit_spirv_bounds.h" + namespace Shader::Backend::SPIRV { namespace { @@ -239,8 +241,8 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { } if (IR::IsParam(attr)) { - const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; - const auto& param{ctx.input_params.at(index)}; + const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)}; + const auto& param{ctx.input_params.at(param_index)}; if (param.buffer_handle >= 0) { const auto step_rate = EmitReadStepRate(ctx, param.id.value); const auto offset = ctx.OpIAdd( @@ -415,27 +417,6 @@ void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value) { ctx.OpStore(pointer, value); } -template -static Id EmitLoadBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result, - bool is_float) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a select. - const auto result_type = is_float ? ctx.F32[N] : ctx.U32[N]; - auto compare_index = index; - auto zero_value = is_float ? ctx.f32_zero_value : ctx.u32_zero_value; - if (N > 1) { - compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1)); - std::array zero_ids; - zero_ids.fill(zero_value); - zero_value = ctx.ConstantComposite(result_type, zero_ids); - } - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); - return ctx.OpSelect(result_type, in_bounds, result, zero_value); - } - // Bounds checking not enabled, just return the plain value. - return result; -} - template static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto flags = inst->Flags(); @@ -454,8 +435,9 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a const Id result_i = ctx.OpLoad(data_types[1], ptr_i); if (!flags.typed) { // Untyped loads have bounds checking per-component. - ids.push_back(EmitLoadBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords, - result_i, alias == PointerType::F32)); + ids.push_back(LoadAccessBoundsCheck < 32, 1, + alias == + PointerType::F32 > (ctx, index_i, spv_buffer.size_dwords, result_i)); } else { ids.push_back(result_i); } @@ -464,8 +446,8 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a const Id result = N == 1 ? ids[0] : ctx.OpCompositeConstruct(data_types[N], ids); if (flags.typed) { // Typed loads have single bounds check for the whole load. - return EmitLoadBufferBoundsCheck(ctx, index, spv_buffer.size_dwords, result, - alias == PointerType::F32); + return LoadAccessBoundsCheck < 32, N, + alias == PointerType::F32 > (ctx, index, spv_buffer.size_dwords, result); } return result; } @@ -477,8 +459,8 @@ Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { } const auto [id, pointer_type] = spv_buffer[PointerType::U8]; const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; - const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr))}; - return EmitLoadBufferBoundsCheck<1>(ctx, address, spv_buffer.size, result, false); + const Id result{ctx.OpLoad(ctx.U8, ptr)}; + return LoadAccessBoundsCheck<8>(ctx, address, spv_buffer.size, result); } Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -489,8 +471,8 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto [id, pointer_type] = spv_buffer[PointerType::U16]; const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr))}; - return EmitLoadBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts, result, false); + const Id result{ctx.OpLoad(ctx.U16, ptr)}; + return LoadAccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, result); } Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -509,6 +491,18 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) return EmitLoadBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address); } +Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + const auto& spv_buffer = ctx.buffers[handle]; + if (Sirit::ValidId(spv_buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + } + const auto [id, pointer_type] = spv_buffer[PointerType::U64]; + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; + const Id result{ctx.OpLoad(ctx.U64, ptr)}; + return LoadAccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, result); +} + Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { return EmitLoadBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address); } @@ -529,29 +523,6 @@ Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addr UNREACHABLE_MSG("SPIR-V instruction"); } -template -void EmitStoreBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a conditional branch. - auto compare_index = index; - if (N > 1) { - compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1)); - } - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); - const Id in_bounds_label = ctx.OpLabel(); - const Id merge_label = ctx.OpLabel(); - ctx.OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone); - ctx.OpBranchConditional(in_bounds, in_bounds_label, merge_label); - ctx.AddLabel(in_bounds_label); - emit_func(); - ctx.OpBranch(merge_label); - ctx.AddLabel(merge_label); - return; - } - // Bounds checking not enabled, just perform the store. - emit_func(); -} - template static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -569,19 +540,25 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I const Id index_i = i == 0 ? index : ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); const Id ptr_i = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i); const Id value_i = N == 1 ? value : ctx.OpCompositeExtract(data_types[1], value, i); - auto store_i = [&]() { ctx.OpStore(ptr_i, value_i); }; + auto store_i = [&] { + ctx.OpStore(ptr_i, value_i); + return Id{}; + }; if (!flags.typed) { // Untyped stores have bounds checking per-component. - EmitStoreBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords, store_i); + AccessBoundsCheck<32, 1, alias == PointerType::F32>( + ctx, index_i, spv_buffer.size_dwords, store_i); } else { store_i(); } } + return Id{}; }; if (flags.typed) { // Typed stores have single bounds check for the whole store. - EmitStoreBufferBoundsCheck(ctx, index, spv_buffer.size_dwords, store); + AccessBoundsCheck<32, N, alias == PointerType::F32>(ctx, index, spv_buffer.size_dwords, + store); } else { store(); } @@ -594,8 +571,10 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v } const auto [id, pointer_type] = spv_buffer[PointerType::U8]; const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; - const Id result{ctx.OpUConvert(ctx.U8, value)}; - EmitStoreBufferBoundsCheck<1>(ctx, address, spv_buffer.size, [&] { ctx.OpStore(ptr, result); }); + AccessBoundsCheck<8>(ctx, address, spv_buffer.size, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); } void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { @@ -606,9 +585,10 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id const auto [id, pointer_type] = spv_buffer[PointerType::U16]; const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpUConvert(ctx.U16, value)}; - EmitStoreBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts, - [&] { ctx.OpStore(ptr, result); }); + AccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); } void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -627,6 +607,20 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre EmitStoreBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address, value); } +void EmitStoreBufferU64(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { + const auto& spv_buffer = ctx.buffers[handle]; + if (Sirit::ValidId(spv_buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + } + const auto [id, pointer_type] = spv_buffer[PointerType::U64]; + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; + AccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); +} + void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { EmitStoreBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address, value); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp index 945fa6877..c75f43393 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp @@ -263,4 +263,12 @@ Id EmitConvertU32U16(EmitContext& ctx, Id value) { return ctx.OpUConvert(ctx.U32[1], value); } +Id EmitConvertU8U32(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U8, value); +} + +Id EmitConvertU32U8(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U32[1], value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index d0060a40d..daf1b973e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -69,6 +69,7 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); @@ -80,6 +81,7 @@ void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -462,6 +464,8 @@ Id EmitConvertF64U32(EmitContext& ctx, Id value); Id EmitConvertF64U64(EmitContext& ctx, Id value); Id EmitConvertU16U32(EmitContext& ctx, Id value); Id EmitConvertU32U16(EmitContext& ctx, Id value); +Id EmitConvertU8U32(EmitContext& ctx, Id value); +Id EmitConvertU32U8(EmitContext& ctx, Id value); Id EmitImageSampleRaw(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address1, Id address2, Id address3, Id address4); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp index 9b005bc7e..c59406499 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp @@ -13,10 +13,10 @@ Id EmitLoadSharedU16(EmitContext& ctx, Id offset) { const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)}; - return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + return AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] { const Id pointer = ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index); - return ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, pointer)); + return ctx.OpLoad(ctx.U16, pointer); }); } @@ -40,7 +40,7 @@ Id EmitLoadSharedU64(EmitContext& ctx, Id offset) { return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { const Id pointer{ ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)}; - return ctx.OpBitcast(ctx.U32[2], ctx.OpLoad(ctx.U64, pointer)); + return ctx.OpLoad(ctx.U64, pointer); }); } @@ -52,7 +52,7 @@ void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) { AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] { const Id pointer = ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index); - ctx.OpStore(pointer, ctx.OpUConvert(ctx.U16, value)); + ctx.OpStore(pointer, value); return Id{0}; }); } @@ -78,7 +78,7 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { const Id pointer{ ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)}; - ctx.OpStore(pointer, ctx.OpBitcast(ctx.U64, value)); + ctx.OpStore(pointer, value); return Id{0}; }); } diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index c6eea48a0..8ead93f78 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -217,28 +217,37 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); if (bit_size == 64) { - ir.WriteShared( - 64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)), - addr0); - } else { + ir.WriteShared(64, + ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), + ir.GetVectorReg(data0 + 1))), + addr0); + } else if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + } else if (bit_size == 16) { + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); if (bit_size == 64) { - ir.WriteShared( - 64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)), - addr1); - } else { + ir.WriteShared(64, + ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), + ir.GetVectorReg(data1 + 1))), + addr1); + } else if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + } else if (bit_size == 16) { + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1); } - } else if (bit_size == 64) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = - ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, data, addr0); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + if (bit_size == 64) { + const IR::Value data = + ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); + ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); + } else if (bit_size == 32) { + ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + } else if (bit_size == 16) { + ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); + } } } @@ -305,28 +314,37 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); if (bit_size == 64) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)}); - } else { + const auto vector = ir.UnpackUint2x32(IR::U64{data0}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data0}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); if (bit_size == 64) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)}); - } else { + const auto vector = ir.UnpackUint2x32(IR::U64{data1}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data1}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); } - } else if (bit_size == 64) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)}); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)}; - ir.SetVectorReg(dst_reg, data); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data}); + ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg, IR::U32{data}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); + } } } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 3d9b62e43..3d7cf71dc 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -294,11 +294,11 @@ void IREmitter::SetPatch(Patch patch, const F32& value) { Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { switch (bit_size) { case 16: - return Inst(Opcode::LoadSharedU16, offset); + return Inst(Opcode::LoadSharedU16, offset); case 32: return Inst(Opcode::LoadSharedU32, offset); case 64: - return Inst(Opcode::LoadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, offset); default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } @@ -373,12 +373,12 @@ U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) { return Inst(Opcode::ReadConstBuffer, handle, index); } -U32 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) { - return Inst(Opcode::LoadBufferU8, Flags{info}, handle, address); +U8 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU8, Flags{info}, handle, address); } -U32 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) { - return Inst(Opcode::LoadBufferU16, Flags{info}, handle, address); +U16 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU16, Flags{info}, handle, address); } Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& address, @@ -397,6 +397,10 @@ Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& } } +U64 IREmitter::LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU64, Flags{info}, handle, address); +} + Value IREmitter::LoadBufferF32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info) { switch (num_dwords) { @@ -417,12 +421,12 @@ Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, Buf return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address); } -void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U32& data, +void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U8& data, BufferInstInfo info) { Inst(Opcode::StoreBufferU8, Flags{info}, handle, address, data); } -void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U32& data, +void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U16& data, BufferInstInfo info) { Inst(Opcode::StoreBufferU16, Flags{info}, handle, address, data); } @@ -447,6 +451,11 @@ void IREmitter::StoreBufferU32(int num_dwords, const Value& handle, const Value& } } +void IREmitter::StoreBufferU64(const Value& handle, const Value& address, const U64& data, + BufferInstInfo info) { + Inst(Opcode::StoreBufferU64, Flags{info}, handle, address, data); +} + void IREmitter::StoreBufferF32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info) { switch (num_dwords) { @@ -1814,8 +1823,15 @@ F32F64 IREmitter::ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_s : ConvertUToF(dest_bitsize, src_bitsize, value); } -U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { +U8U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U8U16U32U64& value) { switch (result_bitsize) { + case 8: + switch (value.Type()) { + case Type::U32: + return Inst(Opcode::ConvertU8U32, value); + default: + break; + } case 16: switch (value.Type()) { case Type::U32: @@ -1825,6 +1841,8 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { } case 32: switch (value.Type()) { + case Type::U8: + return Inst(Opcode::ConvertU32U8, value); case Type::U16: return Inst(Opcode::ConvertU32U16, value); default: diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index b8853c065..215a35ee9 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -112,20 +112,23 @@ public: [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index); - [[nodiscard]] U32 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info); - [[nodiscard]] U32 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U8 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U16 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferU32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U64 LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferF32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info); - void StoreBufferU8(const Value& handle, const Value& address, const U32& data, + void StoreBufferU8(const Value& handle, const Value& address, const U8& data, BufferInstInfo info); - void StoreBufferU16(const Value& handle, const Value& address, const U32& data, + void StoreBufferU16(const Value& handle, const Value& address, const U16& data, BufferInstInfo info); void StoreBufferU32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); + void StoreBufferU64(const Value& handle, const Value& address, const U64& data, + BufferInstInfo info); void StoreBufferF32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); void StoreBufferFormat(const Value& handle, const Value& address, const Value& data, @@ -310,7 +313,7 @@ public: [[nodiscard]] F32F64 ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_signed, const Value& value); - [[nodiscard]] U16U32U64 UConvert(size_t result_bitsize, const U16U32U64& value); + [[nodiscard]] U8U16U32U64 UConvert(size_t result_bitsize, const U8U16U32U64& value); [[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value); [[nodiscard]] Value ImageAtomicIAdd(const Value& handle, const Value& coords, diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index f55c2b378..c2311afea 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -60,6 +60,7 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferU32x2: case Opcode::StoreBufferU32x3: case Opcode::StoreBufferU32x4: + case Opcode::StoreBufferU64: case Opcode::StoreBufferF32: case Opcode::StoreBufferF32x2: case Opcode::StoreBufferF32x3: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 79c0a9431..1621d2acf 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -30,12 +30,12 @@ OPCODE(EmitVertex, Void, OPCODE(EmitPrimitive, Void, ) // Shared memory operations -OPCODE(LoadSharedU16, U32, U32, ) +OPCODE(LoadSharedU16, U16, U32, ) OPCODE(LoadSharedU32, U32, U32, ) -OPCODE(LoadSharedU64, U32x2, U32, ) -OPCODE(WriteSharedU16, Void, U32, U32, ) +OPCODE(LoadSharedU64, U64, U32, ) +OPCODE(WriteSharedU16, Void, U32, U16, ) OPCODE(WriteSharedU32, Void, U32, U32, ) -OPCODE(WriteSharedU64, Void, U32, U32x2, ) +OPCODE(WriteSharedU64, Void, U32, U64, ) // Shared atomic operations OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) @@ -94,23 +94,25 @@ OPCODE(UndefU32, U32, OPCODE(UndefU64, U64, ) // Buffer operations -OPCODE(LoadBufferU8, U32, Opaque, Opaque, ) -OPCODE(LoadBufferU16, U32, Opaque, Opaque, ) +OPCODE(LoadBufferU8, U8, Opaque, Opaque, ) +OPCODE(LoadBufferU16, U16, Opaque, Opaque, ) OPCODE(LoadBufferU32, U32, Opaque, Opaque, ) OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, ) OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, ) OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, ) +OPCODE(LoadBufferU64, U64, Opaque, Opaque, ) OPCODE(LoadBufferF32, F32, Opaque, Opaque, ) OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, ) OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, ) OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, ) OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, ) -OPCODE(StoreBufferU8, Void, Opaque, Opaque, U32, ) -OPCODE(StoreBufferU16, Void, Opaque, Opaque, U32, ) +OPCODE(StoreBufferU8, Void, Opaque, Opaque, U8, ) +OPCODE(StoreBufferU16, Void, Opaque, Opaque, U16, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, ) OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, ) OPCODE(StoreBufferU32x4, Void, Opaque, Opaque, U32x4, ) +OPCODE(StoreBufferU64, Void, Opaque, Opaque, U64, ) OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, ) OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) @@ -406,6 +408,8 @@ OPCODE(ConvertF64U32, F64, U32, OPCODE(ConvertF32U16, F32, U16, ) OPCODE(ConvertU16U32, U16, U32, ) OPCODE(ConvertU32U16, U32, U16, ) +OPCODE(ConvertU8U32, U8, U32, ) +OPCODE(ConvertU32U8, U32, U8, ) // Image operations OPCODE(ImageSampleRaw, F32x4, Opaque, F32x4, F32x4, F32x4, F32, ) diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp index 5cf8a1525..156cb6628 100644 --- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp +++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp @@ -438,7 +438,9 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; const IR::U32 addr{inst.Arg(0)}; - const IR::U32 data{inst.Arg(1).Resolve()}; + const IR::Value data = num_dwords == 2 + ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) + : inst.Arg(1).Resolve(); const auto SetOutput = [&](IR::U32 addr, IR::U32 value, AttributeRegion output_kind, u32 off_dw) { @@ -466,10 +468,10 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info); if (num_dwords == 1) { - SetOutput(addr, data, region, 0); + SetOutput(addr, IR::U32{data}, region, 0); } else { for (auto i = 0; i < num_dwords; i++) { - SetOutput(addr, IR::U32{data.Inst()->Arg(i)}, region, i); + SetOutput(addr, IR::U32{ir.CompositeExtract(data, i)}, region, i); } } inst.Invalidate(); @@ -499,7 +501,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { ReadTessControlPointAttribute(addr, stride, ir, i, is_tcs_output_read); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; @@ -578,7 +580,7 @@ void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { const IR::F32 component = GetInput(addr, i); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; diff --git a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp index fcb86e3fb..bb36e2748 100644 --- a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp +++ b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp @@ -34,13 +34,13 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con interpreted = ir.Imm32(0.f); break; case AmdGpu::DataFormat::Format8: { - const auto unpacked = - ir.Unpack4x8(format_info.num_format, ir.LoadBufferU8(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU8(handle, address, info)); + const auto unpacked = ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } case AmdGpu::DataFormat::Format8_8: { - const auto raw = ir.LoadBufferU16(handle, address, info); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); const auto unpacked = ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0), ir.CompositeExtract(unpacked, 1)); @@ -51,8 +51,8 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con IR::U32{ir.LoadBufferU32(1, handle, address, info)}); break; case AmdGpu::DataFormat::Format16: { - const auto unpacked = - ir.Unpack2x16(format_info.num_format, ir.LoadBufferU16(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); + const auto unpacked = ir.Unpack2x16(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } @@ -126,7 +126,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I const auto packed = ir.Pack4x8(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU8(handle, address, packed, info); + ir.StoreBufferU8(handle, address, ir.UConvert(8, packed), info); break; } case AmdGpu::DataFormat::Format8_8: { @@ -134,7 +134,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I ir.CompositeConstruct(ir.CompositeExtract(real_value, 0), ir.CompositeExtract(real_value, 1), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format8_8_8_8: { @@ -145,7 +145,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I case AmdGpu::DataFormat::Format16: { const auto packed = ir.Pack2x16(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format16_16: { diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index bc6ef3711..ba96d1034 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -44,6 +44,7 @@ bool IsBufferStore(const IR::Inst& inst) { case IR::Opcode::StoreBufferU32x2: case IR::Opcode::StoreBufferU32x3: case IR::Opcode::StoreBufferU32x4: + case IR::Opcode::StoreBufferU64: case IR::Opcode::StoreBufferF32: case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: @@ -63,6 +64,7 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::LoadBufferU32x2: case IR::Opcode::LoadBufferU32x3: case IR::Opcode::LoadBufferU32x4: + case IR::Opcode::LoadBufferU64: case IR::Opcode::LoadBufferF32: case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: @@ -88,6 +90,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { case IR::Opcode::LoadBufferU16: case IR::Opcode::StoreBufferU16: return IR::Type::U16; + case IR::Opcode::LoadBufferU64: + case IR::Opcode::StoreBufferU64: + case IR::Opcode::BufferAtomicIAdd64: + return IR::Type::U64; case IR::Opcode::LoadBufferFormatF32: case IR::Opcode::StoreBufferFormatF32: // Formatted buffer loads can use a variety of types. diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index 02745bf9a..b292b41b9 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -39,11 +39,13 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim ASSERT(addr->Arg(1).IsImmediate()); offset = addr->Arg(1).U32(); } - IR::Value data = inst.Arg(1).Resolve(); + IR::Value data = is_composite ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) + : inst.Arg(1).Resolve(); for (s32 i = 0; i < num_components; i++) { const auto attrib = IR::Attribute::Param0 + (offset / 16); const auto comp = (offset / 4) % 4; - const IR::U32 value = IR::U32{is_composite ? data.Inst()->Arg(i) : data}; + const IR::U32 value = + IR::U32{is_composite ? ir.CompositeExtract(data, i) : data}; ir.SetAttribute(attrib, ir.BitCast(value), comp); offset += 4; } diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index f1d4cbc63..839a8ddc5 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -46,14 +46,8 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ profile.supports_workgroup_explicit_memory_layout)) { return; } - // Add buffer binding for shared memory storage buffer. const u32 binding = static_cast(program.info.buffers.size()); - program.info.buffers.push_back({ - .used_types = IR::Type::U32, - .inline_cbuf = AmdGpu::Buffer::Null(), - .buffer_type = BufferType::SharedMemory, - .is_written = true, - }); + IR::Type used_types{}; for (IR::Block* const block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { if (!IsSharedAccess(inst)) { @@ -67,19 +61,26 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ // Replace shared atomics first switch (inst.GetOpcode()) { case IR::Opcode::SharedAtomicIAdd32: + inst.ReplaceUsesWithAndRemove( + ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; + continue; case IR::Opcode::SharedAtomicIAdd64: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U64; continue; case IR::Opcode::SharedAtomicISub32: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicISub(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; continue; case IR::Opcode::SharedAtomicSMin32: case IR::Opcode::SharedAtomicUMin32: { const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {})); + used_types |= IR::Type::U32; continue; } case IR::Opcode::SharedAtomicSMax32: @@ -87,22 +88,28 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {})); + used_types |= IR::Type::U32; continue; } case IR::Opcode::SharedAtomicInc32: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {})); + used_types |= IR::Type::U32; continue; case IR::Opcode::SharedAtomicDec32: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {})); + used_types |= IR::Type::U32; continue; case IR::Opcode::SharedAtomicAnd32: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; continue; case IR::Opcode::SharedAtomicOr32: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; continue; case IR::Opcode::SharedAtomicXor32: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; continue; default: break; @@ -111,30 +118,43 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ switch (inst.GetOpcode()) { case IR::Opcode::LoadSharedU16: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {})); + used_types |= IR::Type::U16; break; case IR::Opcode::LoadSharedU32: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {})); + used_types |= IR::Type::U32; break; case IR::Opcode::LoadSharedU64: - inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {})); + inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {})); + used_types |= IR::Type::U64; break; case IR::Opcode::WriteSharedU16: - ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {}); + ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {}); inst.Invalidate(); + used_types |= IR::Type::U16; break; case IR::Opcode::WriteSharedU32: ir.StoreBufferU32(1, handle, address, inst.Arg(1), {}); inst.Invalidate(); + used_types |= IR::Type::U32; break; case IR::Opcode::WriteSharedU64: - ir.StoreBufferU32(2, handle, address, inst.Arg(1), {}); + ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {}); inst.Invalidate(); + used_types |= IR::Type::U64; break; default: break; } } } + // Add buffer binding for shared memory storage buffer. + program.info.buffers.push_back({ + .used_types = used_types, + .inline_cbuf = AmdGpu::Buffer::Null(), + .buffer_type = BufferType::SharedMemory, + .is_written = true, + }); } } // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index ed1e5536a..b92c5d555 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -265,6 +265,7 @@ using U32F32 = TypedValue; using U64F64 = TypedValue; using U32U64 = TypedValue; using U16U32U64 = TypedValue; +using U8U16U32U64 = TypedValue; using F32F64 = TypedValue; using F16F32F64 = TypedValue; using UAny = TypedValue;