From 3019bfb9787d296dfd2af568b94809f178ad8f3f Mon Sep 17 00:00:00 2001
From: baggins183
Date: Fri, 18 Jul 2025 02:04:50 -0700
Subject: [PATCH] Implement MUBUF instructions for shorts/bytes (#2856)

* implement loads/store instructions for types smaller than dwords

* initialize s16/s8 types

* set profile for int8/16/64

* also need to zero extend u8/u16 to u32 result

* document unrelated bugs with atomic fmin/max

* remove profile checks and simple emit for added opcodes

---------

Co-authored-by: georgemoralis
---
 .../backend/spirv/emit_spirv_atomic.cpp      |  2 +
 .../backend/spirv/emit_spirv_convert.cpp     | 68 +++++--------------
 .../backend/spirv/emit_spirv_instructions.h  |  2 +
 .../backend/spirv/spirv_emit_context.cpp     |  2 +
 .../frontend/translate/translate.h           |  7 +-
 .../frontend/translate/vector_memory.cpp     | 58 ++++++++++++++--
 src/shader_recompiler/ir/ir_emitter.cpp      | 18 +++++
 src/shader_recompiler/ir/ir_emitter.h        |  1 +
 src/shader_recompiler/ir/opcodes.inc         |  2 +
 src/video_core/renderer_vulkan/vk_instance.h | 15 ++++
 .../renderer_vulkan/vk_pipeline_cache.cpp    |  3 +
 11 files changed, 120 insertions(+), 58 deletions(-)

diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index 80c8b836b..fe2d64d2f 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -268,6 +268,7 @@ Id EmitBufferAtomicFMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     const auto sign_bit_set =
         ctx.OpBitFieldUExtract(ctx.U32[1], u32_value, ctx.ConstU32(31u), ctx.ConstU32(1u));
 
+    // FIXME this needs control flow because it currently executes both atomics
     const auto result = ctx.OpSelect(
         ctx.F32[1], sign_bit_set,
         EmitBitCastF32U32(ctx, EmitBufferAtomicUMax32(ctx, inst, handle, address, u32_value)),
@@ -302,6 +303,7 @@ Id EmitBufferAtomicFMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     const auto sign_bit_set =
         ctx.OpBitFieldUExtract(ctx.U32[1], u32_value, ctx.ConstU32(31u), ctx.ConstU32(1u));
 
+    // FIXME this needs control flow because it currently executes both atomics
     const auto result = ctx.OpSelect(
         ctx.F32[1], sign_bit_set,
         EmitBitCastF32U32(ctx, EmitBufferAtomicUMin32(ctx, inst, handle, address, u32_value)),
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
index c75f43393..2f4984f57 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
@@ -7,60 +7,32 @@ namespace Shader::Backend::SPIRV {
 namespace {
 
 Id ExtractU16(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpUConvert(ctx.U16, value);
-    } else {
-        return ctx.OpBitFieldUExtract(ctx.U32[1], value, ctx.u32_zero_value, ctx.ConstU32(16u));
-    }
+    return ctx.OpUConvert(ctx.U16, value);
 }
 
 Id ExtractS16(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpSConvert(ctx.S16, value);
-    } else {
-        return ctx.OpBitFieldSExtract(ctx.U32[1], value, ctx.u32_zero_value, ctx.ConstU32(16u));
-    }
+    return ctx.OpSConvert(ctx.S16, value);
 }
 
 Id ExtractU8(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int8) {
-        return ctx.OpUConvert(ctx.U8, value);
-    } else {
-        return ctx.OpBitFieldUExtract(ctx.U32[1], value, ctx.u32_zero_value, ctx.ConstU32(8u));
-    }
+    return ctx.OpUConvert(ctx.U8, value);
 }
 
 Id ExtractS8(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int8) {
-        return ctx.OpSConvert(ctx.S8, value);
-    } else {
-        return ctx.OpBitFieldSExtract(ctx.U32[1], value, ctx.u32_zero_value, ctx.ConstU32(8u));
-    }
+    return ctx.OpSConvert(ctx.S8, value);
 }
 } // Anonymous namespace
 
 Id EmitConvertS16F16(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpSConvert(ctx.U32[1], ctx.OpConvertFToS(ctx.U16, value));
-    } else {
-        return ExtractS16(ctx, ctx.OpConvertFToS(ctx.U32[1], value));
-    }
+    return ctx.OpSConvert(ctx.U32[1], ctx.OpConvertFToS(ctx.U16, value));
 }
 
 Id EmitConvertS16F32(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpSConvert(ctx.U32[1], ctx.OpConvertFToS(ctx.U16, value));
-    } else {
-        return ExtractS16(ctx, ctx.OpConvertFToS(ctx.U32[1], value));
-    }
+    return ctx.OpSConvert(ctx.U32[1], ctx.OpConvertFToS(ctx.U16, value));
 }
 
 Id EmitConvertS16F64(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpSConvert(ctx.U32[1], ctx.OpConvertFToS(ctx.U16, value));
-    } else {
-        return ExtractS16(ctx, ctx.OpConvertFToS(ctx.U32[1], value));
-    }
+    return ctx.OpSConvert(ctx.U32[1], ctx.OpConvertFToS(ctx.U16, value));
 }
 
 Id EmitConvertS32F16(EmitContext& ctx, Id value) {
@@ -88,27 +60,15 @@ Id EmitConvertS64F64(EmitContext& ctx, Id value) {
 }
 
 Id EmitConvertU16F16(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpUConvert(ctx.U32[1], ctx.OpConvertFToU(ctx.U16, value));
-    } else {
-        return ExtractU16(ctx, ctx.OpConvertFToU(ctx.U32[1], value));
-    }
+    return ctx.OpUConvert(ctx.U32[1], ctx.OpConvertFToU(ctx.U16, value));
 }
 
 Id EmitConvertU16F32(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpUConvert(ctx.U32[1], ctx.OpConvertFToU(ctx.U16, value));
-    } else {
-        return ExtractU16(ctx, ctx.OpConvertFToU(ctx.U32[1], value));
-    }
+    return ctx.OpUConvert(ctx.U32[1], ctx.OpConvertFToU(ctx.U16, value));
 }
 
 Id EmitConvertU16F64(EmitContext& ctx, Id value) {
-    if (ctx.profile.support_int16) {
-        return ctx.OpUConvert(ctx.U32[1], ctx.OpConvertFToU(ctx.U16, value));
-    } else {
-        return ExtractU16(ctx, ctx.OpConvertFToU(ctx.U32[1], value));
-    }
+    return ctx.OpUConvert(ctx.U32[1], ctx.OpConvertFToU(ctx.U16, value));
 }
 
 Id EmitConvertU32F16(EmitContext& ctx, Id value) {
@@ -271,4 +231,12 @@ Id EmitConvertU32U8(EmitContext& ctx, Id value) {
     return ctx.OpUConvert(ctx.U32[1], value);
 }
 
+Id EmitConvertS32S8(EmitContext& ctx, Id value) {
+    return ctx.OpSConvert(ctx.U32[1], value);
+}
+
+Id EmitConvertS32S16(EmitContext& ctx, Id value) {
+    return ctx.OpSConvert(ctx.U32[1], value);
+}
+
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 37d5d84c9..a8c58bdba 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -488,6 +488,8 @@ Id EmitConvertU16U32(EmitContext& ctx, Id value);
 Id EmitConvertU32U16(EmitContext& ctx, Id value);
 Id EmitConvertU8U32(EmitContext& ctx, Id value);
 Id EmitConvertU32U8(EmitContext& ctx, Id value);
+Id EmitConvertS32S8(EmitContext& ctx, Id value);
+Id EmitConvertS32S16(EmitContext& ctx, Id value);
 
 Id EmitImageSampleRaw(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address1, Id address2,
                       Id address3, Id address4);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index f373808d9..a8ffe6ae5 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -117,7 +117,9 @@ void EmitContext::DefineArithmeticTypes() {
     void_id = Name(TypeVoid(), "void_id");
     U1[1] = Name(TypeBool(), "bool_id");
     U8 = Name(TypeUInt(8), "u8_id");
+    S8 = Name(TypeSInt(8), "i8_id");
     U16 = Name(TypeUInt(16), "u16_id");
+    S16 = Name(TypeSInt(16), "i16_id");
     if (info.uses_fp16) {
         F16[1] = Name(TypeFloat(16), "f16_id");
         U16 = Name(TypeUInt(16), "u16_id");
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index a29bdc993..ae904b822 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -281,9 +281,10 @@ public:
 
     // Buffer Memory
     // MUBUF / MTBUF
-    void BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed, const GcnInst& inst);
-    void BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
-                      const GcnInst& inst);
+    void BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed, const GcnInst& inst,
+                     u32 scalar_width = 32, bool is_signed = false);
+    void BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed, const GcnInst& inst,
+                      u32 scalar_width = 32);
     template
     void BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst);
 
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index df20f7f73..ec9bc200d 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -28,6 +28,15 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
     case Opcode::BUFFER_LOAD_FORMAT_XYZW:
         return BUFFER_LOAD(4, false, true, inst);
 
+    case Opcode::BUFFER_LOAD_UBYTE:
+        return BUFFER_LOAD(1, false, false, inst, 8, false);
+    case Opcode::BUFFER_LOAD_SBYTE:
+        return BUFFER_LOAD(1, false, false, inst, 8, true);
+    case Opcode::BUFFER_LOAD_USHORT:
+        return BUFFER_LOAD(1, false, false, inst, 16, false);
+    case Opcode::BUFFER_LOAD_SSHORT:
+        return BUFFER_LOAD(1, false, false, inst, 16, true);
+
     case Opcode::BUFFER_LOAD_DWORD:
         return BUFFER_LOAD(1, false, false, inst);
     case Opcode::BUFFER_LOAD_DWORDX2:
@@ -56,6 +65,11 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
     case Opcode::TBUFFER_STORE_FORMAT_XYZW:
         return BUFFER_STORE(4, true, false, inst);
 
+    case Opcode::BUFFER_STORE_BYTE:
+        return BUFFER_STORE(1, false, false, inst, 8);
+    case Opcode::BUFFER_STORE_SHORT:
+        return BUFFER_STORE(1, false, false, inst, 16);
+
     case Opcode::BUFFER_STORE_DWORD:
         return BUFFER_STORE(1, false, false, inst);
     case Opcode::BUFFER_STORE_DWORDX2:
@@ -186,7 +200,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
 }
 
 void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
-                             const GcnInst& inst) {
+                             const GcnInst& inst, u32 scalar_width, bool is_signed) {
     const auto& mubuf = inst.control.mubuf;
     const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};
@@ -242,7 +256,26 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_
             ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
         }
     } else {
-        const IR::Value value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
+        IR::Value value;
+        switch (scalar_width) {
+        case 8: {
+            IR::U8 byte_val = ir.LoadBufferU8(handle, address, buffer_info);
+            value = is_signed ? ir.SConvert(32, byte_val) : ir.UConvert(32, byte_val);
+            break;
+        }
+        case 16: {
+            IR::U16 short_val = ir.LoadBufferU16(handle, address, buffer_info);
+            value = is_signed ? ir.SConvert(32, short_val) : ir.UConvert(32, short_val);
+            break;
+        }
+        case 32:
+            value = ir.LoadBufferU32(num_dwords, handle, address, buffer_info);
+            break;
+
+        default:
+            UNREACHABLE();
+        }
+
         if (num_dwords == 1) {
             ir.SetVectorReg(dst_reg, IR::U32{value});
             return;
@@ -254,7 +287,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_
 }
 
 void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer_typed,
-                              const GcnInst& inst) {
+                              const GcnInst& inst, u32 scalar_width) {
     const auto& mubuf = inst.control.mubuf;
     const bool is_ring = mubuf.glc && mubuf.slc;
     const IR::VectorReg vaddr{inst.src[0].code};
@@ -314,8 +347,23 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_inst_typed, bool is_buffer
         }
         ir.StoreBufferFormat(handle, address, ir.CompositeConstruct(comps), buffer_info);
     } else {
-        const auto value = num_dwords == 1 ? comps[0] : ir.CompositeConstruct(comps);
-        ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
+        IR::Value value = num_dwords == 1 ? comps[0] : ir.CompositeConstruct(comps);
+        if (scalar_width != 32) {
+            value = ir.UConvert(scalar_width, IR::U32{value});
+        }
+        switch (scalar_width) {
+        case 8:
+            ir.StoreBufferU8(handle, address, IR::U8{value}, buffer_info);
+            break;
+        case 16:
+            ir.StoreBufferU16(handle, address, IR::U16{value}, buffer_info);
+            break;
+        case 32:
+            ir.StoreBufferU32(num_dwords, handle, address, value, buffer_info);
+            break;
+        default:
+            UNREACHABLE();
+        }
     }
 }
 
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 6ca86b2c0..a6d43d102 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -1979,6 +1979,24 @@ U8U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U8U16U32U64& value)
     throw NotImplementedException("Conversion from {} to {} bits", value.Type(), result_bitsize);
 }
 
+U8U16U32U64 IR::IREmitter::SConvert(size_t result_bitsize, const U8U16U32U64& value) {
+    switch (result_bitsize) {
+    case 32:
+        switch (value.Type()) {
+        case Type::U8:
+            return Inst(Opcode::ConvertS32S8, value);
+        case Type::U16:
+            return Inst(Opcode::ConvertS32S16, value);
+        default:
+            break;
+        }
+    default:
+        break;
+    }
+    throw NotImplementedException("Signed Conversion from {} to {} bits", value.Type(),
+                                  result_bitsize);
+}
+
 F16F32F64 IREmitter::FPConvert(size_t result_bitsize, const F16F32F64& value) {
     switch (result_bitsize) {
     case 16:
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index a105b042d..e4afb8739 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -325,6 +325,7 @@ public:
                                         const Value& value);
 
     [[nodiscard]] U8U16U32U64 UConvert(size_t result_bitsize, const U8U16U32U64& value);
+    [[nodiscard]] U8U16U32U64 SConvert(size_t result_bitsize, const U8U16U32U64& value);
     [[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value);
 
     [[nodiscard]] Value ImageAtomicIAdd(const Value& handle, const Value& coords,
diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc
index 747a27e35..280cd47ec 100644
--- a/src/shader_recompiler/ir/opcodes.inc
+++ b/src/shader_recompiler/ir/opcodes.inc
@@ -432,6 +432,8 @@ OPCODE(ConvertU16U32, U16, U32,
 OPCODE(ConvertU32U16, U32, U16, )
 OPCODE(ConvertU8U32, U8, U32, )
 OPCODE(ConvertU32U8, U32, U8, )
+OPCODE(ConvertS32S8, U32, U8, )
+OPCODE(ConvertS32S16, U32, U16, )
 
 // Image operations
 OPCODE(ImageSampleRaw, F32x4, Opaque, F32x4, F32x4, F32x4, F32, Opaque, )
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index 9be2d9520..b21e00a71 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -94,6 +94,21 @@ public:
         return features.shaderFloat64;
     }
 
+    /// Returns true if 64-bit ints are supported in shaders
+    bool IsShaderInt64Supported() const {
+        return features.shaderInt64;
+    }
+
+    /// Returns true if 16-bit ints are supported in shaders
+    bool IsShaderInt16Supported() const {
+        return features.shaderInt16;
+    }
+
+    /// Returns true if 8-bit ints are supported in shaders
+    bool IsShaderInt8Supported() const {
+        return vk12_features.shaderInt8;
+    }
+
     /// Returns true when VK_EXT_custom_border_color is supported
     bool IsCustomBorderColorSupported() const {
         return custom_border_color;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 4de8fd73b..5f8bd1534 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -203,6 +203,9 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
     profile = Shader::Profile{
         .supported_spirv = SpirvVersion1_6,
         .subgroup_size = instance.SubgroupSize(),
+        .support_int8 = instance.IsShaderInt8Supported(),
+        .support_int16 = instance.IsShaderInt16Supported(),
+        .support_int64 = instance.IsShaderInt64Supported(),
         .support_float64 = instance.IsShaderFloat64Supported(),
         .support_fp32_denorm_preserve = bool(vk12_props.shaderDenormPreserveFloat32),
         .support_fp32_denorm_flush = bool(vk12_props.shaderDenormFlushToZeroFloat32),