diff --git a/src/common/number_utils.cpp b/src/common/number_utils.cpp
index af29e5cd3..660f539f9 100644
--- a/src/common/number_utils.cpp
+++ b/src/common/number_utils.cpp
@@ -158,4 +158,4 @@ float S16ToSnorm(s16 val) {
     return float(val * c);
 }
 
-} // namespace NumberUtils
\ No newline at end of file
+} // namespace NumberUtils
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 98f2195c5..a5cdca8f1 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -305,19 +305,23 @@ void SetupCapabilities(const Info& info, const Profile& profile, const RuntimeIn
             runtime_info.fs_info.addr_flags.persp_sample_ena) {
             ctx.AddCapability(spv::Capability::SampleRateShading);
         }
+        if (info.loads.GetAny(IR::Attribute::RenderTargetIndex)) {
+            ctx.AddCapability(spv::Capability::Geometry);
+        }
     }
     if (stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) {
         ctx.AddCapability(spv::Capability::Tessellation);
     }
     if (stage == LogicalStage::Vertex || stage == LogicalStage::TessellationControl ||
         stage == LogicalStage::TessellationEval) {
-        if (info.has_layer_output) {
+        if (info.stores.GetAny(IR::Attribute::RenderTargetIndex)) {
             ctx.AddCapability(spv::Capability::ShaderLayer);
         }
-        if (info.has_viewport_index_output) {
+        if (info.stores.GetAny(IR::Attribute::ViewportIndex)) {
            ctx.AddCapability(spv::Capability::ShaderViewportIndex);
         }
-    } else if (stage == LogicalStage::Geometry && info.has_viewport_index_output) {
+    } else if (stage == LogicalStage::Geometry &&
+               info.stores.GetAny(IR::Attribute::ViewportIndex)) {
         ctx.AddCapability(spv::Capability::MultiViewport);
     }
     if (info.uses_dma) {
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 6df8f74fd..554448b13 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -3,7 +3,6 @@
 
 #include "common/assert.h"
 #include "common/config.h"
-#include "common/logging/log.h"
 #include "shader_recompiler/backend/spirv/emit_spirv_bounds.h"
 #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"
@@ -14,55 +13,11 @@
 #include
 
 namespace Shader::Backend::SPIRV {
-namespace {
-Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) {
-    if (IR::IsParam(attr)) {
-        const u32 attr_index{u32(attr) - u32(IR::Attribute::Param0)};
-        if (ctx.stage == Stage::Local) {
-            const auto component_ptr = ctx.TypePointer(spv::StorageClass::Output, ctx.F32[1]);
-            return ctx.OpAccessChain(component_ptr, ctx.output_attr_array, ctx.ConstU32(attr_index),
-                                     ctx.ConstU32(element));
-        } else {
-            const auto& info{ctx.output_params.at(attr_index)};
-            ASSERT(info.num_components > 0);
-            if (info.num_components == 1) {
-                return info.id;
-            } else {
-                return ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element));
-            }
-        }
-    }
-    if (IR::IsMrt(attr)) {
-        const u32 index{u32(attr) - u32(IR::Attribute::RenderTarget0)};
-        const auto& info{ctx.frag_outputs.at(index)};
-        if (info.num_components == 1) {
-            return info.id;
-        } else {
-            return ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element));
-        }
-    }
-    switch (attr) {
-    case IR::Attribute::Position0:
-        return ctx.OpAccessChain(ctx.output_f32, ctx.output_position, ctx.ConstU32(element));
-    case IR::Attribute::ClipDistance:
-        return ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, ctx.ConstU32(element));
-    case IR::Attribute::CullDistance:
-        return ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, ctx.ConstU32(element));
-    case IR::Attribute::PointSize:
-        return ctx.output_point_size;
-    case IR::Attribute::RenderTargetIndex:
-        return ctx.output_layer;
-    case IR::Attribute::ViewportIndex:
-        return ctx.output_viewport_index;
-    case IR::Attribute::Depth:
-        return ctx.frag_depth;
-    default:
-        UNREACHABLE_MSG("Write attribute {}", attr);
-    }
-}
+
+using PointerType = EmitContext::PointerType;
+using PointerSize = EmitContext::PointerSize;
 
-std::pair<Id, bool> OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr) {
+static std::pair<Id, bool> OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr) {
     if (IR::IsParam(attr)) {
         const u32 index{u32(attr) - u32(IR::Attribute::Param0)};
         const auto& info{ctx.output_params.at(index)};
@@ -82,15 +37,13 @@ std::pair<Id, bool> OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr
         return {ctx.F32[1], false};
     case IR::Attribute::RenderTargetIndex:
     case IR::Attribute::ViewportIndex:
-        return {ctx.S32[1], true};
+    case IR::Attribute::SampleMask:
+    case IR::Attribute::StencilRef:
+        return {ctx.U32[1], true};
     default:
         UNREACHABLE_MSG("Write attribute {}", attr);
     }
 }
-} // Anonymous namespace
-
-using PointerType = EmitContext::PointerType;
-using PointerSize = EmitContext::PointerSize;
 
 Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) {
     const u32 index = ctx.binding.user_data + ctx.info.ud_mask.Index(reg);
@@ -212,6 +165,10 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
     case IR::Attribute::IsFrontFace:
         return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1[1], ctx.front_facing), ctx.u32_one_value,
                             ctx.u32_zero_value);
+    case IR::Attribute::SampleIndex:
+        return ctx.OpLoad(ctx.U32[1], ctx.sample_index);
+    case IR::Attribute::RenderTargetIndex:
+        return ctx.OpLoad(ctx.U32[1], ctx.output_layer);
     case IR::Attribute::PrimitiveId:
         return ctx.OpLoad(ctx.U32[1], ctx.primitive_id);
     case IR::Attribute::InvocationId:
@@ -243,12 +200,62 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) {
 }
 
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 element) {
-    const Id pointer{OutputAttrPointer(ctx, attr, element)};
-    const auto [component_type, is_integer]{OutputAttrComponentType(ctx, attr)};
-    if (is_integer) {
-        ctx.OpStore(pointer, ctx.OpBitcast(component_type, value));
-    } else {
-        ctx.OpStore(pointer, value);
+    const auto op_store = [&](Id pointer) {
+        const auto [component_type, is_integer] = OutputAttrComponentType(ctx, attr);
+        if (is_integer) {
+            ctx.OpStore(pointer, ctx.OpBitcast(component_type, value));
+        } else {
+            ctx.OpStore(pointer, value);
+        }
+    };
+    if (IR::IsParam(attr)) {
+        const u32 attr_index{u32(attr) - u32(IR::Attribute::Param0)};
+        if (ctx.stage == Stage::Local) {
+            const auto component_ptr = ctx.TypePointer(spv::StorageClass::Output, ctx.F32[1]);
+            return op_store(ctx.OpAccessChain(component_ptr, ctx.output_attr_array,
+                                              ctx.ConstU32(attr_index), ctx.ConstU32(element)));
+        } else {
+            const auto& info{ctx.output_params.at(attr_index)};
+            ASSERT(info.num_components > 0);
+            if (info.num_components == 1) {
+                return op_store(info.id);
+            } else {
+                return op_store(
+                    ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element)));
+            }
+        }
+    }
+    if (IR::IsMrt(attr)) {
+        const u32 index{u32(attr) - u32(IR::Attribute::RenderTarget0)};
+        const auto& info{ctx.frag_outputs.at(index)};
+        if (info.num_components == 1) {
+            return op_store(info.id);
+        } else {
+            return op_store(ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element)));
+        }
+    }
+    switch (attr) {
+    case IR::Attribute::Position0:
+        return op_store(
+            ctx.OpAccessChain(ctx.output_f32, ctx.output_position, ctx.ConstU32(element)));
+    case IR::Attribute::ClipDistance:
+        return op_store(
+            ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, ctx.ConstU32(element)));
+    case IR::Attribute::CullDistance:
+        return op_store(
+            ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, ctx.ConstU32(element)));
+    case IR::Attribute::PointSize:
+        return op_store(ctx.output_point_size);
+    case IR::Attribute::RenderTargetIndex:
+        return op_store(ctx.output_layer);
+    case IR::Attribute::ViewportIndex:
+        return op_store(ctx.output_viewport_index);
+    case IR::Attribute::Depth:
+        return op_store(ctx.frag_depth);
+    case IR::Attribute::SampleMask:
+        return op_store(ctx.OpAccessChain(ctx.output_u32, ctx.sample_mask, ctx.u32_zero_value));
+    default:
+        UNREACHABLE_MSG("Write attribute {}", attr);
     }
 }
 
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
index 440f80fa9..804d98b74 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp
@@ -28,7 +28,7 @@ void ConvertDepthMode(EmitContext& ctx) {
 }
 
 void ConvertPositionToClipSpace(EmitContext& ctx) {
-    ASSERT_MSG(!ctx.info.has_viewport_index_output,
+    ASSERT_MSG(!ctx.info.stores.GetAny(IR::Attribute::ViewportIndex),
               "Multi-viewport with shader clip space conversion not yet implemented.");
 
     const Id type{ctx.F32[1]};
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 131b475fc..4152420d0 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -370,13 +370,18 @@ void EmitContext::DefineInputs() {
         if (info.loads.GetAny(IR::Attribute::FragCoord)) {
             frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input);
         }
-        if (info.stores.Get(IR::Attribute::Depth)) {
-            frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output);
-        }
         if (info.loads.Get(IR::Attribute::IsFrontFace)) {
             front_facing =
                 DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input);
         }
+        if (info.loads.GetAny(IR::Attribute::RenderTargetIndex)) {
+            output_layer = DefineVariable(U32[1], spv::BuiltIn::Layer, spv::StorageClass::Input);
+            Decorate(output_layer, spv::Decoration::Flat);
+        }
+        if (info.loads.Get(IR::Attribute::SampleIndex)) {
+            sample_index = DefineVariable(U32[1], spv::BuiltIn::SampleId, spv::StorageClass::Input);
+            Decorate(sample_index, spv::Decoration::Flat);
+        }
         if (info.loads.GetAny(IR::Attribute::BaryCoordSmooth)) {
             if (profile.supports_amd_shader_explicit_vertex_parameter) {
                 bary_coord_smooth = DefineVariable(F32[2], spv::BuiltIn::BaryCoordSmoothAMD,
@@ -560,11 +565,11 @@ void EmitContext::DefineVertexBlock() {
             DefineVariable(F32[1], spv::BuiltIn::PointSize, spv::StorageClass::Output);
     }
     if (info.stores.GetAny(IR::Attribute::RenderTargetIndex)) {
-        output_layer = DefineVariable(S32[1], spv::BuiltIn::Layer, spv::StorageClass::Output);
+        output_layer = DefineVariable(U32[1], spv::BuiltIn::Layer, spv::StorageClass::Output);
     }
     if (info.stores.GetAny(IR::Attribute::ViewportIndex)) {
         output_viewport_index =
-            DefineVariable(S32[1], spv::BuiltIn::ViewportIndex, spv::StorageClass::Output);
+            DefineVariable(U32[1], spv::BuiltIn::ViewportIndex, spv::StorageClass::Output);
     }
 }
 
@@ -646,6 +651,13 @@ void EmitContext::DefineOutputs() {
         break;
     }
     case LogicalStage::Fragment: {
+        if (info.stores.Get(IR::Attribute::Depth)) {
+            frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output);
+        }
+        if (info.stores.Get(IR::Attribute::SampleMask)) {
+            sample_mask = DefineVariable(TypeArray(U32[1], u32_one_value), spv::BuiltIn::SampleMask,
+                                         spv::StorageClass::Output);
+        }
         u32 num_render_targets = 0;
         for (u32 i = 0; i < IR::NumRenderTargets; i++) {
             const IR::Attribute mrt{IR::Attribute::RenderTarget0 + i};
@@ -1080,36 +1092,26 @@ Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_vie
     Name(func, name);
     AddLabel();
 
-    const auto raw_mantissa{
-        OpBitFieldUExtract(U32[1], value, ConstU32(0U), ConstU32(mantissa_bits))};
-    const auto mantissa{OpConvertUToF(F32[1], raw_mantissa)};
-    const auto exponent{OpBitcast(
-        S32[1], OpBitFieldSExtract(U32[1], value, ConstU32(mantissa_bits), ConstU32(5U)))};
-
-    const auto is_exp_neg_one{OpIEqual(U1[1], exponent, ConstS32(-1))};
-    const auto is_exp_zero{OpIEqual(U1[1], exponent, ConstS32(0))};
-
-    const auto is_zero{OpIEqual(U1[1], value, ConstU32(0u))};
-    const auto is_nan{
-        OpLogicalAnd(U1[1], is_exp_neg_one, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))};
-    const auto is_inf{
-        OpLogicalAnd(U1[1], is_exp_neg_one, OpIEqual(U1[1], raw_mantissa, ConstU32(0u)))};
-    const auto is_denorm{
-        OpLogicalAnd(U1[1], is_exp_zero, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))};
-
-    const auto denorm{OpFMul(F32[1], mantissa, ConstF32(1.f / (1 << 20)))};
-    const auto norm{OpLdexp(
-        F32[1],
-        OpFAdd(F32[1],
-               OpFMul(F32[1], mantissa, ConstF32(1.f / static_cast<f32>(1 << mantissa_bits))),
-               ConstF32(1.f)),
-        exponent)};
-
-    const auto result{OpSelect(F32[1], is_zero, ConstF32(0.f),
-                               OpSelect(F32[1], is_nan, ConstF32(NAN),
-                                        OpSelect(F32[1], is_inf, ConstF32(INFINITY),
-                                                 OpSelect(F32[1], is_denorm, denorm, norm))))};
-
+    const Id exponent{OpBitFieldUExtract(U32[1], value, ConstU32(mantissa_bits), ConstU32(5U))};
+    const Id mantissa{OpBitFieldUExtract(U32[1], value, ConstU32(0U), ConstU32(mantissa_bits))};
+    const Id mantissa_f{OpConvertUToF(F32[1], mantissa)};
+    const Id a{OpSelect(F32[1], OpINotEqual(U1[1], mantissa, u32_zero_value),
+                        OpFMul(F32[1], ConstF32(1.f / (1 << (14 + mantissa_bits))), mantissa_f),
+                        f32_zero_value)};
+    const Id b{OpBitcast(F32[1], OpBitwiseOr(U32[1], mantissa, ConstU32(0x7f800000U)))};
+    const Id exponent_c{OpISub(U32[1], exponent, ConstU32(15U))};
+    const Id scale_a{
+        OpFDiv(F32[1], ConstF32(1.f),
+               OpConvertUToF(F32[1], OpShiftLeftLogical(U32[1], u32_one_value,
+                                                        OpSNegate(U32[1], exponent_c))))};
+    const Id scale_b{OpConvertUToF(F32[1], OpShiftLeftLogical(U32[1], u32_one_value, exponent_c))};
+    const Id scale{
+        OpSelect(F32[1], OpSLessThan(U1[1], exponent_c, u32_zero_value), scale_a, scale_b)};
+    const Id c{OpFMul(F32[1], scale,
+                      OpFAdd(F32[1], ConstF32(1.f),
+                             OpFDiv(F32[1], mantissa_f, ConstF32(f32(1 << mantissa_bits)))))};
+    const Id result{OpSelect(F32[1], OpIEqual(U1[1], exponent, u32_zero_value), a,
+                             OpSelect(F32[1], OpIEqual(U1[1], exponent, ConstU32(31U)), b, c))};
     OpReturnValue(result);
     OpFunctionEnd();
     return func;
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 4daba8903..9bb2b7d7a 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -256,6 +256,8 @@ public:
     Id frag_coord{};
     Id front_facing{};
     Id frag_depth{};
+    Id sample_mask{};
+    Id sample_index{};
     Id clip_distances{};
     Id cull_distances{};
 
diff --git a/src/shader_recompiler/frontend/copy_shader.cpp b/src/shader_recompiler/frontend/copy_shader.cpp
index 52b433dbc..795003e43 100644
--- a/src/shader_recompiler/frontend/copy_shader.cpp
+++ b/src/shader_recompiler/frontend/copy_shader.cpp
@@ -49,6 +49,9 @@ CopyShaderData ParseCopyShader(std::span<const u32> code) {
             const auto& exp = inst.control.exp;
             const IR::Attribute semantic = static_cast<IR::Attribute>(exp.target);
             for (int i = 0; i < inst.src_count; ++i) {
+                if ((exp.en & (1 << i)) == 0) {
+                    continue;
+                }
                 const auto ofs = offsets[inst.src[i].code];
                 if (ofs != -1) {
                     data.attr_map[ofs] = {semantic, i};
diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp
index 9dccf1105..e1e39105f 100644
--- a/src/shader_recompiler/frontend/translate/export.cpp
+++ b/src/shader_recompiler/frontend/translate/export.cpp
@@ -22,7 +22,7 @@ static AmdGpu::NumberFormat NumberFormatCompressed(
     case AmdGpu::Liverpool::ShaderExportFormat::ABGR_SINT16:
         return AmdGpu::NumberFormat::Sint;
     default:
-        UNREACHABLE_MSG("Unimplemented compressed MRT export format {}",
+        UNREACHABLE_MSG("Unimplemented compressed export format {}",
                        static_cast<u32>(export_format));
     }
 }
@@ -42,7 +42,7 @@ static u32 MaskFromExportFormat(u8 mask, AmdGpu::Liverpool::ShaderExportFormat e
         // All components
         return mask;
     default:
-        UNREACHABLE_MSG("Unimplemented uncompressed MRT export format {}",
+        UNREACHABLE_MSG("Unimplemented uncompressed export format {}",
                        static_cast<u32>(export_format));
     }
 }
@@ -118,25 +118,68 @@ void Translator::ExportRenderTarget(const GcnInst& inst) {
     }
 }
 
+void Translator::ExportDepth(const GcnInst& inst) {
+    const auto& exp = inst.control.exp;
+    if (exp.en == 0) {
+        // No export
+        return;
+    }
+
+    std::array<IR::F32, 4> components{};
+    if (exp.compr) {
+        // Components are float16 packed into a VGPR
+        const auto num_format = NumberFormatCompressed(runtime_info.fs_info.z_export_format);
+        // Export R, G
+        if (exp.en & 1) {
+            const IR::Value unpacked_value =
+                ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[0].code)));
+            components[0] = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
+            components[1] = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
+        }
+        // Export B, A
+        if ((exp.en >> 2) & 1) {
+            const IR::Value unpacked_value =
+                ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[1].code)));
+            components[2] = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
+            // components[3] = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
+        }
+    } else {
+        // Components are float32 into separate VGPRs
+        u32 mask = MaskFromExportFormat(exp.en, runtime_info.fs_info.z_export_format);
+        for (u32 i = 0; i < 4; i++, mask >>= 1) {
+            if ((mask & 1) == 0) {
+                continue;
+            }
+            components[i] = ir.GetVectorReg(IR::VectorReg(inst.src[i].code));
+        }
+    }
+
+    static constexpr std::array MrtzBuiltins = {IR::Attribute::Depth, IR::Attribute::StencilRef,
+                                                IR::Attribute::SampleMask, IR::Attribute::Null};
+
+    for (u32 i = 0; i < 4; ++i) {
+        if (components[i].IsEmpty()) {
+            continue;
+        }
+        ir.SetAttribute(MrtzBuiltins[i], components[i]);
+    }
+}
+
 void Translator::EmitExport(const GcnInst& inst) {
     if (info.stage == Stage::Fragment && inst.control.exp.vm) {
         ir.Discard(ir.LogicalNot(ir.GetExec()));
     }
 
-    const auto& exp = inst.control.exp;
-    const IR::Attribute attrib{exp.target};
+    const IR::Attribute attrib{inst.control.exp.target};
     if (IR::IsMrt(attrib)) {
         return ExportRenderTarget(inst);
     }
-
-    if (attrib == IR::Attribute::Depth && exp.en != 0 && exp.en != 1) {
-        LOG_WARNING(Render_Vulkan, "Unsupported depth export");
-        return;
+    if (attrib == IR::Attribute::Depth) {
+        return ExportDepth(inst);
     }
 
-    ASSERT_MSG(!exp.compr, "Compressed exports only supported for render targets");
+    ASSERT_MSG(!inst.control.exp.compr, "Compressed exports only supported for render targets");
 
-    u32 mask = exp.en;
+    u32 mask = inst.control.exp.en;
     for (u32 i = 0; i < 4; i++, mask >>= 1) {
         if ((mask & 1) == 0) {
             continue;
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index e7a7b3be6..9e42ebea9 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -171,6 +171,13 @@ void Translator::EmitPrologue(IR::Block* first_block) {
                 ir.SetVectorReg(dst_vreg++, ir.Imm32(0));
             }
         }
+        if (runtime_info.fs_info.addr_flags.ancillary_ena) {
+            if (runtime_info.fs_info.en_flags.ancillary_ena) {
+                ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::PackedAncillary));
+            } else {
+                ir.SetVectorReg(dst_vreg++, ir.Imm32(0));
+            }
+        }
         break;
     case LogicalStage::TessellationControl: {
         ir.SetVectorReg(IR::VectorReg::V0, ir.GetAttributeU32(IR::Attribute::PrimitiveId));
@@ -460,7 +467,7 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) {
         result = ir.FPMul(result, ir.Imm32(operand.output_modifier.multiplier));
     }
     if (operand.output_modifier.clamp) {
-        result = ir.FPSaturate(value);
+        result = ir.FPSaturate(result);
     }
 }
 
@@ -490,7 +497,7 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra
             ir.FPMul(value_untyped, ir.Imm64(f64(operand.output_modifier.multiplier)));
     }
     if (operand.output_modifier.clamp) {
-        value_untyped = ir.FPSaturate(value_raw);
+        value_untyped = ir.FPSaturate(value_untyped);
     }
 }
 
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index dad2cc829..b3b6a3977 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -319,6 +319,7 @@ private:
                            const IR::F32& x_res, const IR::F32& y_res, const IR::F32& z_res);
 
     void ExportRenderTarget(const GcnInst& inst);
+    void ExportDepth(const GcnInst& inst);
     void LogMissingOpcode(const GcnInst& inst);
     IR::VectorReg GetScratchVgpr(u32 offset);
 
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index 689264c6a..ccf2c45e0 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -210,8 +210,6 @@ struct Info {
     bool has_bitwise_xor{};
     bool has_image_gather{};
     bool has_image_query{};
-    bool has_layer_output{};
-    bool has_viewport_index_output{};
     bool uses_buffer_atomic_float_min_max{};
     bool uses_image_atomic_float_min_max{};
     bool uses_lane_id{};
diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp
index 388f8de8c..382f9b1d9 100644
--- a/src/shader_recompiler/ir/attribute.cpp
+++ b/src/shader_recompiler/ir/attribute.cpp
@@ -160,6 +160,12 @@ std::string NameOf(Attribute attribute) {
         return "TessFactorsBufferBase";
     case Attribute::PointSize:
         return "PointSize";
+    case Attribute::StencilRef:
+        return "StencilRef";
+    case Attribute::SampleMask:
+        return "SampleMask";
+    case Attribute::PackedAncillary:
+        return "PackedAncillary";
     default:
         break;
     }
diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h
index 28950ab52..c8a6e6b20 100644
--- a/src/shader_recompiler/ir/attribute.h
+++ b/src/shader_recompiler/ir/attribute.h
@@ -88,6 +88,9 @@ enum class Attribute : u64 {
    OffChipLdsBase = 91,
     TessFactorsBufferBase = 92,
     PointSize = 93,
+    StencilRef = 94,
+    SampleMask = 95,
+    PackedAncillary = 96,
     Max,
 };
 
diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
index b877a6e87..5f9a3cc55 100644
--- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
+++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
@@ -257,12 +257,50 @@ void FoldCmpClass(IR::Block& block, IR::Inst& inst) {
         IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
         const IR::F32 value = IR::F32{inst.Arg(0)};
         inst.ReplaceUsesWithAndRemove(
-            ir.LogicalNot(ir.LogicalOr(ir.FPIsInf(value), ir.FPIsInf(value))));
+            ir.LogicalNot(ir.LogicalOr(ir.FPIsNan(value), ir.FPIsInf(value))));
     } else {
         UNREACHABLE();
     }
 }
 
+bool FoldPackedAncillary(IR::Block& block, IR::Inst& inst) {
+    if (inst.Arg(0).IsImmediate() || !inst.Arg(1).IsImmediate() || !inst.Arg(2).IsImmediate()) {
+        return false;
+    }
+    IR::Inst* value = inst.Arg(0).InstRecursive();
+    if (value->GetOpcode() != IR::Opcode::GetAttributeU32 ||
+        value->Arg(0).Attribute() != IR::Attribute::PackedAncillary) {
+        return false;
+    }
+    const u32 offset = inst.Arg(1).U32();
+    const u32 bits = inst.Arg(2).U32();
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    if (offset >= 8 && offset + bits <= 12) {
+        const auto sample_index = ir.GetAttributeU32(IR::Attribute::SampleIndex);
+        if (offset == 8 && bits == 4) {
+            inst.ReplaceUsesWithAndRemove(sample_index);
+        } else {
+            inst.ReplaceUsesWithAndRemove(
+                ir.BitFieldExtract(sample_index, ir.Imm32(offset - 8), ir.Imm32(bits)));
+        }
+    } else if (offset >= 16 && offset + bits <= 27) {
+        const auto mrt_index = ir.GetAttributeU32(IR::Attribute::RenderTargetIndex);
+        if (offset == 16 && bits == 11) {
+            inst.ReplaceUsesWithAndRemove(mrt_index);
+        } else {
+            inst.ReplaceUsesWithAndRemove(
+                ir.BitFieldExtract(mrt_index, ir.Imm32(offset - 16), ir.Imm32(bits)));
+        }
+    } else {
+        UNREACHABLE_MSG("Unhandled bitfield extract from ancillary VGPR offset={}, bits={}", offset,
+                        bits);
+    }
+
+    value->ReplaceUsesWithAndRemove(ir.Imm32(0U));
+
+    return true;
+}
+
 void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::IAdd32:
@@ -475,6 +513,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
         FoldWhenAllImmediates(inst, [](u64 a) { return static_cast<u32>(std::popcount(a)); });
         return;
     case IR::Opcode::BitFieldUExtract:
+        if (FoldPackedAncillary(block, inst)) {
+            return;
+        }
         FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) {
             if (static_cast<u64>(shift) + static_cast<u64>(count) > 32) {
                 UNREACHABLE_MSG("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract,
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index 625c8676e..38aad55c4 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -934,14 +934,25 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info,
         }
     }();
 
-    const auto unnormalized = sampler.force_unnormalized || inst_info.is_unnormalized;
-    // Query dimensions of image if needed for normalization.
-    // We can't use the image sharp because it could be bound to a different image later.
+    const bool is_msaa = view_type == AmdGpu::ImageType::Color2DMsaa ||
+                         view_type == AmdGpu::ImageType::Color2DMsaaArray;
+    const bool unnormalized = sampler.force_unnormalized || inst_info.is_unnormalized;
+    const bool needs_dimensions = (!is_msaa && unnormalized) || (is_msaa && !unnormalized);
     const auto dimensions =
-        unnormalized ? ir.ImageQueryDimension(handle, ir.Imm32(0u), ir.Imm1(false), inst_info)
-                     : IR::Value{};
+        needs_dimensions ? ir.ImageQueryDimension(handle, ir.Imm32(0u), ir.Imm1(false), inst_info)
+                         : IR::Value{};
     const auto get_coord = [&](u32 coord_idx, u32 dim_idx) -> IR::Value {
         const auto coord = get_addr_reg(coord_idx);
+        if (is_msaa) {
+            // For MSAA images preserve the unnormalized coord or manually unnormalize it
+            if (unnormalized) {
+                return ir.ConvertFToU(32, coord);
+            } else {
+                const auto dim =
+                    ir.ConvertUToF(32, 32, IR::U32{ir.CompositeExtract(dimensions, dim_idx)});
+                return ir.ConvertFToU(32, ir.FPMul(coord, dim));
+            }
+        }
         if (unnormalized) {
             // Normalize the coordinate for sampling, dividing by its corresponding dimension.
             const auto dim =
@@ -958,12 +969,10 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info,
         addr_reg = addr_reg + 1;
         return get_coord(addr_reg - 1, 0);
     case AmdGpu::ImageType::Color1DArray: // x, slice
-        [[fallthrough]];
-    case AmdGpu::ImageType::Color2D: // x, y
+    case AmdGpu::ImageType::Color2D:     // x, y
+    case AmdGpu::ImageType::Color2DMsaa: // x, y
         addr_reg = addr_reg + 2;
         return ir.CompositeConstruct(get_coord(addr_reg - 2, 0), get_coord(addr_reg - 1, 1));
-    case AmdGpu::ImageType::Color2DMsaa: // x, y, frag
-        [[fallthrough]];
     case AmdGpu::ImageType::Color2DArray: // x, y, slice
         addr_reg = addr_reg + 3;
         // Note we can use FixCubeCoords with fallthrough cases since it checks for image type.
@@ -986,6 +995,9 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info,
     const IR::F32 lod_clamp = inst_info.has_lod_clamp ? get_addr_reg(addr_reg++) : IR::F32{};
 
     auto texel = [&] -> IR::Value {
+        if (is_msaa) {
+            return ir.ImageRead(handle, coords, ir.Imm32(0U), ir.Imm32(0U), inst_info);
+        }
         if (inst_info.is_gather) {
             if (inst_info.is_depth) {
                 return ir.ImageGatherDref(handle, coords, offset, dref, inst_info);
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index a7108a5ef..8f0e61da2 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -160,13 +160,6 @@ void CollectShaderInfoPass(IR::Program& program, const Profile& profile) {
         }
     }
 
-    if (info.stores.GetAny(IR::Attribute::RenderTargetIndex)) {
-        info.has_layer_output = true;
-    }
-    if (info.stores.GetAny(IR::Attribute::ViewportIndex)) {
-        info.has_viewport_index_output = true;
-    }
-
     // In case Flatbuf has not already been bound by IR and is needed
     // to query buffer sizes, bind it now.
     if (!profile.supports_robust_buffer_access && !info.uses_dma) {
diff --git a/src/shader_recompiler/ir/reinterpret.h b/src/shader_recompiler/ir/reinterpret.h
index 10728d8dd..84a4a51d5 100644
--- a/src/shader_recompiler/ir/reinterpret.h
+++ b/src/shader_recompiler/ir/reinterpret.h
@@ -22,7 +22,7 @@ inline Value ApplySwizzle(IREmitter& ir, const Value& vector, const AmdGpu::Comp
 }
 
 /// Converts gamma corrected value to linear space
-inline F32 ApplyGammaToLinear(IREmitter& ir, F32& c) {
+inline F32 ApplyGammaToLinear(IREmitter& ir, const F32& c) {
     const F32 a = ir.FPPow(ir.FPMul(ir.FPAdd(c, ir.Imm32(0.055f)), ir.Imm32(1.0f / 1.055f)),
                            ir.Imm32(2.4f));
     const F32 b = ir.FPMul(c, ir.Imm32(1.0f / 12.92f));
@@ -80,6 +80,9 @@ inline F32 ApplyReadNumberConversion(IREmitter& ir, const F32& value,
         const auto float_val = ir.ConvertUToF(32, 32, ir.BitCast<U32>(value));
         return ir.FPDiv(float_val, ir.Imm32(static_cast<f32>(std::numeric_limits<u32>::max())));
     }
+    case AmdGpu::NumberConversion::SrgbToNorm: {
+        return ApplyGammaToLinear(ir, value);
+    }
     default:
         UNREACHABLE();
     }
diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h
index f532dcbad..53e4ecd11 100644
--- a/src/shader_recompiler/runtime_info.h
+++ b/src/shader_recompiler/runtime_info.h
@@ -205,12 +205,13 @@ struct FragmentRuntimeInfo {
     u32 num_inputs;
     std::array inputs;
     std::array color_buffers;
+    AmdGpu::Liverpool::ShaderExportFormat z_export_format;
     bool dual_source_blending;
 
     bool operator==(const FragmentRuntimeInfo& other) const noexcept {
         return std::ranges::equal(color_buffers, other.color_buffers) &&
                en_flags.raw == other.en_flags.raw && addr_flags.raw == other.addr_flags.raw &&
-               num_inputs == other.num_inputs &&
+               num_inputs == other.num_inputs && z_export_format == other.z_export_format &&
               dual_source_blending == other.dual_source_blending &&
               std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs,
                                  other.inputs.begin(), other.inputs.begin() + num_inputs);
diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h
index 27c9313a2..21c2eee2a 100644
--- a/src/video_core/amdgpu/pixel_format.h
+++ b/src/video_core/amdgpu/pixel_format.h
@@ -102,6 +102,7 @@ enum class NumberConversion : u32 {
     Sint8ToSnormNz = 4,
     Sint16ToSnormNz = 5,
     Uint32ToUnorm = 6,
+    SrgbToNorm = 7,
 };
 
 union CompMapping {
@@ -219,6 +220,8 @@ constexpr NumberFormat RemapNumberFormat(const NumberFormat format, const DataFo
             return format;
         }
     }
+    case NumberFormat::Srgb:
+        return data_format == DataFormat::FormatBc6 ? NumberFormat::Unorm : format;
     case NumberFormat::Uscaled:
         return NumberFormat::Uint;
     case NumberFormat::Sscaled:
@@ -295,6 +298,9 @@ constexpr NumberConversion MapNumberConversion(const NumberFormat num_fmt,
             return NumberConversion::None;
         }
     }
+    case NumberFormat::Srgb:
+        return data_fmt == DataFormat::FormatBc6 ? NumberConversion::SrgbToNorm
+                                                 : NumberConversion::None;
     case NumberFormat::Uscaled:
         return NumberConversion::UintToUscaled;
     case NumberFormat::Sscaled:
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index d13aeec99..5206edbec 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -254,6 +254,15 @@ bool Instance::CreateDevice() {
 
     // Optional
     maintenance_8 = add_extension(VK_KHR_MAINTENANCE_8_EXTENSION_NAME);
+    attachment_feedback_loop = add_extension(VK_EXT_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_EXTENSION_NAME);
+    if (attachment_feedback_loop) {
+        attachment_feedback_loop =
+            add_extension(VK_EXT_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_EXTENSION_NAME);
+        if (!attachment_feedback_loop) {
+            // We want both extensions so remove the first if the second isn't available
+            enabled_extensions.pop_back();
+        }
+    }
     depth_range_unrestricted = add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME);
     dynamic_state_3 = add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME);
     if (dynamic_state_3) {
@@ -464,6 +473,12 @@ bool Instance::CreateDevice() {
         vk::PhysicalDeviceMaintenance8FeaturesKHR{
             .maintenance8 = true,
         },
+        vk::PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT{
+            .attachmentFeedbackLoopLayout = true,
+        },
+        vk::PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT{
+            .attachmentFeedbackLoopDynamicState = true,
+        },
         vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{
             .shaderBufferFloat32AtomicMinMax =
                 shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax,
@@ -535,6 +550,10 @@ bool Instance::CreateDevice() {
     if (!maintenance_8) {
         device_chain.unlink<vk::PhysicalDeviceMaintenance8FeaturesKHR>();
     }
+    if (!attachment_feedback_loop) {
+        device_chain.unlink<vk::PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT>();
+        device_chain.unlink<vk::PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT>();
+    }
     if (!shader_atomic_float2) {
         device_chain.unlink<vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>();
     }
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index e1fa180fb..09f68d764 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -114,6 +114,11 @@ public:
         return maintenance_8;
     }
 
+    /// Returns true if VK_EXT_attachment_feedback_loop_layout is supported
+    bool IsAttachmentFeedbackLoopLayoutSupported() const {
+        return attachment_feedback_loop;
+    }
+
     /// Returns true when VK_EXT_custom_border_color is supported
     bool IsCustomBorderColorSupported() const {
         return custom_border_color;
@@ -475,6 +480,7 @@ private:
     bool workgroup_memory_explicit_layout{};
     bool portability_subset{};
     bool maintenance_8{};
+    bool attachment_feedback_loop{};
     bool supports_memory_budget{};
     u64 total_memory_budget{};
     std::vector valid_heaps;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 89b48f0e4..c250f4d13 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -167,8 +167,8 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
         BuildCommon(regs.ps_program);
         info.fs_info.en_flags = regs.ps_input_ena;
         info.fs_info.addr_flags = regs.ps_input_addr;
-        const auto& ps_inputs = regs.ps_inputs;
         info.fs_info.num_inputs = regs.num_interp;
+        info.fs_info.z_export_format = regs.z_export_format;
         const auto& cb0_blend = regs.blend_control[0];
         if (cb0_blend.enable) {
             info.fs_info.dual_source_blending =
@@ -182,6 +182,7 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS
         } else {
             info.fs_info.dual_source_blending = false;
         }
+        const auto& ps_inputs = regs.ps_inputs;
         for (u32 i = 0; i < regs.num_interp; i++) {
             info.fs_info.inputs[i] = {
                 .param_index = u8(ps_inputs[i].input_offset.Value()),
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index b7cb570f4..3ff78f967 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -251,15 +251,14 @@ void Rasterizer::EliminateFastClear() {
     if (!col_buf || !col_buf.info.fast_clear) {
         return;
     }
+    VideoCore::TextureCache::RenderTargetDesc desc(col_buf, liverpool->last_cb_extent[0]);
+    const auto& image_view = texture_cache.FindRenderTarget(desc);
     if (!texture_cache.IsMetaCleared(col_buf.CmaskAddress(), col_buf.view.slice_start)) {
         return;
     }
     for (u32 slice = col_buf.view.slice_start; slice <= col_buf.view.slice_max; ++slice) {
         texture_cache.TouchMeta(col_buf.CmaskAddress(), slice, false);
     }
-    const auto& hint = liverpool->last_cb_extent[0];
-    VideoCore::TextureCache::RenderTargetDesc desc(col_buf, hint);
-    const auto& image_view = texture_cache.FindRenderTarget(desc);
     auto& image = texture_cache.GetImage(image_view.image_id);
     const vk::ImageSubresourceRange range = {
         .aspectMask = vk::ImageAspectFlagBits::eColor,
@@ -723,11 +722,6 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin
             // to force general layout on it.
            image->binding.force_general |= image_desc.is_written;
         }
-        if (image->binding.is_target) {
-            // The image is already bound as target. Since we read and output to it need to force
-            // general layout too.
-            image->binding.force_general = 1u;
-        }
         image->binding.is_bound = 1u;
     }
 
@@ -754,8 +748,15 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin
         auto& image = texture_cache.GetImage(image_id);
         auto& image_view = texture_cache.FindTexture(image_id, desc);
 
-        if (image.binding.force_general || image.binding.is_target) {
-            image.Transit(vk::ImageLayout::eGeneral,
+        // The image is either bound as storage in a separate descriptor or bound as render
+        // target in feedback loop. Depth images are excluded because they can't be bound as
+        // storage and feedback loop doesn't make sense for them
+        if ((image.binding.force_general || image.binding.is_target) &&
+            !image.info.props.is_depth) {
+            image.Transit(instance.IsAttachmentFeedbackLoopLayoutSupported() &&
+                                  image.binding.is_target
+                              ? vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT
+                              : vk::ImageLayout::eGeneral,
                           vk::AccessFlagBits2::eShaderRead |
                               (image.info.props.is_depth
                                    ? vk::AccessFlagBits2::eDepthStencilAttachmentWrite
@@ -816,6 +817,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin
 
 void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& state) {
     int cb_index = 0;
+    attachment_feedback_loop = false;
     for (auto attach_idx = 0u; attach_idx < state.num_color_attachments; ++attach_idx) {
         if (state.color_attachments[attach_idx].imageView == VK_NULL_HANDLE) {
             continue;
@@ -835,11 +837,14 @@ void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& s
             state.height = std::min(state.height, std::max(image.info.size.height >> mip, 1u));
         }
         auto& image = texture_cache.GetImage(image_id);
-        if (image.binding.force_general) {
-            image.Transit(
-                vk::ImageLayout::eGeneral,
-                vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eShaderRead, {});
-
+        if (image.binding.is_bound) {
+            ASSERT_MSG(!image.binding.force_general,
+                       "Having image both as storage and render target is unsupported");
+            image.Transit(instance.IsAttachmentFeedbackLoopLayoutSupported()
+                              ? vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT
+                              : vk::ImageLayout::eGeneral,
+                          vk::AccessFlagBits2::eColorAttachmentWrite, {});
+            attachment_feedback_loop = true;
         } else {
             image.Transit(vk::ImageLayout::eColorAttachmentOptimal,
                           vk::AccessFlagBits2::eColorAttachmentWrite |
@@ -859,23 +864,15 @@ void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& s
         if (has_stencil) {
             image.aspect_mask |= vk::ImageAspectFlagBits::eStencil;
         }
-        if (image.binding.force_general) {
-            image.Transit(vk::ImageLayout::eGeneral,
-                          vk::AccessFlagBits2::eDepthStencilAttachmentWrite |
-                              vk::AccessFlagBits2::eShaderRead,
-                          {});
-        } else {
-            const auto new_layout = desc.view_info.is_storage
-                                        ? has_stencil
-                                              ? vk::ImageLayout::eDepthStencilAttachmentOptimal
-                                              : vk::ImageLayout::eDepthAttachmentOptimal
-                                        : has_stencil ? vk::ImageLayout::eDepthStencilReadOnlyOptimal
-                                                      : vk::ImageLayout::eDepthReadOnlyOptimal;
-            image.Transit(new_layout,
-                          vk::AccessFlagBits2::eDepthStencilAttachmentWrite |
-                              vk::AccessFlagBits2::eDepthStencilAttachmentRead,
-                          desc.view_info.range);
-        }
+        const auto new_layout = desc.view_info.is_storage
+                                    ? has_stencil ? vk::ImageLayout::eDepthStencilAttachmentOptimal
+                                                  : vk::ImageLayout::eDepthAttachmentOptimal
+                                : has_stencil ? vk::ImageLayout::eDepthStencilReadOnlyOptimal
+                                              : vk::ImageLayout::eDepthReadOnlyOptimal;
+        image.Transit(new_layout,
+                      vk::AccessFlagBits2::eDepthStencilAttachmentWrite |
+                          vk::AccessFlagBits2::eDepthStencilAttachmentRead,
+                      desc.view_info.range);
         state.depth_attachment.imageLayout = image.last_state.layout;
         state.stencil_attachment.imageLayout = image.last_state.layout;
         image.usage.depth_target = true;
@@ -1101,6 +1098,7 @@ void Rasterizer::UpdateDynamicState(const GraphicsPipeline& pipeline, const bool
     auto& dynamic_state = scheduler.GetDynamicState();
     dynamic_state.SetBlendConstants(liverpool->regs.blend_constants);
     dynamic_state.SetColorWriteMasks(pipeline.GetWriteMasks());
+    dynamic_state.SetAttachmentFeedbackLoopEnabled(attachment_feedback_loop);
 
     // Commit new dynamic state to the command buffer.
     dynamic_state.Commit(instance, scheduler.CommandBuffer());
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index a6848d527..b32cfa424 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -143,7 +143,8 @@ private:
     boost::container::static_vector buffer_bindings;
     using ImageBindingInfo = std::pair;
     boost::container::static_vector image_bindings;
-    bool fault_process_pending{false};
+    bool fault_process_pending{};
+    bool attachment_feedback_loop{};
 };
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index a34bb15ad..f1e5937fe 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -333,6 +333,12 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmd
         dirty_state.line_width = false;
         cmdbuf.setLineWidth(line_width);
     }
+    if (dirty_state.feedback_loop_enabled && instance.IsAttachmentFeedbackLoopLayoutSupported()) {
+        dirty_state.feedback_loop_enabled = false;
+        cmdbuf.setAttachmentFeedbackLoopEnableEXT(feedback_loop_enabled
+                                                      ? vk::ImageAspectFlagBits::eColor
+                                                      : vk::ImageAspectFlagBits::eNone);
+    }
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 7dbc2b260..ef0f84822 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -113,6 +113,7 @@ struct DynamicState {
         bool blend_constants : 1;
         bool color_write_masks : 1;
         bool line_width : 1;
+        bool feedback_loop_enabled : 1;
     } dirty_state{};
 
     Viewports viewports{};
@@ -149,6 +150,7 @@ struct DynamicState {
     std::array blend_constants{};
     ColorWriteMasks color_write_masks{};
     float line_width{};
+    bool feedback_loop_enabled{};
 
     /// Commits the dynamic state to the provided command buffer.
     void Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf);
@@ -324,6 +326,13 @@ struct DynamicState {
             dirty_state.line_width = true;
         }
     }
+
+    void SetAttachmentFeedbackLoopEnabled(const bool enabled) {
+        if (feedback_loop_enabled != enabled) {
+            feedback_loop_enabled = enabled;
+            dirty_state.feedback_loop_enabled = true;
+        }
+    }
 };
 
 class Scheduler {
diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp
index 8dda6aa18..a0daab362 100644
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@@ -14,7 +14,8 @@ namespace VideoCore {
 
 using namespace Vulkan;
 
-static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) {
+static vk::ImageUsageFlags ImageUsageFlags(const Vulkan::Instance* instance,
+                                           const ImageInfo& info) {
     vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferSrc |
                                 vk::ImageUsageFlagBits::eTransferDst |
                                 vk::ImageUsageFlagBits::eSampled;
@@ -23,13 +24,12 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) {
         usage |= vk::ImageUsageFlagBits::eDepthStencilAttachment;
     } else {
         usage |= vk::ImageUsageFlagBits::eColorAttachment;
-
-        // In cases where an image is created as a render/depth target and cleared with compute,
-        // we cannot predict whether it will be used as a storage image. A proper solution would
-        // involve re-creating the resource with a new configuration and copying previous
-        // content into it. However, for now, we will set storage usage for all images (if the
-        // format allows), sacrificing a bit of performance. Note use of ExtendedUsage flag set
-        // by default.
+        if (instance->IsAttachmentFeedbackLoopLayoutSupported()) {
+            usage |= vk::ImageUsageFlagBits::eAttachmentFeedbackLoopEXT;
+        }
+        // Always create images with storage flag to avoid needing re-creation in case of e.g.
+        // compute clears. This sacrifices a bit of performance but is less work. ExtendedUsage
+        // flag is also used.
         usage |= vk::ImageUsageFlagBits::eStorage;
     }
 }
@@ -128,7 +128,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
         flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible;
     }
 
-    usage_flags = ImageUsageFlags(info);
+    usage_flags = ImageUsageFlags(instance, info);
     format_features = FormatFeatureFlags(usage_flags);
 
     switch (info.pixel_format) {
@@ -348,11 +348,16 @@ void Image::CopyImage(Image& src_image) {
     const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels);
     ASSERT(src_info.resources.layers == info.resources.layers || num_mips == 1);
 
+    const u32 width = src_info.size.width;
+    const u32 height = src_info.size.height;
+    const u32 depth =
+        info.type == AmdGpu::ImageType::Color3D ? info.size.depth : src_info.size.depth;
+
     boost::container::small_vector image_copies;
     for (u32 mip = 0; mip < num_mips; ++mip) {
-        const auto mip_w = std::max(src_info.size.width >> mip, 1u);
-        const auto mip_h = std::max(src_info.size.height >> mip, 1u);
-        const auto mip_d = std::max(src_info.size.depth >> mip, 1u);
+        const auto mip_w = std::max(width >> mip, 1u);
+        const auto mip_h = std::max(height >> mip, 1u);
+        const auto mip_d = std::max(depth >> mip, 1u);
 
         image_copies.emplace_back(vk::ImageCopy{
             .srcSubresource{
@@ -365,7 +370,7 @@ void Image::CopyImage(Image& src_image) {
                 .aspectMask = aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
                 .mipLevel = mip,
                 .baseArrayLayer = 0,
-                .layerCount = src_info.resources.layers,
+                .layerCount = info.resources.layers,
             },
             .extent = {mip_w, mip_h, mip_d},
         });
diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h
index 00f56b1c7..583b0d7fa 100644
--- a/src/video_core/texture_cache/image_info.h
+++ b/src/video_core/texture_cache/image_info.h
@@ -45,8 +45,9 @@ struct ImageInfo {
     bool IsTiled() const {
         return tile_mode != AmdGpu::TileMode::DisplayLinearAligned;
     }
-    Extent3D BlockDim() const {
-        return props.is_block ? Extent3D{size.width >> 2, size.height >> 2, size.depth} : size;
+    Extent2D BlockDim() const {
+        const auto dim = props.is_block ? 2 : 0;
+        return Extent2D{size.width >> dim, size.height >> dim};
     }
 
     s32 MipOf(const ImageInfo& info) const;
diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp
index 2da037a6e..f39bc16fd 100644
--- a/src/video_core/texture_cache/texture_cache.cpp
+++ b/src/video_core/texture_cache/texture_cache.cpp
@@ -304,6 +304,12 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag
             return {ExpandImage(image_info, cache_image_id), -1, -1};
         }
 
+        if (image_info.guest_size == tex_cache_image.info.guest_size &&
+            (image_info.type == AmdGpu::ImageType::Color3D ||
+             tex_cache_image.info.type == AmdGpu::ImageType::Color3D)) {
+            return {ExpandImage(image_info, cache_image_id), -1, -1};
+        }
+
         // Size and resources are less than or equal, use image view.
         if (image_info.pixel_format != tex_cache_image.info.pixel_format ||
             image_info.guest_size <= tex_cache_image.info.guest_size) {
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index e20d4dcd0..097fdcb96 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -30,6 +30,10 @@ struct Region2D {
 struct Extent2D {
     u32 width;
     u32 height;
+
+    bool operator==(const Extent2D& other) const {
+        return width == other.width && height == other.height;
+    }
 };
 
 struct Extent3D {
@@ -37,8 +41,6 @@ struct Extent3D {
     u32 height;
     u32 depth;
 
-    auto operator<=>(const Extent3D&) const = default;
-
     bool operator==(const Extent3D& other) const {
         return width == other.width && height == other.height && depth == other.depth;
     }