Mirror of https://github.com/shadps4-emu/shadPS4.git
video_core: Address various UE bugs (#3559)
* vk_rasterizer: Reorder image query in fast clear elimination. Fixes missing clears when a texture is cleared with this method but never actually used for rendering, by ensuring the texture cache has at least a chance to register the cmask.
* shader_recompiler: Partial support for ANCILLARY_ENA.
* pixel_format: Add number conversion for the BC6 srgb format.
* texture_cache: Support aliases of 3D and 2D array images. Used by UE to render its post-processing LUT.
* pixel_format: Test BC6 srgb as unorm. Still not sure what is up with snorm/unorm; it can be useful to have both options to compare for now.
* video_core: Use attachment feedback loop layout instead of general if possible. UE games often do mipgen passes where the previous mip of the image being rendered to is bound for reading. This appears to cause corruption issues, so use the attachment feedback loop extension to ensure correct output.
* renderer_vulkan: Improve feedback loop code.
* Set the proper usage flag for feedback loop usage.
* Add the dynamic state extension and enable it for the color aspect when necessary.
* Check whether the image is bound instead of force_general for better code consistency.
* shader_recompiler: More proper depth export implementation.
* shader_recompiler: Fix bug in output modifiers.
* shader_recompiler: Fix sampling from MSAA images. This is not allowed by any graphics API, but the hardware apparently supports it and it can be encountered. To avoid glitched output, translate it to a texelFetch call on sample 0.
* clang format
* image: Add back missing code.
* shader_recompiler: Better ancillary implementation. It is now implemented with a custom attribute that is constant propagated depending on which parts of it are extracted. It will assert if an unknown part is used or if the attribute itself is not removed by dead code elimination.
* copy_shader: Ignore export channels that are not enabled.
* constant_propagation: Invalidate ancillary after successful elimination.
* spirv: Fix f11/f10 conversion to f32.

---------

Co-authored-by: georgemoralis <giorgosmrls@gmail.com>
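The feedback-loop bullets above rely on the standard VK_EXT_attachment_feedback_loop_layout and VK_EXT_attachment_feedback_loop_dynamic_state machinery. A minimal sketch in raw Vulkan C of the three pieces involved (a hedged illustration, not shadPS4's actual renderer_vulkan code; variable and function names here are made up for the example):

    // Hedged sketch of the raw Vulkan pieces behind the feedback-loop bullets above.
    // shadPS4 goes through its own vulkan-hpp wrappers; names here are illustrative
    // only, and the EXT command would normally be loaded via vkGetDeviceProcAddr.
    #include <vulkan/vulkan.h>

    // Device extensions involved:
    //   VK_EXT_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_EXTENSION_NAME
    //   VK_EXT_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_EXTENSION_NAME

    void SketchFeedbackLoopSetup(VkCommandBuffer cmd, VkImageCreateInfo& image_info) {
        // 1. The image that is both rendered to and sampled gets the feedback-loop usage bit.
        image_info.usage |= VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;

        // 2. While bound for reading and writing at once it is kept in the dedicated
        //    feedback-loop layout rather than VK_IMAGE_LAYOUT_GENERAL.
        [[maybe_unused]] const VkImageLayout layout =
            VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT;

        // 3. With the dynamic-state extension the loop is toggled at record time,
        //    here only for the color aspect as the commit message describes.
        vkCmdSetAttachmentFeedbackLoopEnableEXT(cmd, VK_IMAGE_ASPECT_COLOR_BIT);
    }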
@@ -160,6 +160,12 @@ std::string NameOf(Attribute attribute) {
         return "TessFactorsBufferBase";
     case Attribute::PointSize:
         return "PointSize";
+    case Attribute::StencilRef:
+        return "StencilRef";
+    case Attribute::SampleMask:
+        return "SampleMask";
+    case Attribute::PackedAncillary:
+        return "PackedAncillary";
     default:
         break;
     }
@@ -88,6 +88,9 @@ enum class Attribute : u64 {
    OffChipLdsBase = 91,
    TessFactorsBufferBase = 92,
    PointSize = 93,
+   StencilRef = 94,
+   SampleMask = 95,
+   PackedAncillary = 96,
    Max,
};
@@ -257,12 +257,50 @@ void FoldCmpClass(IR::Block& block, IR::Inst& inst) {
        IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
        const IR::F32 value = IR::F32{inst.Arg(0)};
        inst.ReplaceUsesWithAndRemove(
-           ir.LogicalNot(ir.LogicalOr(ir.FPIsInf(value), ir.FPIsInf(value))));
+           ir.LogicalNot(ir.LogicalOr(ir.FPIsNan(value), ir.FPIsInf(value))));
    } else {
        UNREACHABLE();
    }
}

+bool FoldPackedAncillary(IR::Block& block, IR::Inst& inst) {
+    if (inst.Arg(0).IsImmediate() || !inst.Arg(1).IsImmediate() || !inst.Arg(2).IsImmediate()) {
+        return false;
+    }
+    IR::Inst* value = inst.Arg(0).InstRecursive();
+    if (value->GetOpcode() != IR::Opcode::GetAttributeU32 ||
+        value->Arg(0).Attribute() != IR::Attribute::PackedAncillary) {
+        return false;
+    }
+    const u32 offset = inst.Arg(1).U32();
+    const u32 bits = inst.Arg(2).U32();
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    if (offset >= 8 && offset + bits <= 12) {
+        const auto sample_index = ir.GetAttributeU32(IR::Attribute::SampleIndex);
+        if (offset == 8 && bits == 4) {
+            inst.ReplaceUsesWithAndRemove(sample_index);
+        } else {
+            inst.ReplaceUsesWithAndRemove(
+                ir.BitFieldExtract(sample_index, ir.Imm32(offset - 8), ir.Imm32(bits)));
+        }
+    } else if (offset >= 16 && offset + bits <= 27) {
+        const auto mrt_index = ir.GetAttributeU32(IR::Attribute::RenderTargetIndex);
+        if (offset == 16 && bits == 11) {
+            inst.ReplaceUsesWithAndRemove(mrt_index);
+        } else {
+            inst.ReplaceUsesWithAndRemove(
+                ir.BitFieldExtract(mrt_index, ir.Imm32(offset - 16), ir.Imm32(bits)));
+        }
+    } else {
+        UNREACHABLE_MSG("Unhandled bitfield extract from ancillary VGPR offset={}, bits={}", offset,
+                        bits);
+    }
+
+    value->ReplaceUsesWithAndRemove(ir.Imm32(0U));
+
+    return true;
+}
+
void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
    switch (inst.GetOpcode()) {
    case IR::Opcode::IAdd32:
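The offsets hard-coded in FoldPackedAncillary imply the layout the pass assumes for the packed ancillary VGPR: the sample index occupies bits [11:8] and the render-target (MRT) index an 11-bit field at bits [26:16]. A minimal restatement of that assumption in plain C++ (ExtractSampleIndex/ExtractRenderTargetIndex are illustrative helpers, not functions from the recompiler):

    // Hedged sketch of the bit layout implied by the offsets above.
    #include <cstdint>

    constexpr std::uint32_t ExtractSampleIndex(std::uint32_t ancillary) {
        return (ancillary >> 8) & 0xF; // bits [11:8]: offset 8, width up to 4
    }

    constexpr std::uint32_t ExtractRenderTargetIndex(std::uint32_t ancillary) {
        return (ancillary >> 16) & 0x7FF; // bits [26:16]: offset 16, width up to 11
    }

Any extract outside those two ranges hits the UNREACHABLE_MSG, and after a successful fold the original GetAttributeU32(PackedAncillary) is replaced with zero and removed, which matches the "Invalidate ancillary after successful elimination" bullet in the commit message.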
@@ -475,6 +513,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
        FoldWhenAllImmediates(inst, [](u64 a) { return static_cast<u32>(std::popcount(a)); });
        return;
    case IR::Opcode::BitFieldUExtract:
+       if (FoldPackedAncillary(block, inst)) {
+           return;
+       }
        FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) {
            if (static_cast<size_t>(shift) + static_cast<size_t>(count) > 32) {
                UNREACHABLE_MSG("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract,
@@ -934,14 +934,25 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info,
        }
    }();

-   const auto unnormalized = sampler.force_unnormalized || inst_info.is_unnormalized;
    // Query dimensions of image if needed for normalization.
    // We can't use the image sharp because it could be bound to a different image later.
+   const bool is_msaa = view_type == AmdGpu::ImageType::Color2DMsaa ||
+                        view_type == AmdGpu::ImageType::Color2DMsaaArray;
+   const bool unnormalized = sampler.force_unnormalized || inst_info.is_unnormalized;
+   const bool needs_dimentions = (!is_msaa && unnormalized) || (is_msaa && !unnormalized);
    const auto dimensions =
-       unnormalized ? ir.ImageQueryDimension(handle, ir.Imm32(0u), ir.Imm1(false), inst_info)
-                    : IR::Value{};
+       needs_dimentions ? ir.ImageQueryDimension(handle, ir.Imm32(0u), ir.Imm1(false), inst_info)
+                        : IR::Value{};
    const auto get_coord = [&](u32 coord_idx, u32 dim_idx) -> IR::Value {
        const auto coord = get_addr_reg(coord_idx);
+       if (is_msaa) {
+           // For MSAA images preserve the unnormalized coord or manually unnormalize it
+           if (unnormalized) {
+               return ir.ConvertFToU(32, coord);
+           } else {
+               const auto dim =
+                   ir.ConvertUToF(32, 32, IR::U32{ir.CompositeExtract(dimensions, dim_idx)});
+               return ir.ConvertFToU(32, ir.FPMul(coord, dim));
+           }
+       }
        if (unnormalized) {
            // Normalize the coordinate for sampling, dividing by its corresponding dimension.
            const auto dim =
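The needs_dimentions expression encodes when the extra ImageQueryDimension is actually required. A minimal restatement with the reasoning spelled out (plain C++, illustrative only, not code from the pass):

    // Hedged restatement of the condition above.
    // - Non-MSAA image with unnormalized coordinates: the coords must later be divided
    //   by the image size to produce normalized coordinates for sampling.
    // - MSAA image with normalized coordinates: the coords must be multiplied by the
    //   image size, because the MSAA path fetches texels and needs integer positions.
    // In the remaining two cases the coordinates can be used as-is and no query is needed.
    bool NeedsDimensions(bool is_msaa, bool unnormalized) {
        return (!is_msaa && unnormalized) || (is_msaa && !unnormalized);
    }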
@@ -958,12 +969,10 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info,
        addr_reg = addr_reg + 1;
        return get_coord(addr_reg - 1, 0);
    case AmdGpu::ImageType::Color1DArray: // x, slice
-       [[fallthrough]];
-   case AmdGpu::ImageType::Color2D: // x, y
+   case AmdGpu::ImageType::Color2D:     // x, y
+   case AmdGpu::ImageType::Color2DMsaa: // x, y
        addr_reg = addr_reg + 2;
        return ir.CompositeConstruct(get_coord(addr_reg - 2, 0), get_coord(addr_reg - 1, 1));
-   case AmdGpu::ImageType::Color2DMsaa: // x, y, frag
-       [[fallthrough]];
    case AmdGpu::ImageType::Color2DArray: // x, y, slice
        addr_reg = addr_reg + 3;
        // Note we can use FixCubeCoords with fallthrough cases since it checks for image type.
@@ -986,6 +995,9 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info,
    const IR::F32 lod_clamp = inst_info.has_lod_clamp ? get_addr_reg(addr_reg++) : IR::F32{};

    auto texel = [&] -> IR::Value {
+       if (is_msaa) {
+           return ir.ImageRead(handle, coords, ir.Imm32(0U), ir.Imm32(0U), inst_info);
+       }
        if (inst_info.is_gather) {
            if (inst_info.is_depth) {
                return ir.ImageGatherDref(handle, coords, offset, dref, inst_info);
@@ -160,13 +160,6 @@ void CollectShaderInfoPass(IR::Program& program, const Profile& profile) {
        }
    }

-   if (info.stores.GetAny(IR::Attribute::RenderTargetIndex)) {
-       info.has_layer_output = true;
-   }
-   if (info.stores.GetAny(IR::Attribute::ViewportIndex)) {
-       info.has_viewport_index_output = true;
-   }
-
    // In case Flatbuf has not already been bound by IR and is needed
    // to query buffer sizes, bind it now.
    if (!profile.supports_robust_buffer_access && !info.uses_dma) {
@@ -22,7 +22,7 @@ inline Value ApplySwizzle(IREmitter& ir, const Value& vector, const AmdGpu::Comp
}

/// Converts gamma corrected value to linear space
-inline F32 ApplyGammaToLinear(IREmitter& ir, F32& c) {
+inline F32 ApplyGammaToLinear(IREmitter& ir, const F32& c) {
    const F32 a =
        ir.FPPow(ir.FPMul(ir.FPAdd(c, ir.Imm32(0.055f)), ir.Imm32(1.0f / 1.055f)), ir.Imm32(2.4f));
    const F32 b = ir.FPMul(c, ir.Imm32(1.0f / 12.92f));
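For reference, the constants in ApplyGammaToLinear are those of the standard sRGB EOTF. A hedged scalar equivalent of the IR built above (the selection between the two terms happens after the lines shown in this hunk; the usual sRGB breakpoint of 0.04045 is assumed here):

    // Hedged scalar equivalent, not code from the emulator.
    #include <cmath>

    float SrgbToLinear(float c) {
        // Below the breakpoint the curve is linear, above it the power segment applies.
        if (c <= 0.04045f) {
            return c / 12.92f;                        // matches the 'b' term above
        }
        return std::pow((c + 0.055f) / 1.055f, 2.4f); // matches the 'a' term above
    }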
@@ -80,6 +80,9 @@ inline F32 ApplyReadNumberConversion(IREmitter& ir, const F32& value,
        const auto float_val = ir.ConvertUToF(32, 32, ir.BitCast<U32>(value));
        return ir.FPDiv(float_val, ir.Imm32(static_cast<float>(std::numeric_limits<u32>::max())));
    }
+   case AmdGpu::NumberConversion::SrgbToNorm: {
+       return ApplyGammaToLinear(ir, value);
+   }
    default:
        UNREACHABLE();
    }