From 374c2194d44b9efd2f64546de3551a5fb84303ce Mon Sep 17 00:00:00 2001
From: TheTurtle
Date: Fri, 12 Sep 2025 19:29:16 +0300
Subject: [PATCH] video_core: Address various UE bugs (#3559)

* vk_rasterizer: Reorder image query in fast clear elimination

Fixes missing clears when a texture is cleared using this method but never
actually used for rendering, by ensuring the texture cache has at least a
chance to register the cmask.

* shader_recompiler: Partial support for ANCILLARY_ENA

* pixel_format: Add number conversion of BC6 srgb format

* texture_cache: Support aliases of 3D and 2D array images

Used by UE to render its post-processing LUT.

* pixel_format: Test BC6 srgb as unorm

Still not sure what is up with snorm/unorm; it can be useful to have both
options to compare for now.

* video_core: Use attachment feedback layout instead of general if possible

UE games often do mipgen passes where the previous mip of the image being
rendered to is bound for reading. This appears to cause corruption issues,
so use the attachment feedback loop extension to ensure correct output.

* renderer_vulkan: Improve feedback loop code

* Set proper usage flag for feedback loop usage
* Add dynamic state extension and enable it for the color aspect when necessary
* Check if the image is bound instead of force_general for better code consistency

* shader_recompiler: More proper depth export implementation

* shader_recompiler: Fix bug in output modifiers

* shader_recompiler: Fix sampling from MSAA images

This is not allowed by any graphics API, but the hardware seems to support it
somehow and it can be encountered. To avoid glitched output, translate it to a
texelFetch call on sample 0.

* clang format

* image: Add back missing code

* shader_recompiler: Better ancillary implementation

It is now implemented with a custom attribute that is constant-propagated
depending on which parts of it are extracted.
It will assert if an unknown part is used or if the attribute itself is not removed by dead code elim * copy_shader: Ignore not enabled export channels * constant_propagation: Invalidate ancillary after successful elimination * spirv: Fix f11/f10 conversion to f32 --------- Co-authored-by: georgemoralis --- src/common/number_utils.cpp | 2 +- .../backend/spirv/emit_spirv.cpp | 10 +- .../spirv/emit_spirv_context_get_set.cpp | 125 +++++++++--------- .../backend/spirv/emit_spirv_special.cpp | 2 +- .../backend/spirv/spirv_emit_context.cpp | 72 +++++----- .../backend/spirv/spirv_emit_context.h | 2 + .../frontend/copy_shader.cpp | 3 + .../frontend/translate/export.cpp | 63 +++++++-- .../frontend/translate/translate.cpp | 11 +- .../frontend/translate/translate.h | 1 + src/shader_recompiler/info.h | 2 - src/shader_recompiler/ir/attribute.cpp | 6 + src/shader_recompiler/ir/attribute.h | 3 + .../ir/passes/constant_propagation_pass.cpp | 43 +++++- .../ir/passes/resource_tracking_pass.cpp | 30 +++-- .../ir/passes/shader_info_collection_pass.cpp | 7 - src/shader_recompiler/ir/reinterpret.h | 5 +- src/shader_recompiler/runtime_info.h | 3 +- src/video_core/amdgpu/pixel_format.h | 6 + .../renderer_vulkan/vk_instance.cpp | 19 +++ src/video_core/renderer_vulkan/vk_instance.h | 6 + .../renderer_vulkan/vk_pipeline_cache.cpp | 3 +- .../renderer_vulkan/vk_rasterizer.cpp | 62 +++++---- .../renderer_vulkan/vk_rasterizer.h | 3 +- .../renderer_vulkan/vk_scheduler.cpp | 6 + src/video_core/renderer_vulkan/vk_scheduler.h | 9 ++ src/video_core/texture_cache/image.cpp | 31 +++-- src/video_core/texture_cache/image_info.h | 5 +- .../texture_cache/texture_cache.cpp | 6 + src/video_core/texture_cache/types.h | 6 +- 30 files changed, 369 insertions(+), 183 deletions(-) diff --git a/src/common/number_utils.cpp b/src/common/number_utils.cpp index af29e5cd3..660f539f9 100644 --- a/src/common/number_utils.cpp +++ b/src/common/number_utils.cpp @@ -158,4 +158,4 @@ float S16ToSnorm(s16 val) { return float(val * c); } -} // namespace NumberUtils \ No newline at end of file +} // namespace NumberUtils diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 98f2195c5..a5cdca8f1 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -305,19 +305,23 @@ void SetupCapabilities(const Info& info, const Profile& profile, const RuntimeIn runtime_info.fs_info.addr_flags.persp_sample_ena) { ctx.AddCapability(spv::Capability::SampleRateShading); } + if (info.loads.GetAny(IR::Attribute::RenderTargetIndex)) { + ctx.AddCapability(spv::Capability::Geometry); + } } if (stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) { ctx.AddCapability(spv::Capability::Tessellation); } if (stage == LogicalStage::Vertex || stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) { - if (info.has_layer_output) { + if (info.stores.GetAny(IR::Attribute::RenderTargetIndex)) { ctx.AddCapability(spv::Capability::ShaderLayer); } - if (info.has_viewport_index_output) { + if (info.stores.GetAny(IR::Attribute::ViewportIndex)) { ctx.AddCapability(spv::Capability::ShaderViewportIndex); } - } else if (stage == LogicalStage::Geometry && info.has_viewport_index_output) { + } else if (stage == LogicalStage::Geometry && + info.stores.GetAny(IR::Attribute::ViewportIndex)) { ctx.AddCapability(spv::Capability::MultiViewport); } if (info.uses_dma) { diff --git 
a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 6df8f74fd..554448b13 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -3,7 +3,6 @@ #include "common/assert.h" #include "common/config.h" -#include "common/logging/log.h" #include "shader_recompiler/backend/spirv/emit_spirv_bounds.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" @@ -14,55 +13,11 @@ #include namespace Shader::Backend::SPIRV { -namespace { -Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { - if (IR::IsParam(attr)) { - const u32 attr_index{u32(attr) - u32(IR::Attribute::Param0)}; - if (ctx.stage == Stage::Local) { - const auto component_ptr = ctx.TypePointer(spv::StorageClass::Output, ctx.F32[1]); - return ctx.OpAccessChain(component_ptr, ctx.output_attr_array, ctx.ConstU32(attr_index), - ctx.ConstU32(element)); - } else { - const auto& info{ctx.output_params.at(attr_index)}; - ASSERT(info.num_components > 0); - if (info.num_components == 1) { - return info.id; - } else { - return ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element)); - } - } - } - if (IR::IsMrt(attr)) { - const u32 index{u32(attr) - u32(IR::Attribute::RenderTarget0)}; - const auto& info{ctx.frag_outputs.at(index)}; - if (info.num_components == 1) { - return info.id; - } else { - return ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element)); - } - } - switch (attr) { - case IR::Attribute::Position0: - return ctx.OpAccessChain(ctx.output_f32, ctx.output_position, ctx.ConstU32(element)); - case IR::Attribute::ClipDistance: - return ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, ctx.ConstU32(element)); - case IR::Attribute::CullDistance: - return ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, ctx.ConstU32(element)); - case IR::Attribute::PointSize: - return ctx.output_point_size; - case IR::Attribute::RenderTargetIndex: - return ctx.output_layer; - case IR::Attribute::ViewportIndex: - return ctx.output_viewport_index; - case IR::Attribute::Depth: - return ctx.frag_depth; - default: - UNREACHABLE_MSG("Write attribute {}", attr); - } -} +using PointerType = EmitContext::PointerType; +using PointerSize = EmitContext::PointerSize; -std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr) { +static std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr) { if (IR::IsParam(attr)) { const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; const auto& info{ctx.output_params.at(index)}; @@ -82,15 +37,13 @@ std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr return {ctx.F32[1], false}; case IR::Attribute::RenderTargetIndex: case IR::Attribute::ViewportIndex: - return {ctx.S32[1], true}; + case IR::Attribute::SampleMask: + case IR::Attribute::StencilRef: + return {ctx.U32[1], true}; default: UNREACHABLE_MSG("Write attribute {}", attr); } } -} // Anonymous namespace - -using PointerType = EmitContext::PointerType; -using PointerSize = EmitContext::PointerSize; Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) { const u32 index = ctx.binding.user_data + ctx.info.ud_mask.Index(reg); @@ -212,6 +165,10 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { case IR::Attribute::IsFrontFace: return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1[1], 
ctx.front_facing), ctx.u32_one_value, ctx.u32_zero_value); + case IR::Attribute::SampleIndex: + return ctx.OpLoad(ctx.U32[1], ctx.sample_index); + case IR::Attribute::RenderTargetIndex: + return ctx.OpLoad(ctx.U32[1], ctx.output_layer); case IR::Attribute::PrimitiveId: return ctx.OpLoad(ctx.U32[1], ctx.primitive_id); case IR::Attribute::InvocationId: @@ -243,12 +200,62 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { } void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 element) { - const Id pointer{OutputAttrPointer(ctx, attr, element)}; - const auto [component_type, is_integer]{OutputAttrComponentType(ctx, attr)}; - if (is_integer) { - ctx.OpStore(pointer, ctx.OpBitcast(component_type, value)); - } else { - ctx.OpStore(pointer, value); + const auto op_store = [&](Id pointer) { + const auto [component_type, is_integer] = OutputAttrComponentType(ctx, attr); + if (is_integer) { + ctx.OpStore(pointer, ctx.OpBitcast(component_type, value)); + } else { + ctx.OpStore(pointer, value); + } + }; + if (IR::IsParam(attr)) { + const u32 attr_index{u32(attr) - u32(IR::Attribute::Param0)}; + if (ctx.stage == Stage::Local) { + const auto component_ptr = ctx.TypePointer(spv::StorageClass::Output, ctx.F32[1]); + return op_store(ctx.OpAccessChain(component_ptr, ctx.output_attr_array, + ctx.ConstU32(attr_index), ctx.ConstU32(element))); + } else { + const auto& info{ctx.output_params.at(attr_index)}; + ASSERT(info.num_components > 0); + if (info.num_components == 1) { + return op_store(info.id); + } else { + return op_store( + ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element))); + } + } + } + if (IR::IsMrt(attr)) { + const u32 index{u32(attr) - u32(IR::Attribute::RenderTarget0)}; + const auto& info{ctx.frag_outputs.at(index)}; + if (info.num_components == 1) { + return op_store(info.id); + } else { + return op_store(ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element))); + } + } + switch (attr) { + case IR::Attribute::Position0: + return op_store( + ctx.OpAccessChain(ctx.output_f32, ctx.output_position, ctx.ConstU32(element))); + case IR::Attribute::ClipDistance: + return op_store( + ctx.OpAccessChain(ctx.output_f32, ctx.clip_distances, ctx.ConstU32(element))); + case IR::Attribute::CullDistance: + return op_store( + ctx.OpAccessChain(ctx.output_f32, ctx.cull_distances, ctx.ConstU32(element))); + case IR::Attribute::PointSize: + return op_store(ctx.output_point_size); + case IR::Attribute::RenderTargetIndex: + return op_store(ctx.output_layer); + case IR::Attribute::ViewportIndex: + return op_store(ctx.output_viewport_index); + case IR::Attribute::Depth: + return op_store(ctx.frag_depth); + case IR::Attribute::SampleMask: + return op_store(ctx.OpAccessChain(ctx.output_u32, ctx.sample_mask, ctx.u32_zero_value)); + default: + UNREACHABLE_MSG("Write attribute {}", attr); } } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp index 440f80fa9..804d98b74 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp @@ -28,7 +28,7 @@ void ConvertDepthMode(EmitContext& ctx) { } void ConvertPositionToClipSpace(EmitContext& ctx) { - ASSERT_MSG(!ctx.info.has_viewport_index_output, + ASSERT_MSG(!ctx.info.stores.GetAny(IR::Attribute::ViewportIndex), "Multi-viewport with shader clip space conversion not yet implemented."); const Id type{ctx.F32[1]}; diff --git 
a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 131b475fc..4152420d0 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -370,13 +370,18 @@ void EmitContext::DefineInputs() { if (info.loads.GetAny(IR::Attribute::FragCoord)) { frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); } - if (info.stores.Get(IR::Attribute::Depth)) { - frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output); - } if (info.loads.Get(IR::Attribute::IsFrontFace)) { front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); } + if (info.loads.GetAny(IR::Attribute::RenderTargetIndex)) { + output_layer = DefineVariable(U32[1], spv::BuiltIn::Layer, spv::StorageClass::Input); + Decorate(output_layer, spv::Decoration::Flat); + } + if (info.loads.Get(IR::Attribute::SampleIndex)) { + sample_index = DefineVariable(U32[1], spv::BuiltIn::SampleId, spv::StorageClass::Input); + Decorate(sample_index, spv::Decoration::Flat); + } if (info.loads.GetAny(IR::Attribute::BaryCoordSmooth)) { if (profile.supports_amd_shader_explicit_vertex_parameter) { bary_coord_smooth = DefineVariable(F32[2], spv::BuiltIn::BaryCoordSmoothAMD, @@ -560,11 +565,11 @@ void EmitContext::DefineVertexBlock() { DefineVariable(F32[1], spv::BuiltIn::PointSize, spv::StorageClass::Output); } if (info.stores.GetAny(IR::Attribute::RenderTargetIndex)) { - output_layer = DefineVariable(S32[1], spv::BuiltIn::Layer, spv::StorageClass::Output); + output_layer = DefineVariable(U32[1], spv::BuiltIn::Layer, spv::StorageClass::Output); } if (info.stores.GetAny(IR::Attribute::ViewportIndex)) { output_viewport_index = - DefineVariable(S32[1], spv::BuiltIn::ViewportIndex, spv::StorageClass::Output); + DefineVariable(U32[1], spv::BuiltIn::ViewportIndex, spv::StorageClass::Output); } } @@ -646,6 +651,13 @@ void EmitContext::DefineOutputs() { break; } case LogicalStage::Fragment: { + if (info.stores.Get(IR::Attribute::Depth)) { + frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output); + } + if (info.stores.Get(IR::Attribute::SampleMask)) { + sample_mask = DefineVariable(TypeArray(U32[1], u32_one_value), spv::BuiltIn::SampleMask, + spv::StorageClass::Output); + } u32 num_render_targets = 0; for (u32 i = 0; i < IR::NumRenderTargets; i++) { const IR::Attribute mrt{IR::Attribute::RenderTarget0 + i}; @@ -1080,36 +1092,26 @@ Id EmitContext::DefineUfloatM5ToFloat32(u32 mantissa_bits, const std::string_vie Name(func, name); AddLabel(); - const auto raw_mantissa{ - OpBitFieldUExtract(U32[1], value, ConstU32(0U), ConstU32(mantissa_bits))}; - const auto mantissa{OpConvertUToF(F32[1], raw_mantissa)}; - const auto exponent{OpBitcast( - S32[1], OpBitFieldSExtract(U32[1], value, ConstU32(mantissa_bits), ConstU32(5U)))}; - - const auto is_exp_neg_one{OpIEqual(U1[1], exponent, ConstS32(-1))}; - const auto is_exp_zero{OpIEqual(U1[1], exponent, ConstS32(0))}; - - const auto is_zero{OpIEqual(U1[1], value, ConstU32(0u))}; - const auto is_nan{ - OpLogicalAnd(U1[1], is_exp_neg_one, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))}; - const auto is_inf{ - OpLogicalAnd(U1[1], is_exp_neg_one, OpIEqual(U1[1], raw_mantissa, ConstU32(0u)))}; - const auto is_denorm{ - OpLogicalAnd(U1[1], is_exp_zero, OpINotEqual(U1[1], raw_mantissa, ConstU32(0u)))}; - - const auto denorm{OpFMul(F32[1], mantissa, ConstF32(1.f / (1 << 
20)))}; - const auto norm{OpLdexp( - F32[1], - OpFAdd(F32[1], - OpFMul(F32[1], mantissa, ConstF32(1.f / static_cast(1 << mantissa_bits))), - ConstF32(1.f)), - exponent)}; - - const auto result{OpSelect(F32[1], is_zero, ConstF32(0.f), - OpSelect(F32[1], is_nan, ConstF32(NAN), - OpSelect(F32[1], is_inf, ConstF32(INFINITY), - OpSelect(F32[1], is_denorm, denorm, norm))))}; - + const Id exponent{OpBitFieldUExtract(U32[1], value, ConstU32(mantissa_bits), ConstU32(5U))}; + const Id mantissa{OpBitFieldUExtract(U32[1], value, ConstU32(0U), ConstU32(mantissa_bits))}; + const Id mantissa_f{OpConvertUToF(F32[1], mantissa)}; + const Id a{OpSelect(F32[1], OpINotEqual(U1[1], mantissa, u32_zero_value), + OpFMul(F32[1], ConstF32(1.f / (1 << (14 + mantissa_bits))), mantissa_f), + f32_zero_value)}; + const Id b{OpBitcast(F32[1], OpBitwiseOr(U32[1], mantissa, ConstU32(0x7f800000U)))}; + const Id exponent_c{OpISub(U32[1], exponent, ConstU32(15U))}; + const Id scale_a{ + OpFDiv(F32[1], ConstF32(1.f), + OpConvertUToF(F32[1], OpShiftLeftLogical(U32[1], u32_one_value, + OpSNegate(U32[1], exponent_c))))}; + const Id scale_b{OpConvertUToF(F32[1], OpShiftLeftLogical(U32[1], u32_one_value, exponent_c))}; + const Id scale{ + OpSelect(F32[1], OpSLessThan(U1[1], exponent_c, u32_zero_value), scale_a, scale_b)}; + const Id c{OpFMul(F32[1], scale, + OpFAdd(F32[1], ConstF32(1.f), + OpFDiv(F32[1], mantissa_f, ConstF32(f32(1 << mantissa_bits)))))}; + const Id result{OpSelect(F32[1], OpIEqual(U1[1], exponent, u32_zero_value), a, + OpSelect(F32[1], OpIEqual(U1[1], exponent, ConstU32(31U)), b, c))}; OpReturnValue(result); OpFunctionEnd(); return func; diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 4daba8903..9bb2b7d7a 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -256,6 +256,8 @@ public: Id frag_coord{}; Id front_facing{}; Id frag_depth{}; + Id sample_mask{}; + Id sample_index{}; Id clip_distances{}; Id cull_distances{}; diff --git a/src/shader_recompiler/frontend/copy_shader.cpp b/src/shader_recompiler/frontend/copy_shader.cpp index 52b433dbc..795003e43 100644 --- a/src/shader_recompiler/frontend/copy_shader.cpp +++ b/src/shader_recompiler/frontend/copy_shader.cpp @@ -49,6 +49,9 @@ CopyShaderData ParseCopyShader(std::span code) { const auto& exp = inst.control.exp; const IR::Attribute semantic = static_cast(exp.target); for (int i = 0; i < inst.src_count; ++i) { + if ((exp.en & (1 << i)) == 0) { + continue; + } const auto ofs = offsets[inst.src[i].code]; if (ofs != -1) { data.attr_map[ofs] = {semantic, i}; diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp index 9dccf1105..e1e39105f 100644 --- a/src/shader_recompiler/frontend/translate/export.cpp +++ b/src/shader_recompiler/frontend/translate/export.cpp @@ -22,7 +22,7 @@ static AmdGpu::NumberFormat NumberFormatCompressed( case AmdGpu::Liverpool::ShaderExportFormat::ABGR_SINT16: return AmdGpu::NumberFormat::Sint; default: - UNREACHABLE_MSG("Unimplemented compressed MRT export format {}", + UNREACHABLE_MSG("Unimplemented compressed export format {}", static_cast(export_format)); } } @@ -42,7 +42,7 @@ static u32 MaskFromExportFormat(u8 mask, AmdGpu::Liverpool::ShaderExportFormat e // All components return mask; default: - UNREACHABLE_MSG("Unimplemented uncompressed MRT export format {}", + UNREACHABLE_MSG("Unimplemented uncompressed export 
format {}", static_cast(export_format)); } } @@ -118,25 +118,68 @@ void Translator::ExportRenderTarget(const GcnInst& inst) { } } +void Translator::ExportDepth(const GcnInst& inst) { + const auto& exp = inst.control.exp; + if (exp.en == 0) { + // No export + return; + } + + std::array components{}; + if (exp.compr) { + // Components are float16 packed into a VGPR + const auto num_format = NumberFormatCompressed(runtime_info.fs_info.z_export_format); + // Export R, G + if (exp.en & 1) { + const IR::Value unpacked_value = + ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[0].code))); + components[0] = IR::F32{ir.CompositeExtract(unpacked_value, 0)}; + components[1] = IR::F32{ir.CompositeExtract(unpacked_value, 1)}; + } + // Export B, A + if ((exp.en >> 2) & 1) { + const IR::Value unpacked_value = + ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[1].code))); + components[2] = IR::F32{ir.CompositeExtract(unpacked_value, 0)}; + // components[3] = IR::F32{ir.CompositeExtract(unpacked_value, 1)}; + } + } else { + // Components are float32 into separate VGPRS + u32 mask = MaskFromExportFormat(exp.en, runtime_info.fs_info.z_export_format); + for (u32 i = 0; i < 4; i++, mask >>= 1) { + if ((mask & 1) == 0) { + continue; + } + components[i] = ir.GetVectorReg(IR::VectorReg(inst.src[i].code)); + } + } + + static constexpr std::array MrtzBuiltins = {IR::Attribute::Depth, IR::Attribute::StencilRef, + IR::Attribute::SampleMask, IR::Attribute::Null}; + for (u32 i = 0; i < 4; ++i) { + if (components[i].IsEmpty()) { + continue; + } + ir.SetAttribute(MrtzBuiltins[i], components[i]); + } +} + void Translator::EmitExport(const GcnInst& inst) { if (info.stage == Stage::Fragment && inst.control.exp.vm) { ir.Discard(ir.LogicalNot(ir.GetExec())); } - const auto& exp = inst.control.exp; - const IR::Attribute attrib{exp.target}; + const IR::Attribute attrib{inst.control.exp.target}; if (IR::IsMrt(attrib)) { return ExportRenderTarget(inst); } - - if (attrib == IR::Attribute::Depth && exp.en != 0 && exp.en != 1) { - LOG_WARNING(Render_Vulkan, "Unsupported depth export"); - return; + if (attrib == IR::Attribute::Depth) { + return ExportDepth(inst); } - ASSERT_MSG(!exp.compr, "Compressed exports only supported for render targets"); + ASSERT_MSG(!inst.control.exp.compr, "Compressed exports only supported for render targets"); - u32 mask = exp.en; + u32 mask = inst.control.exp.en; for (u32 i = 0; i < 4; i++, mask >>= 1) { if ((mask & 1) == 0) { continue; diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index e7a7b3be6..9e42ebea9 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -171,6 +171,13 @@ void Translator::EmitPrologue(IR::Block* first_block) { ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); } } + if (runtime_info.fs_info.addr_flags.ancillary_ena) { + if (runtime_info.fs_info.en_flags.ancillary_ena) { + ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::PackedAncillary)); + } else { + ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); + } + } break; case LogicalStage::TessellationControl: { ir.SetVectorReg(IR::VectorReg::V0, ir.GetAttributeU32(IR::Attribute::PrimitiveId)); @@ -460,7 +467,7 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { result = ir.FPMul(result, ir.Imm32(operand.output_modifier.multiplier)); } if (operand.output_modifier.clamp) { - result = ir.FPSaturate(value); + result = 
ir.FPSaturate(result); } } @@ -490,7 +497,7 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra ir.FPMul(value_untyped, ir.Imm64(f64(operand.output_modifier.multiplier))); } if (operand.output_modifier.clamp) { - value_untyped = ir.FPSaturate(value_raw); + value_untyped = ir.FPSaturate(value_untyped); } } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index dad2cc829..b3b6a3977 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -319,6 +319,7 @@ private: const IR::F32& x_res, const IR::F32& y_res, const IR::F32& z_res); void ExportRenderTarget(const GcnInst& inst); + void ExportDepth(const GcnInst& inst); void LogMissingOpcode(const GcnInst& inst); IR::VectorReg GetScratchVgpr(u32 offset); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 689264c6a..ccf2c45e0 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -210,8 +210,6 @@ struct Info { bool has_bitwise_xor{}; bool has_image_gather{}; bool has_image_query{}; - bool has_layer_output{}; - bool has_viewport_index_output{}; bool uses_buffer_atomic_float_min_max{}; bool uses_image_atomic_float_min_max{}; bool uses_lane_id{}; diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp index 388f8de8c..382f9b1d9 100644 --- a/src/shader_recompiler/ir/attribute.cpp +++ b/src/shader_recompiler/ir/attribute.cpp @@ -160,6 +160,12 @@ std::string NameOf(Attribute attribute) { return "TessFactorsBufferBase"; case Attribute::PointSize: return "PointSize"; + case Attribute::StencilRef: + return "StencilRef"; + case Attribute::SampleMask: + return "SampleMask"; + case Attribute::PackedAncillary: + return "PackedAncillary"; default: break; } diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h index 28950ab52..c8a6e6b20 100644 --- a/src/shader_recompiler/ir/attribute.h +++ b/src/shader_recompiler/ir/attribute.h @@ -88,6 +88,9 @@ enum class Attribute : u64 { OffChipLdsBase = 91, TessFactorsBufferBase = 92, PointSize = 93, + StencilRef = 94, + SampleMask = 95, + PackedAncillary = 96, Max, }; diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp index b877a6e87..5f9a3cc55 100644 --- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp @@ -257,12 +257,50 @@ void FoldCmpClass(IR::Block& block, IR::Inst& inst) { IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; const IR::F32 value = IR::F32{inst.Arg(0)}; inst.ReplaceUsesWithAndRemove( - ir.LogicalNot(ir.LogicalOr(ir.FPIsInf(value), ir.FPIsInf(value)))); + ir.LogicalNot(ir.LogicalOr(ir.FPIsNan(value), ir.FPIsInf(value)))); } else { UNREACHABLE(); } } +bool FoldPackedAncillary(IR::Block& block, IR::Inst& inst) { + if (inst.Arg(0).IsImmediate() || !inst.Arg(1).IsImmediate() || !inst.Arg(2).IsImmediate()) { + return false; + } + IR::Inst* value = inst.Arg(0).InstRecursive(); + if (value->GetOpcode() != IR::Opcode::GetAttributeU32 || + value->Arg(0).Attribute() != IR::Attribute::PackedAncillary) { + return false; + } + const u32 offset = inst.Arg(1).U32(); + const u32 bits = inst.Arg(2).U32(); + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + if (offset >= 8 && offset + bits <= 12) { + const auto 
sample_index = ir.GetAttributeU32(IR::Attribute::SampleIndex); + if (offset == 8 && bits == 4) { + inst.ReplaceUsesWithAndRemove(sample_index); + } else { + inst.ReplaceUsesWithAndRemove( + ir.BitFieldExtract(sample_index, ir.Imm32(offset - 8), ir.Imm32(bits))); + } + } else if (offset >= 16 && offset + bits <= 27) { + const auto mrt_index = ir.GetAttributeU32(IR::Attribute::RenderTargetIndex); + if (offset == 16 && bits == 11) { + inst.ReplaceUsesWithAndRemove(mrt_index); + } else { + inst.ReplaceUsesWithAndRemove( + ir.BitFieldExtract(mrt_index, ir.Imm32(offset - 16), ir.Imm32(bits))); + } + } else { + UNREACHABLE_MSG("Unhandled bitfield extract from ancillary VGPR offset={}, bits={}", offset, + bits); + } + + value->ReplaceUsesWithAndRemove(ir.Imm32(0U)); + + return true; +} + void ConstantPropagation(IR::Block& block, IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::IAdd32: @@ -475,6 +513,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { FoldWhenAllImmediates(inst, [](u64 a) { return static_cast(std::popcount(a)); }); return; case IR::Opcode::BitFieldUExtract: + if (FoldPackedAncillary(block, inst)) { + return; + } FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) { if (static_cast(shift) + static_cast(count) > 32) { UNREACHABLE_MSG("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract, diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 625c8676e..38aad55c4 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -934,14 +934,25 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info, } }(); - const auto unnormalized = sampler.force_unnormalized || inst_info.is_unnormalized; - // Query dimensions of image if needed for normalization. - // We can't use the image sharp because it could be bound to a different image later. + const bool is_msaa = view_type == AmdGpu::ImageType::Color2DMsaa || + view_type == AmdGpu::ImageType::Color2DMsaaArray; + const bool unnormalized = sampler.force_unnormalized || inst_info.is_unnormalized; + const bool needs_dimentions = (!is_msaa && unnormalized) || (is_msaa && !unnormalized); const auto dimensions = - unnormalized ? ir.ImageQueryDimension(handle, ir.Imm32(0u), ir.Imm1(false), inst_info) - : IR::Value{}; + needs_dimentions ? ir.ImageQueryDimension(handle, ir.Imm32(0u), ir.Imm1(false), inst_info) + : IR::Value{}; const auto get_coord = [&](u32 coord_idx, u32 dim_idx) -> IR::Value { const auto coord = get_addr_reg(coord_idx); + if (is_msaa) { + // For MSAA images preserve the unnormalized coord or manually unnormalize it + if (unnormalized) { + return ir.ConvertFToU(32, coord); + } else { + const auto dim = + ir.ConvertUToF(32, 32, IR::U32{ir.CompositeExtract(dimensions, dim_idx)}); + return ir.ConvertFToU(32, ir.FPMul(coord, dim)); + } + } if (unnormalized) { // Normalize the coordinate for sampling, dividing by its corresponding dimension. 
const auto dim = @@ -958,12 +969,10 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info, addr_reg = addr_reg + 1; return get_coord(addr_reg - 1, 0); case AmdGpu::ImageType::Color1DArray: // x, slice - [[fallthrough]]; - case AmdGpu::ImageType::Color2D: // x, y + case AmdGpu::ImageType::Color2D: // x, y + case AmdGpu::ImageType::Color2DMsaa: // x, y addr_reg = addr_reg + 2; return ir.CompositeConstruct(get_coord(addr_reg - 2, 0), get_coord(addr_reg - 1, 1)); - case AmdGpu::ImageType::Color2DMsaa: // x, y, frag - [[fallthrough]]; case AmdGpu::ImageType::Color2DArray: // x, y, slice addr_reg = addr_reg + 3; // Note we can use FixCubeCoords with fallthrough cases since it checks for image type. @@ -986,6 +995,9 @@ void PatchImageSampleArgs(IR::Block& block, IR::Inst& inst, Info& info, const IR::F32 lod_clamp = inst_info.has_lod_clamp ? get_addr_reg(addr_reg++) : IR::F32{}; auto texel = [&] -> IR::Value { + if (is_msaa) { + return ir.ImageRead(handle, coords, ir.Imm32(0U), ir.Imm32(0U), inst_info); + } if (inst_info.is_gather) { if (inst_info.is_depth) { return ir.ImageGatherDref(handle, coords, offset, dref, inst_info); diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index a7108a5ef..8f0e61da2 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -160,13 +160,6 @@ void CollectShaderInfoPass(IR::Program& program, const Profile& profile) { } } - if (info.stores.GetAny(IR::Attribute::RenderTargetIndex)) { - info.has_layer_output = true; - } - if (info.stores.GetAny(IR::Attribute::ViewportIndex)) { - info.has_viewport_index_output = true; - } - // In case Flatbuf has not already been bound by IR and is needed // to query buffer sizes, bind it now. 
if (!profile.supports_robust_buffer_access && !info.uses_dma) { diff --git a/src/shader_recompiler/ir/reinterpret.h b/src/shader_recompiler/ir/reinterpret.h index 10728d8dd..84a4a51d5 100644 --- a/src/shader_recompiler/ir/reinterpret.h +++ b/src/shader_recompiler/ir/reinterpret.h @@ -22,7 +22,7 @@ inline Value ApplySwizzle(IREmitter& ir, const Value& vector, const AmdGpu::Comp } /// Converts gamma corrected value to linear space -inline F32 ApplyGammaToLinear(IREmitter& ir, F32& c) { +inline F32 ApplyGammaToLinear(IREmitter& ir, const F32& c) { const F32 a = ir.FPPow(ir.FPMul(ir.FPAdd(c, ir.Imm32(0.055f)), ir.Imm32(1.0f / 1.055f)), ir.Imm32(2.4f)); const F32 b = ir.FPMul(c, ir.Imm32(1.0f / 12.92f)); @@ -80,6 +80,9 @@ inline F32 ApplyReadNumberConversion(IREmitter& ir, const F32& value, const auto float_val = ir.ConvertUToF(32, 32, ir.BitCast(value)); return ir.FPDiv(float_val, ir.Imm32(static_cast(std::numeric_limits::max()))); } + case AmdGpu::NumberConversion::SrgbToNorm: { + return ApplyGammaToLinear(ir, value); + } default: UNREACHABLE(); } diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index f532dcbad..53e4ecd11 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -205,12 +205,13 @@ struct FragmentRuntimeInfo { u32 num_inputs; std::array inputs; std::array color_buffers; + AmdGpu::Liverpool::ShaderExportFormat z_export_format; bool dual_source_blending; bool operator==(const FragmentRuntimeInfo& other) const noexcept { return std::ranges::equal(color_buffers, other.color_buffers) && en_flags.raw == other.en_flags.raw && addr_flags.raw == other.addr_flags.raw && - num_inputs == other.num_inputs && + num_inputs == other.num_inputs && z_export_format == other.z_export_format && dual_source_blending == other.dual_source_blending && std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs, other.inputs.begin(), other.inputs.begin() + num_inputs); diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 27c9313a2..21c2eee2a 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -102,6 +102,7 @@ enum class NumberConversion : u32 { Sint8ToSnormNz = 4, Sint16ToSnormNz = 5, Uint32ToUnorm = 6, + SrgbToNorm = 7, }; union CompMapping { @@ -219,6 +220,8 @@ constexpr NumberFormat RemapNumberFormat(const NumberFormat format, const DataFo return format; } } + case NumberFormat::Srgb: + return data_format == DataFormat::FormatBc6 ? NumberFormat::Unorm : format; case NumberFormat::Uscaled: return NumberFormat::Uint; case NumberFormat::Sscaled: @@ -295,6 +298,9 @@ constexpr NumberConversion MapNumberConversion(const NumberFormat num_fmt, return NumberConversion::None; } } + case NumberFormat::Srgb: + return data_fmt == DataFormat::FormatBc6 ? 
NumberConversion::SrgbToNorm + : NumberConversion::None; case NumberFormat::Uscaled: return NumberConversion::UintToUscaled; case NumberFormat::Sscaled: diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index d13aeec99..5206edbec 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -254,6 +254,15 @@ bool Instance::CreateDevice() { // Optional maintenance_8 = add_extension(VK_KHR_MAINTENANCE_8_EXTENSION_NAME); + attachment_feedback_loop = add_extension(VK_EXT_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_EXTENSION_NAME); + if (attachment_feedback_loop) { + attachment_feedback_loop = + add_extension(VK_EXT_ATTACHMENT_FEEDBACK_LOOP_DYNAMIC_STATE_EXTENSION_NAME); + if (!attachment_feedback_loop) { + // We want both extensions so remove the first if the second isn't available + enabled_extensions.pop_back(); + } + } depth_range_unrestricted = add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); dynamic_state_3 = add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); if (dynamic_state_3) { @@ -464,6 +473,12 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceMaintenance8FeaturesKHR{ .maintenance8 = true, }, + vk::PhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT{ + .attachmentFeedbackLoopLayout = true, + }, + vk::PhysicalDeviceAttachmentFeedbackLoopDynamicStateFeaturesEXT{ + .attachmentFeedbackLoopDynamicState = true, + }, vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{ .shaderBufferFloat32AtomicMinMax = shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax, @@ -535,6 +550,10 @@ bool Instance::CreateDevice() { if (!maintenance_8) { device_chain.unlink(); } + if (!attachment_feedback_loop) { + device_chain.unlink(); + device_chain.unlink(); + } if (!shader_atomic_float2) { device_chain.unlink(); } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index e1fa180fb..09f68d764 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -114,6 +114,11 @@ public: return maintenance_8; } + /// Returns true if VK_EXT_attachment_feedback_loop_layout is supported + bool IsAttachmentFeedbackLoopLayoutSupported() const { + return attachment_feedback_loop; + } + /// Returns true when VK_EXT_custom_border_color is supported bool IsCustomBorderColorSupported() const { return custom_border_color; @@ -475,6 +480,7 @@ private: bool workgroup_memory_explicit_layout{}; bool portability_subset{}; bool maintenance_8{}; + bool attachment_feedback_loop{}; bool supports_memory_budget{}; u64 total_memory_budget{}; std::vector valid_heaps; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 89b48f0e4..c250f4d13 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -167,8 +167,8 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS BuildCommon(regs.ps_program); info.fs_info.en_flags = regs.ps_input_ena; info.fs_info.addr_flags = regs.ps_input_addr; - const auto& ps_inputs = regs.ps_inputs; info.fs_info.num_inputs = regs.num_interp; + info.fs_info.z_export_format = regs.z_export_format; const auto& cb0_blend = regs.blend_control[0]; if (cb0_blend.enable) { info.fs_info.dual_source_blending = @@ -182,6 +182,7 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS } else { 
info.fs_info.dual_source_blending = false; } + const auto& ps_inputs = regs.ps_inputs; for (u32 i = 0; i < regs.num_interp; i++) { info.fs_info.inputs[i] = { .param_index = u8(ps_inputs[i].input_offset.Value()), diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index b7cb570f4..3ff78f967 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -251,15 +251,14 @@ void Rasterizer::EliminateFastClear() { if (!col_buf || !col_buf.info.fast_clear) { return; } + VideoCore::TextureCache::RenderTargetDesc desc(col_buf, liverpool->last_cb_extent[0]); + const auto& image_view = texture_cache.FindRenderTarget(desc); if (!texture_cache.IsMetaCleared(col_buf.CmaskAddress(), col_buf.view.slice_start)) { return; } for (u32 slice = col_buf.view.slice_start; slice <= col_buf.view.slice_max; ++slice) { texture_cache.TouchMeta(col_buf.CmaskAddress(), slice, false); } - const auto& hint = liverpool->last_cb_extent[0]; - VideoCore::TextureCache::RenderTargetDesc desc(col_buf, hint); - const auto& image_view = texture_cache.FindRenderTarget(desc); auto& image = texture_cache.GetImage(image_view.image_id); const vk::ImageSubresourceRange range = { .aspectMask = vk::ImageAspectFlagBits::eColor, @@ -723,11 +722,6 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin // to force general layout on it. image->binding.force_general |= image_desc.is_written; } - if (image->binding.is_target) { - // The image is already bound as target. Since we read and output to it need to force - // general layout too. - image->binding.force_general = 1u; - } image->binding.is_bound = 1u; } @@ -754,8 +748,15 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin auto& image = texture_cache.GetImage(image_id); auto& image_view = texture_cache.FindTexture(image_id, desc); - if (image.binding.force_general || image.binding.is_target) { - image.Transit(vk::ImageLayout::eGeneral, + // The image is either bound as storage in a separate descriptor or bound as render + // target in feedback loop. Depth images are excluded because they can't be bound as + // storage and feedback loop doesn't make sense for them + if ((image.binding.force_general || image.binding.is_target) && + !image.info.props.is_depth) { + image.Transit(instance.IsAttachmentFeedbackLoopLayoutSupported() && + image.binding.is_target + ? vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT + : vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | (image.info.props.is_depth ? 
vk::AccessFlagBits2::eDepthStencilAttachmentWrite @@ -816,6 +817,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& state) { int cb_index = 0; + attachment_feedback_loop = false; for (auto attach_idx = 0u; attach_idx < state.num_color_attachments; ++attach_idx) { if (state.color_attachments[attach_idx].imageView == VK_NULL_HANDLE) { continue; @@ -835,11 +837,14 @@ void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& s state.height = std::min(state.height, std::max(image.info.size.height >> mip, 1u)); } auto& image = texture_cache.GetImage(image_id); - if (image.binding.force_general) { - image.Transit( - vk::ImageLayout::eGeneral, - vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eShaderRead, {}); - + if (image.binding.is_bound) { + ASSERT_MSG(!image.binding.force_general, + "Having image both as storage and render target is unsupported"); + image.Transit(instance.IsAttachmentFeedbackLoopLayoutSupported() + ? vk::ImageLayout::eAttachmentFeedbackLoopOptimalEXT + : vk::ImageLayout::eGeneral, + vk::AccessFlagBits2::eColorAttachmentWrite, {}); + attachment_feedback_loop = true; } else { image.Transit(vk::ImageLayout::eColorAttachmentOptimal, vk::AccessFlagBits2::eColorAttachmentWrite | @@ -859,23 +864,15 @@ void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline, RenderState& s if (has_stencil) { image.aspect_mask |= vk::ImageAspectFlagBits::eStencil; } - if (image.binding.force_general) { - image.Transit(vk::ImageLayout::eGeneral, - vk::AccessFlagBits2::eDepthStencilAttachmentWrite | - vk::AccessFlagBits2::eShaderRead, - {}); - } else { - const auto new_layout = desc.view_info.is_storage - ? has_stencil - ? vk::ImageLayout::eDepthStencilAttachmentOptimal - : vk::ImageLayout::eDepthAttachmentOptimal - : has_stencil ? vk::ImageLayout::eDepthStencilReadOnlyOptimal - : vk::ImageLayout::eDepthReadOnlyOptimal; - image.Transit(new_layout, - vk::AccessFlagBits2::eDepthStencilAttachmentWrite | - vk::AccessFlagBits2::eDepthStencilAttachmentRead, - desc.view_info.range); - } + const auto new_layout = desc.view_info.is_storage + ? has_stencil ? vk::ImageLayout::eDepthStencilAttachmentOptimal + : vk::ImageLayout::eDepthAttachmentOptimal + : has_stencil ? vk::ImageLayout::eDepthStencilReadOnlyOptimal + : vk::ImageLayout::eDepthReadOnlyOptimal; + image.Transit(new_layout, + vk::AccessFlagBits2::eDepthStencilAttachmentWrite | + vk::AccessFlagBits2::eDepthStencilAttachmentRead, + desc.view_info.range); state.depth_attachment.imageLayout = image.last_state.layout; state.stencil_attachment.imageLayout = image.last_state.layout; image.usage.depth_target = true; @@ -1101,6 +1098,7 @@ void Rasterizer::UpdateDynamicState(const GraphicsPipeline& pipeline, const bool auto& dynamic_state = scheduler.GetDynamicState(); dynamic_state.SetBlendConstants(liverpool->regs.blend_constants); dynamic_state.SetColorWriteMasks(pipeline.GetWriteMasks()); + dynamic_state.SetAttachmentFeedbackLoopEnabled(attachment_feedback_loop); // Commit new dynamic state to the command buffer. 
dynamic_state.Commit(instance, scheduler.CommandBuffer()); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a6848d527..b32cfa424 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -143,7 +143,8 @@ private: boost::container::static_vector buffer_bindings; using ImageBindingInfo = std::pair; boost::container::static_vector image_bindings; - bool fault_process_pending{false}; + bool fault_process_pending{}; + bool attachment_feedback_loop{}; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index a34bb15ad..f1e5937fe 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -333,6 +333,12 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmd dirty_state.line_width = false; cmdbuf.setLineWidth(line_width); } + if (dirty_state.feedback_loop_enabled && instance.IsAttachmentFeedbackLoopLayoutSupported()) { + dirty_state.feedback_loop_enabled = false; + cmdbuf.setAttachmentFeedbackLoopEnableEXT(feedback_loop_enabled + ? vk::ImageAspectFlagBits::eColor + : vk::ImageAspectFlagBits::eNone); + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 7dbc2b260..ef0f84822 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -113,6 +113,7 @@ struct DynamicState { bool blend_constants : 1; bool color_write_masks : 1; bool line_width : 1; + bool feedback_loop_enabled : 1; } dirty_state{}; Viewports viewports{}; @@ -149,6 +150,7 @@ struct DynamicState { std::array blend_constants{}; ColorWriteMasks color_write_masks{}; float line_width{}; + bool feedback_loop_enabled{}; /// Commits the dynamic state to the provided command buffer. void Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf); @@ -324,6 +326,13 @@ struct DynamicState { dirty_state.line_width = true; } } + + void SetAttachmentFeedbackLoopEnabled(const bool enabled) { + if (feedback_loop_enabled != enabled) { + feedback_loop_enabled = enabled; + dirty_state.feedback_loop_enabled = true; + } + } }; class Scheduler { diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 8dda6aa18..a0daab362 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -14,7 +14,8 @@ namespace VideoCore { using namespace Vulkan; -static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { +static vk::ImageUsageFlags ImageUsageFlags(const Vulkan::Instance* instance, + const ImageInfo& info) { vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; @@ -23,13 +24,12 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { usage |= vk::ImageUsageFlagBits::eDepthStencilAttachment; } else { usage |= vk::ImageUsageFlagBits::eColorAttachment; - - // In cases where an image is created as a render/depth target and cleared with compute, - // we cannot predict whether it will be used as a storage image. A proper solution would - // involve re-creating the resource with a new configuration and copying previous - // content into it. 
However, for now, we will set storage usage for all images (if the - // format allows), sacrificing a bit of performance. Note use of ExtendedUsage flag set - // by default. + if (instance->IsAttachmentFeedbackLoopLayoutSupported()) { + usage |= vk::ImageUsageFlagBits::eAttachmentFeedbackLoopEXT; + } + // Always create images with storage flag to avoid needing re-creation in case of e.g + // compute clears This sacrifices a bit of performance but is less work. ExtendedUsage + // flag is also used. usage |= vk::ImageUsageFlagBits::eStorage; } } @@ -128,7 +128,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible; } - usage_flags = ImageUsageFlags(info); + usage_flags = ImageUsageFlags(instance, info); format_features = FormatFeatureFlags(usage_flags); switch (info.pixel_format) { @@ -348,11 +348,16 @@ void Image::CopyImage(Image& src_image) { const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels); ASSERT(src_info.resources.layers == info.resources.layers || num_mips == 1); + const u32 width = src_info.size.width; + const u32 height = src_info.size.height; + const u32 depth = + info.type == AmdGpu::ImageType::Color3D ? info.size.depth : src_info.size.depth; + boost::container::small_vector image_copies; for (u32 mip = 0; mip < num_mips; ++mip) { - const auto mip_w = std::max(src_info.size.width >> mip, 1u); - const auto mip_h = std::max(src_info.size.height >> mip, 1u); - const auto mip_d = std::max(src_info.size.depth >> mip, 1u); + const auto mip_w = std::max(width >> mip, 1u); + const auto mip_h = std::max(height >> mip, 1u); + const auto mip_d = std::max(depth >> mip, 1u); image_copies.emplace_back(vk::ImageCopy{ .srcSubresource{ @@ -365,7 +370,7 @@ void Image::CopyImage(Image& src_image) { .aspectMask = aspect_mask & ~vk::ImageAspectFlagBits::eStencil, .mipLevel = mip, .baseArrayLayer = 0, - .layerCount = src_info.resources.layers, + .layerCount = info.resources.layers, }, .extent = {mip_w, mip_h, mip_d}, }); diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index 00f56b1c7..583b0d7fa 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -45,8 +45,9 @@ struct ImageInfo { bool IsTiled() const { return tile_mode != AmdGpu::TileMode::DisplayLinearAligned; } - Extent3D BlockDim() const { - return props.is_block ? Extent3D{size.width >> 2, size.height >> 2, size.depth} : size; + Extent2D BlockDim() const { + const auto dim = props.is_block ? 2 : 0; + return Extent2D{size.width >> dim, size.height >> dim}; } s32 MipOf(const ImageInfo& info) const; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 2da037a6e..f39bc16fd 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -304,6 +304,12 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag return {ExpandImage(image_info, cache_image_id), -1, -1}; } + if (image_info.guest_size == tex_cache_image.info.guest_size && + (image_info.type == AmdGpu::ImageType::Color3D || + tex_cache_image.info.type == AmdGpu::ImageType::Color3D)) { + return {ExpandImage(image_info, cache_image_id), -1, -1}; + } + // Size and resources are less than or equal, use image view. 
     if (image_info.pixel_format != tex_cache_image.info.pixel_format ||
         image_info.guest_size <= tex_cache_image.info.guest_size) {
diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h
index e20d4dcd0..097fdcb96 100644
--- a/src/video_core/texture_cache/types.h
+++ b/src/video_core/texture_cache/types.h
@@ -30,6 +30,10 @@ struct Region2D {
 struct Extent2D {
     u32 width;
     u32 height;
+
+    bool operator==(const Extent2D& other) const {
+        return width == other.width && height == other.height;
+    }
 };
 
 struct Extent3D {
@@ -37,8 +41,6 @@ struct Extent3D {
     u32 height;
     u32 depth;
 
-    auto operator<=>(const Extent3D&) const = default;
-
     bool operator==(const Extent3D& other) const {
         return width == other.width && height == other.height && depth == other.depth;
     }
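
A minimal C++ sketch of the UF11/UF10 decode that the reworked DefineUfloatM5ToFloat32
helper above now emits in SPIR-V (for the "spirv: Fix f11/f10 conversion to f32" item);
DecodeUfloatM5 is an illustrative name and this reference code is not part of the patch:

#include <bit>
#include <cmath>
#include <cstdint>

// Decodes an unsigned small float with `mantissa_bits` mantissa bits and a 5-bit
// exponent (bias 15), e.g. the 11-/10-bit components of packed float formats.
static float DecodeUfloatM5(uint32_t value, uint32_t mantissa_bits) {
    const uint32_t mantissa = value & ((1u << mantissa_bits) - 1u);
    const uint32_t exponent = (value >> mantissa_bits) & 0x1fu;
    if (exponent == 0u) {
        // Zero or denormal: no implicit leading one, fixed 2^-14 scale.
        return mantissa == 0u ? 0.f : float(mantissa) / float(1u << (14u + mantissa_bits));
    }
    if (exponent == 31u) {
        // Inf when the mantissa is zero, NaN otherwise; OR the mantissa into an
        // all-ones float32 exponent field, mirroring the SPIR-V helper.
        return std::bit_cast<float>(mantissa | 0x7f800000u);
    }
    // Normal value: implicit leading one, exponent re-biased from 15.
    return std::ldexp(1.f + float(mantissa) / float(1u << mantissa_bits), int(exponent) - 15);
}

// Example: DecodeUfloatM5(15u << 6, 6) == 1.0f for an 11-bit (6-mantissa-bit) component.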