From e0c930f2d801e0d2998202760b785a76f9346ecd Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Tue, 10 Jun 2025 18:57:16 +0300 Subject: [PATCH 1/7] shader_recompiler: Cleanup fragment attribute handling (#3076) * image: Take minimum of mip levels Avoids validation error * texture_cache: Update depth target image Avoids using undefined depth target in rendering * shader_recompiler: Cleanup fragment attribute handling --- .../backend/spirv/spirv_emit_context.cpp | 28 ++++++++++--------- .../translate/vector_interpolation.cpp | 12 ++++---- src/video_core/texture_cache/image.cpp | 3 +- .../texture_cache/texture_cache.cpp | 2 +- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 672856397..c47a75739 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -299,8 +299,7 @@ void EmitContext::DefineInterpolatedAttribs() { // Iterate all input attributes, load them and manually interpolate. for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) { const auto& input = runtime_info.fs_info.inputs[i]; - const u32 semantic = input.param_index; - auto& params = input_params[semantic]; + auto& params = input_params[i]; if (input.is_flat || params.is_loaded) { continue; } @@ -318,7 +317,7 @@ void EmitContext::DefineInterpolatedAttribs() { const Id p10_y{OpVectorTimesScalar(F32[4], p10, bary_coord_y)}; const Id p20_z{OpVectorTimesScalar(F32[4], p20, bary_coord_z)}; params.id = OpFAdd(F32[4], p0, OpFAdd(F32[4], p10_y, p20_z)); - Name(params.id, fmt::format("fs_in_attr{}", semantic)); + Name(params.id, fmt::format("fs_in_attr{}", i)); params.is_loaded = true; } } @@ -427,25 +426,28 @@ void EmitContext::DefineInputs() { } for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) { const auto& input = runtime_info.fs_info.inputs[i]; - const u32 semantic = input.param_index; - ASSERT(semantic < IR::NumParams); if (input.IsDefault()) { - input_params[semantic] = { - MakeDefaultValue(*this, input.default_value), input_f32, F32[1], 4, false, true, + input_params[i] = { + .id = MakeDefaultValue(*this, input.default_value), + .pointer_type = input_f32, + .component_type = F32[1], + .num_components = 4, + .is_integer = false, + .is_loaded = true, }; continue; } - const IR::Attribute param{IR::Attribute::Param0 + input.param_index}; + const IR::Attribute param{IR::Attribute::Param0 + i}; const u32 num_components = info.loads.NumComponents(param); const Id type{F32[num_components]}; Id attr_id{}; if (profile.needs_manual_interpolation && !input.is_flat) { - attr_id = DefineInput(TypeArray(type, ConstU32(3U)), semantic); + attr_id = DefineInput(TypeArray(type, ConstU32(3U)), input.param_index); Decorate(attr_id, spv::Decoration::PerVertexKHR); - Name(attr_id, fmt::format("fs_in_attr{}_p", semantic)); + Name(attr_id, fmt::format("fs_in_attr{}_p", i)); } else { - attr_id = DefineInput(type, semantic); - Name(attr_id, fmt::format("fs_in_attr{}", semantic)); + attr_id = DefineInput(type, input.param_index); + Name(attr_id, fmt::format("fs_in_attr{}", i)); if (input.is_flat) { Decorate(attr_id, spv::Decoration::Flat); @@ -453,7 +455,7 @@ void EmitContext::DefineInputs() { Decorate(attr_id, spv::Decoration::NoPerspective); } } - input_params[semantic] = + input_params[i] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components, false); } break; diff --git 
a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp index 2d7297c12..5a287dbe2 100644 --- a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp +++ b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp @@ -22,15 +22,17 @@ void Translator::EmitVectorInterpolation(const GcnInst& inst) { // VINTRP void Translator::V_INTERP_P2_F32(const GcnInst& inst) { - const auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); - info.interp_qualifiers[attr.param_index] = vgpr_to_interp[inst.src[0].code]; - const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; + const u32 attr_index = inst.control.vintrp.attr; + const auto& attr = runtime_info.fs_info.inputs.at(attr_index); + info.interp_qualifiers[attr_index] = vgpr_to_interp[inst.src[0].code]; + const IR::Attribute attrib{IR::Attribute::Param0 + attr_index}; SetDst(inst.dst[0], ir.GetAttribute(attrib, inst.control.vintrp.chan)); } void Translator::V_INTERP_MOV_F32(const GcnInst& inst) { - const auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); - const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; + const u32 attr_index = inst.control.vintrp.attr; + const auto& attr = runtime_info.fs_info.inputs.at(attr_index); + const IR::Attribute attrib{IR::Attribute::Param0 + attr_index}; SetDst(inst.dst[0], ir.GetAttribute(attrib, inst.control.vintrp.chan)); } diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index d8070da61..6241100a0 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -319,7 +319,8 @@ void Image::CopyImage(const Image& image) { auto cmdbuf = scheduler->CommandBuffer(); boost::container::small_vector image_copy{}; - for (u32 m = 0; m < image.info.resources.levels; ++m) { + const u32 num_mips = std::min(image.info.resources.levels, info.resources.levels); + for (u32 m = 0; m < num_mips; ++m) { const auto mip_w = std::max(image.info.size.width >> m, 1u); const auto mip_h = std::max(image.info.size.height >> m, 1u); const auto mip_d = std::max(image.info.size.depth >> m, 1u); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index f070b9132..cc244eb6b 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -461,9 +461,9 @@ ImageView& TextureCache::FindDepthTarget(BaseDesc& desc) { const ImageId image_id = FindImage(desc); Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; - image.flags &= ~ImageFlagBits::Dirty; image.usage.depth_target = 1u; image.usage.stencil = image.info.HasStencil(); + UpdateImage(image_id); // Register meta data for this depth buffer if (!(image.flags & ImageFlagBits::MetaRegistered)) { From 9981c8df03dcaf00d3e3d6b59f731961424397d5 Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Tue, 10 Jun 2025 21:30:45 +0200 Subject: [PATCH 2/7] Add option to ignore game patch (#3039) * impl * fix * cleanup * more * clang + * why --- src/core/file_sys/fs.cpp | 6 ++++-- src/core/file_sys/fs.h | 1 + src/emulator.cpp | 2 +- src/main.cpp | 26 +++++++++++++++----------- src/qt_gui/main.cpp | 32 ++++++++++++++++++-------------- 5 files changed, 39 insertions(+), 28 deletions(-) diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 4dad44874..b237ab7d9 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp 
@@ -10,6 +10,8 @@ namespace Core::FileSys { +bool MntPoints::ignore_game_patches = false; + std::string RemoveTrailingSlashes(const std::string& path) { // Remove trailing slashes to make comparisons simpler. std::string path_sanitized = path; @@ -77,7 +79,7 @@ std::filesystem::path MntPoints::GetHostPath(std::string_view path, bool* is_rea patch_path /= rel_path; if ((corrected_path.starts_with("/app0") || corrected_path.starts_with("/hostapp")) && - !force_base_path && std::filesystem::exists(patch_path)) { + !force_base_path && !ignore_game_patches && std::filesystem::exists(patch_path)) { return patch_path; } @@ -137,7 +139,7 @@ std::filesystem::path MntPoints::GetHostPath(std::string_view path, bool* is_rea return std::optional(current_path); }; - if (!force_base_path) { + if (!force_base_path && !ignore_game_patches) { if (const auto path = search(patch_path)) { return *path; } diff --git a/src/core/file_sys/fs.h b/src/core/file_sys/fs.h index 6638b48e8..4a2aa56c1 100644 --- a/src/core/file_sys/fs.h +++ b/src/core/file_sys/fs.h @@ -21,6 +21,7 @@ class MntPoints { static constexpr bool NeedsCaseInsensitiveSearch = true; #endif public: + static bool ignore_game_patches; struct MntPair { std::filesystem::path host_path; std::string mount; // e.g /app0 diff --git a/src/emulator.cpp b/src/emulator.cpp index bb50b8686..f50147818 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -75,7 +75,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector ar game_folder_name.ends_with("-UPDATE") || game_folder_name.ends_with("-patch")) { // If an executable was launched from a separate update directory, // use the base game directory as the game folder. - const auto base_name = game_folder_name.substr(0, game_folder_name.size() - 7); + const std::string base_name = game_folder_name.substr(0, game_folder_name.rfind('-')); const auto base_path = game_folder.parent_path() / base_name; if (std::filesystem::is_directory(base_path)) { game_folder = base_path; diff --git a/src/main.cpp b/src/main.cpp index 85581774b..8a251c55a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -35,17 +35,19 @@ int main(int argc, char* argv[]) { std::unordered_map> arg_map = { {"-h", [&](int&) { - std::cout << "Usage: shadps4 [options] \n" - "Options:\n" - " -g, --game Specify game path to launch\n" - " -- ... Parameters passed to the game ELF. " - "Needs to be at the end of the line, and everything after \"--\" is a " - "game argument.\n" - " -p, --patch Apply specified patch file\n" - " -f, --fullscreen Specify window initial fullscreen " - "state. Does not overwrite the config file.\n" - " --add-game-folder Adds a new game folder to the config.\n" - " -h, --help Display this help message\n"; + std::cout + << "Usage: shadps4 [options] \n" + "Options:\n" + " -g, --game Specify game path to launch\n" + " -- ... Parameters passed to the game ELF. " + "Needs to be at the end of the line, and everything after \"--\" is a " + "game argument.\n" + " -p, --patch Apply specified patch file\n" + " -i, --ignore-game-patch Disable automatic loading of game patch\n" + " -f, --fullscreen Specify window initial fullscreen " + "state. 
Does not overwrite the config file.\n" + " --add-game-folder Adds a new game folder to the config.\n" + " -h, --help Display this help message\n"; exit(0); }}, {"--help", [&](int& i) { arg_map["-h"](i); }}, @@ -72,6 +74,8 @@ int main(int argc, char* argv[]) { } }}, {"--patch", [&](int& i) { arg_map["-p"](i); }}, + {"-i", [&](int&) { Core::FileSys::MntPoints::ignore_game_patches = true; }}, + {"--ignore-game-patch", [&](int& i) { arg_map["-i"](i); }}, {"-f", [&](int& i) { if (++i >= argc) { diff --git a/src/qt_gui/main.cpp b/src/qt_gui/main.cpp index bd9dca6ce..b7de517e8 100644 --- a/src/qt_gui/main.cpp +++ b/src/qt_gui/main.cpp @@ -41,20 +41,22 @@ int main(int argc, char* argv[]) { std::unordered_map> arg_map = { {"-h", [&](int&) { - std::cout << "Usage: shadps4 [options]\n" - "Options:\n" - " No arguments: Opens the GUI.\n" - " -g, --game Specify or " - " to launch\n" - " -- ... Parameters passed to the game ELF. " - "Needs to be at the end of the line, and everything after \"--\" is a " - "game argument.\n" - " -p, --patch Apply specified patch file\n" - " -s, --show-gui Show the GUI\n" - " -f, --fullscreen Specify window initial fullscreen " - "state. Does not overwrite the config file.\n" - " --add-game-folder Adds a new game folder to the config.\n" - " -h, --help Display this help message\n"; + std::cout + << "Usage: shadps4 [options]\n" + "Options:\n" + " No arguments: Opens the GUI.\n" + " -g, --game Specify or " + " to launch\n" + " -- ... Parameters passed to the game ELF. " + "Needs to be at the end of the line, and everything after \"--\" is a " + "game argument.\n" + " -p, --patch Apply specified patch file\n" + " -i, --ignore-game-patch Disable automatic loading of game patch\n" + " -s, --show-gui Show the GUI\n" + " -f, --fullscreen Specify window initial fullscreen " + "state. Does not overwrite the config file.\n" + " --add-game-folder Adds a new game folder to the config.\n" + " -h, --help Display this help message\n"; exit(0); }}, {"--help", [&](int& i) { arg_map["-h"](i); }}, // Redirect --help to -h @@ -84,6 +86,8 @@ int main(int argc, char* argv[]) { } }}, {"--patch", [&](int& i) { arg_map["-p"](i); }}, + {"-i", [&](int&) { Core::FileSys::MntPoints::ignore_game_patches = true; }}, + {"--ignore-game-patch", [&](int& i) { arg_map["-i"](i); }}, {"-f", [&](int& i) { if (++i >= argc) { From b49340dff8e28abcf96fe07ad0e90c4dda0bcaf2 Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:22:50 -0500 Subject: [PATCH 3/7] libSceVideodec2: Update structs to match newer firmwares (#3077) * Update file_system.cpp * libSceVideodec2 struct fixes Our code was based on an old version of the libSceVideodec2 library. Based on what I've decompiled, these structs changed somewhere around firmware 6.50, and newer versions of the library have these flexible checks to accommodate both variants of the structs. * Static assert for AvcPictureInfo struct All the other Videodec2 structs have static asserts, might as well use one here too. * Initialize new values Set proper values for frameFormat and framePitchInBytes. `frame->linesize[0]` appears to be in bytes already, I'm not sure if that means framePitch is being set wrong though. 
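* Editor's note: why the OR-based size checks below accept both revisions

A minimal sketch (not from the SDK headers): OR-ing in the size delta maps the
old sizeof onto the new one, so 0x30 | 8 == 0x38 == 0x38 | 8 for
OrbisVideodec2OutputInfo, while unrelated sizes still fail. The same trick with
| 16 against the new 0x78 picture-info size implies an old layout of 0x68. The
helper name here is hypothetical; the sizes are the ones asserted in this patch.

#include <cstdint>

constexpr uint64_t kNewOutputInfoSize = 0x38; // sizeof(OrbisVideodec2OutputInfo) after this patch
constexpr uint64_t kOldOutputInfoSize = 0x30; // pre-6.50 layout, without the two new u32 fields

// Hypothetical helper mirroring the patched check:
//   (outputInfo->thisSize | 8) != sizeof(OrbisVideodec2OutputInfo)
constexpr bool IsValidOutputInfoSize(uint64_t this_size) {
    return (this_size | 8) == kNewOutputInfoSize;
}

static_assert(IsValidOutputInfoSize(kOldOutputInfoSize)); // 0x30 | 8 == 0x38
static_assert(IsValidOutputInfoSize(kNewOutputInfoSize)); // 0x38 | 8 == 0x38
static_assert(!IsValidOutputInfoSize(0x28));              // other sizes are still rejected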
--- src/core/libraries/kernel/file_system.cpp | 1 + src/core/libraries/videodec/videodec2.cpp | 6 +++--- src/core/libraries/videodec/videodec2.h | 4 +++- src/core/libraries/videodec/videodec2_avc.h | 17 +++++++++++++++++ src/core/libraries/videodec/videodec2_impl.cpp | 4 ++++ 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/core/libraries/kernel/file_system.cpp b/src/core/libraries/kernel/file_system.cpp index ad372325c..fecc606fd 100644 --- a/src/core/libraries/kernel/file_system.cpp +++ b/src/core/libraries/kernel/file_system.cpp @@ -1050,6 +1050,7 @@ void RegisterFileSystem(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("4wSze92BhLI", "libkernel", 1, "libkernel", 1, 1, sceKernelWrite); LIB_FUNCTION("+WRlkKjZvag", "libkernel", 1, "libkernel", 1, 1, readv); LIB_FUNCTION("YSHRBRLn2pI", "libkernel", 1, "libkernel", 1, 1, writev); + LIB_FUNCTION("kAt6VDbHmro", "libkernel", 1, "libkernel", 1, 1, sceKernelWritev); LIB_FUNCTION("Oy6IpwgtYOk", "libScePosix", 1, "libkernel", 1, 1, posix_lseek); LIB_FUNCTION("Oy6IpwgtYOk", "libkernel", 1, "libkernel", 1, 1, posix_lseek); LIB_FUNCTION("oib76F-12fk", "libkernel", 1, "libkernel", 1, 1, sceKernelLseek); diff --git a/src/core/libraries/videodec/videodec2.cpp b/src/core/libraries/videodec/videodec2.cpp index 4f9379151..1c6044fe2 100644 --- a/src/core/libraries/videodec/videodec2.cpp +++ b/src/core/libraries/videodec/videodec2.cpp @@ -140,7 +140,7 @@ s32 PS4_SYSV_ABI sceVideodec2Flush(OrbisVideodec2Decoder decoder, return ORBIS_VIDEODEC2_ERROR_ARGUMENT_POINTER; } if (frameBuffer->thisSize != sizeof(OrbisVideodec2FrameBuffer) || - outputInfo->thisSize != sizeof(OrbisVideodec2OutputInfo)) { + (outputInfo->thisSize | 8) != sizeof(OrbisVideodec2OutputInfo)) { LOG_ERROR(Lib_Vdec2, "Invalid struct size"); return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE; } @@ -167,7 +167,7 @@ s32 PS4_SYSV_ABI sceVideodec2GetPictureInfo(const OrbisVideodec2OutputInfo* outp LOG_ERROR(Lib_Vdec2, "Invalid arguments"); return ORBIS_VIDEODEC2_ERROR_ARGUMENT_POINTER; } - if (outputInfo->thisSize != sizeof(OrbisVideodec2OutputInfo)) { + if ((outputInfo->thisSize | 8) != sizeof(OrbisVideodec2OutputInfo)) { LOG_ERROR(Lib_Vdec2, "Invalid struct size"); return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE; } @@ -179,7 +179,7 @@ s32 PS4_SYSV_ABI sceVideodec2GetPictureInfo(const OrbisVideodec2OutputInfo* outp if (p1stPictureInfoOut) { OrbisVideodec2AvcPictureInfo* picInfo = static_cast(p1stPictureInfoOut); - if (picInfo->thisSize != sizeof(OrbisVideodec2AvcPictureInfo)) { + if ((picInfo->thisSize | 16) != sizeof(OrbisVideodec2AvcPictureInfo)) { LOG_ERROR(Lib_Vdec2, "Invalid struct size"); return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE; } diff --git a/src/core/libraries/videodec/videodec2.h b/src/core/libraries/videodec/videodec2.h index abc8f8ab5..410ee8ea6 100644 --- a/src/core/libraries/videodec/videodec2.h +++ b/src/core/libraries/videodec/videodec2.h @@ -73,8 +73,10 @@ struct OrbisVideodec2OutputInfo { u32 frameHeight; void* frameBuffer; u64 frameBufferSize; + u32 frameFormat; + u32 framePitchInBytes; }; -static_assert(sizeof(OrbisVideodec2OutputInfo) == 0x30); +static_assert(sizeof(OrbisVideodec2OutputInfo) == 0x38); struct OrbisVideodec2FrameBuffer { u64 thisSize; diff --git a/src/core/libraries/videodec/videodec2_avc.h b/src/core/libraries/videodec/videodec2_avc.h index 22293ee93..1975209cb 100644 --- a/src/core/libraries/videodec/videodec2_avc.h +++ b/src/core/libraries/videodec/videodec2_avc.h @@ -55,6 +55,23 @@ struct OrbisVideodec2AvcPictureInfo { u8 pic_struct; u8 field_pic_flag; u8 
bottom_field_flag; + + u8 sequenceParameterSetPresentFlag; + u8 pictureParameterSetPresentFlag; + u8 auDelimiterPresentFlag; + u8 endOfSequencePresentFlag; + u8 endOfStreamPresentFlag; + u8 fillerDataPresentFlag; + u8 pictureTimingSeiPresentFlag; + u8 bufferingPeriodSeiPresentFlag; + + u8 constraint_set0_flag; + u8 constraint_set1_flag; + u8 constraint_set2_flag; + u8 constraint_set3_flag; + u8 constraint_set4_flag; + u8 constraint_set5_flag; }; +static_assert(sizeof(OrbisVideodec2AvcPictureInfo) == 0x78); } // namespace Libraries::Vdec2 \ No newline at end of file diff --git a/src/core/libraries/videodec/videodec2_impl.cpp b/src/core/libraries/videodec/videodec2_impl.cpp index 22b17c86c..a643239a3 100644 --- a/src/core/libraries/videodec/videodec2_impl.cpp +++ b/src/core/libraries/videodec/videodec2_impl.cpp @@ -48,6 +48,7 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData, outputInfo.isValid = false; outputInfo.isErrorFrame = true; outputInfo.pictureCount = 0; + outputInfo.frameFormat = 0; if (!inputData.auData) { return ORBIS_VIDEODEC2_ERROR_ACCESS_UNIT_POINTER; @@ -106,6 +107,7 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData, outputInfo.frameWidth = frame->width; outputInfo.frameHeight = frame->height; outputInfo.framePitch = frame->linesize[0]; + outputInfo.framePitchInBytes = frame->linesize[0]; outputInfo.frameBufferSize = frameBuffer.frameBufferSize; outputInfo.frameBuffer = frameBuffer.frameBuffer; @@ -144,6 +146,7 @@ s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer, outputInfo.isValid = false; outputInfo.isErrorFrame = true; outputInfo.pictureCount = 0; + outputInfo.frameFormat = 0; AVFrame* frame = av_frame_alloc(); if (!frame) { @@ -175,6 +178,7 @@ s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer, outputInfo.frameWidth = frame->width; outputInfo.frameHeight = frame->height; outputInfo.framePitch = frame->linesize[0]; + outputInfo.framePitchInBytes = frame->linesize[0]; outputInfo.frameBufferSize = frameBuffer.frameBufferSize; outputInfo.frameBuffer = frameBuffer.frameBuffer; From ca92e72efe6a041ce27c5e7473b62abf99e8f4c2 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:41:58 -0700 Subject: [PATCH 4/7] shader_recompiler: Various fixes to shared memory and atomics. (#3075) * shader_recompiler: Various fixes to shared memory and atomics. * shader_recompiler: Re-type non-32bit load/stores. 
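* Editor's note: the two bounds-check shapes this patch standardizes

A plain C++ sketch of the pattern (illustrative names only, not codebase APIs):
loads select the result against zero so an out-of-bounds read returns 0, while
stores and atomics are wrapped in a conditional branch so the access never
executes out of bounds; N-component accesses compare index + N - 1 instead.

#include <cstdint>

// Load shape: the SPIR-V backend loads, then OpSelects against zero; the
// ternary stands in for that select here.
uint32_t LoadBoundsChecked(const uint32_t* data, uint32_t index, uint32_t size_dwords) {
    return index < size_dwords ? data[index] : 0u;
}

// Store/atomic shape: OpSelectionMerge + OpBranchConditional, i.e. a plain if,
// so the operation is skipped entirely when the index is out of bounds.
void StoreBoundsChecked(uint32_t* data, uint32_t index, uint32_t size_dwords, uint32_t value) {
    if (index < size_dwords) {
        data[index] = value;
    }
}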
--- .../backend/spirv/emit_spirv_atomic.cpp | 63 ++++++---- .../backend/spirv/emit_spirv_bounds.h | 66 ++++++++-- .../spirv/emit_spirv_context_get_set.cpp | 118 +++++++++--------- .../backend/spirv/emit_spirv_convert.cpp | 8 ++ .../backend/spirv/emit_spirv_instructions.h | 13 +- .../frontend/translate/data_share.cpp | 76 +++++------ .../frontend/translate/vector_memory.cpp | 4 +- src/shader_recompiler/ir/ir_emitter.cpp | 64 +++++++--- src/shader_recompiler/ir/ir_emitter.h | 26 ++-- src/shader_recompiler/ir/microinstruction.cpp | 11 +- src/shader_recompiler/ir/opcodes.inc | 25 ++-- .../ir/passes/hull_shader_transform.cpp | 12 +- .../ir/passes/lower_buffer_format_to_raw.cpp | 16 +-- .../ir/passes/resource_tracking_pass.cpp | 9 ++ .../ir/passes/shared_memory_barrier_pass.cpp | 6 +- .../passes/shared_memory_to_storage_pass.cpp | 100 ++++++++++----- src/shader_recompiler/ir/value.h | 1 + 17 files changed, 391 insertions(+), 227 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index 13fd8e180..47290e7e8 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -27,6 +27,19 @@ Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value, }); } +Id SharedAtomicU32IncDec(EmitContext& ctx, Id offset, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { + const Id shift_id{ctx.ConstU32(2U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); + }); +} + Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const Id shift_id{ctx.ConstU32(3U)}; @@ -40,19 +53,6 @@ Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, }); } -Id SharedAtomicU32_IncDec(EmitContext& ctx, Id offset, - Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { - const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; - const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; - const Id pointer{ - ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; - const auto [scope, semantics]{AtomicArgs(ctx)}; - return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { - return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); - }); -} - Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& buffer = ctx.buffers[handle]; @@ -68,6 +68,21 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id }); } +Id BufferAtomicU32IncDec(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { + const auto& buffer = ctx.buffers[handle]; + if (Sirit::ValidId(buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + } + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); + const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; + const 
Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { + return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics); + }); +} + Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id cmp_value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id, Id, Id)) { @@ -156,12 +171,12 @@ Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicISub); } -Id EmitSharedAtomicIIncrement32(EmitContext& ctx, Id offset) { - return SharedAtomicU32_IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); +Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset) { + return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); } -Id EmitSharedAtomicIDecrement32(EmitContext& ctx, Id offset) { - return SharedAtomicU32_IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); +Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset) { + return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); } Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -172,6 +187,10 @@ Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); } +Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicISub); +} + Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin); } @@ -188,14 +207,12 @@ Id EmitBufferAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicUMax); } -Id EmitBufferAtomicInc32(EmitContext&, IR::Inst*, u32, Id, Id) { - // TODO - UNREACHABLE_MSG("Unsupported BUFFER_ATOMIC opcode: ", IR::Opcode::BufferAtomicInc32); +Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return BufferAtomicU32IncDec(ctx, inst, handle, address, &Sirit::Module::OpAtomicIIncrement); } -Id EmitBufferAtomicDec32(EmitContext&, IR::Inst*, u32, Id, Id) { - // TODO - UNREACHABLE_MSG("Unsupported BUFFER_ATOMIC opcode: ", IR::Opcode::BufferAtomicDec32); +Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return BufferAtomicU32IncDec(ctx, inst, handle, address, &Sirit::Module::OpAtomicIDecrement); } Id EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h index 41e70c8c3..e66467c6b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h @@ -1,31 +1,54 @@ // SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#pragma once + #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { -template -auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - Id zero_value{}; +template 
+std::tuple ResolveTypeAndZero(EmitContext& ctx) { Id result_type{}; - if constexpr (bit_size == 64) { - zero_value = ctx.u64_zero_value; + Id zero_value{}; + if constexpr (bit_size == 64 && num_components == 1 && !is_float) { result_type = ctx.U64; + zero_value = ctx.u64_zero_value; } else if constexpr (bit_size == 32) { - zero_value = ctx.u32_zero_value; - result_type = ctx.U32[1]; - } else if constexpr (bit_size == 16) { - zero_value = ctx.u16_zero_value; + if (is_float) { + result_type = ctx.F32[num_components]; + zero_value = ctx.f32_zero_value; + } else { + result_type = ctx.U32[num_components]; + zero_value = ctx.u32_zero_value; + } + } else if constexpr (bit_size == 16 && num_components == 1 && !is_float) { result_type = ctx.U16; + zero_value = ctx.u16_zero_value; + } else if constexpr (bit_size == 8 && num_components == 1 && !is_float) { + result_type = ctx.U8; + zero_value = ctx.u8_zero_value; } else { - static_assert(false, "type not supported"); + static_assert(false, "Type not supported."); } + if (num_components > 1) { + std::array zero_ids; + zero_ids.fill(zero_value); + zero_value = ctx.ConstantComposite(result_type, zero_ids); + } + return {result_type, zero_value}; +} + +template +auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { if (Sirit::ValidId(buffer_size)) { // Bounds checking enabled, wrap in a conditional branch to make sure that // the atomic is not mistakenly executed when the index is out of bounds. - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size); + auto compare_index = index; + if (num_components > 1) { + compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1)); + } + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); const Id ib_label = ctx.OpLabel(); const Id end_label = ctx.OpLabel(); ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); @@ -36,6 +59,8 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun ctx.OpBranch(end_label); ctx.AddLabel(end_label); if (Sirit::ValidId(ib_result)) { + const auto [result_type, zero_value] = + ResolveTypeAndZero(ctx); return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, last_label); } else { return Id{0}; @@ -45,4 +70,21 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun return emit_func(); } +template +static Id LoadAccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result) { + if (Sirit::ValidId(buffer_size)) { + // Bounds checking enabled, wrap in a select. + auto compare_index = index; + if (num_components > 1) { + compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1)); + } + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); + const auto [result_type, zero_value] = + ResolveTypeAndZero(ctx); + return ctx.OpSelect(result_type, in_bounds, result, zero_value); + } + // Bounds checking not enabled, just return the plain value. 
+ return result; +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 658d4759f..ccbe54d0a 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -11,6 +11,8 @@ #include +#include "emit_spirv_bounds.h" + namespace Shader::Backend::SPIRV { namespace { @@ -239,8 +241,8 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { } if (IR::IsParam(attr)) { - const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; - const auto& param{ctx.input_params.at(index)}; + const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)}; + const auto& param{ctx.input_params.at(param_index)}; if (param.buffer_handle >= 0) { const auto step_rate = EmitReadStepRate(ctx, param.id.value); const auto offset = ctx.OpIAdd( @@ -415,27 +417,6 @@ void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value) { ctx.OpStore(pointer, value); } -template -static Id EmitLoadBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result, - bool is_float) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a select. - const auto result_type = is_float ? ctx.F32[N] : ctx.U32[N]; - auto compare_index = index; - auto zero_value = is_float ? ctx.f32_zero_value : ctx.u32_zero_value; - if (N > 1) { - compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1)); - std::array zero_ids; - zero_ids.fill(zero_value); - zero_value = ctx.ConstantComposite(result_type, zero_ids); - } - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); - return ctx.OpSelect(result_type, in_bounds, result, zero_value); - } - // Bounds checking not enabled, just return the plain value. - return result; -} - template static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto flags = inst->Flags(); @@ -454,8 +435,9 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a const Id result_i = ctx.OpLoad(data_types[1], ptr_i); if (!flags.typed) { // Untyped loads have bounds checking per-component. - ids.push_back(EmitLoadBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords, - result_i, alias == PointerType::F32)); + ids.push_back(LoadAccessBoundsCheck < 32, 1, + alias == + PointerType::F32 > (ctx, index_i, spv_buffer.size_dwords, result_i)); } else { ids.push_back(result_i); } @@ -464,8 +446,8 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a const Id result = N == 1 ? ids[0] : ctx.OpCompositeConstruct(data_types[N], ids); if (flags.typed) { // Typed loads have single bounds check for the whole load. 
- return EmitLoadBufferBoundsCheck(ctx, index, spv_buffer.size_dwords, result, - alias == PointerType::F32); + return LoadAccessBoundsCheck < 32, N, + alias == PointerType::F32 > (ctx, index, spv_buffer.size_dwords, result); } return result; } @@ -477,8 +459,8 @@ Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { } const auto [id, pointer_type] = spv_buffer[PointerType::U8]; const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; - const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr))}; - return EmitLoadBufferBoundsCheck<1>(ctx, address, spv_buffer.size, result, false); + const Id result{ctx.OpLoad(ctx.U8, ptr)}; + return LoadAccessBoundsCheck<8>(ctx, address, spv_buffer.size, result); } Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -489,8 +471,8 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto [id, pointer_type] = spv_buffer[PointerType::U16]; const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr))}; - return EmitLoadBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts, result, false); + const Id result{ctx.OpLoad(ctx.U16, ptr)}; + return LoadAccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, result); } Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -509,6 +491,18 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) return EmitLoadBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address); } +Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + const auto& spv_buffer = ctx.buffers[handle]; + if (Sirit::ValidId(spv_buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + } + const auto [id, pointer_type] = spv_buffer[PointerType::U64]; + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; + const Id result{ctx.OpLoad(ctx.U64, ptr)}; + return LoadAccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, result); +} + Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { return EmitLoadBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address); } @@ -529,29 +523,6 @@ Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addr UNREACHABLE_MSG("SPIR-V instruction"); } -template -void EmitStoreBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a conditional branch. - auto compare_index = index; - if (N > 1) { - compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1)); - } - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); - const Id in_bounds_label = ctx.OpLabel(); - const Id merge_label = ctx.OpLabel(); - ctx.OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone); - ctx.OpBranchConditional(in_bounds, in_bounds_label, merge_label); - ctx.AddLabel(in_bounds_label); - emit_func(); - ctx.OpBranch(merge_label); - ctx.AddLabel(merge_label); - return; - } - // Bounds checking not enabled, just perform the store. 
- emit_func(); -} - template static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -569,19 +540,25 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I const Id index_i = i == 0 ? index : ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); const Id ptr_i = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i); const Id value_i = N == 1 ? value : ctx.OpCompositeExtract(data_types[1], value, i); - auto store_i = [&]() { ctx.OpStore(ptr_i, value_i); }; + auto store_i = [&] { + ctx.OpStore(ptr_i, value_i); + return Id{}; + }; if (!flags.typed) { // Untyped stores have bounds checking per-component. - EmitStoreBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords, store_i); + AccessBoundsCheck<32, 1, alias == PointerType::F32>( + ctx, index_i, spv_buffer.size_dwords, store_i); } else { store_i(); } } + return Id{}; }; if (flags.typed) { // Typed stores have single bounds check for the whole store. - EmitStoreBufferBoundsCheck(ctx, index, spv_buffer.size_dwords, store); + AccessBoundsCheck<32, N, alias == PointerType::F32>(ctx, index, spv_buffer.size_dwords, + store); } else { store(); } @@ -594,8 +571,10 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v } const auto [id, pointer_type] = spv_buffer[PointerType::U8]; const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; - const Id result{ctx.OpUConvert(ctx.U8, value)}; - EmitStoreBufferBoundsCheck<1>(ctx, address, spv_buffer.size, [&] { ctx.OpStore(ptr, result); }); + AccessBoundsCheck<8>(ctx, address, spv_buffer.size, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); } void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { @@ -606,9 +585,10 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id const auto [id, pointer_type] = spv_buffer[PointerType::U16]; const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpUConvert(ctx.U16, value)}; - EmitStoreBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts, - [&] { ctx.OpStore(ptr, result); }); + AccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); } void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -627,6 +607,20 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre EmitStoreBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address, value); } +void EmitStoreBufferU64(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { + const auto& spv_buffer = ctx.buffers[handle]; + if (Sirit::ValidId(spv_buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + } + const auto [id, pointer_type] = spv_buffer[PointerType::U64]; + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; + AccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); +} + void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { EmitStoreBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address, value); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp 
b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp index 945fa6877..c75f43393 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp @@ -263,4 +263,12 @@ Id EmitConvertU32U16(EmitContext& ctx, Id value) { return ctx.OpUConvert(ctx.U32[1], value); } +Id EmitConvertU8U32(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U8, value); +} + +Id EmitConvertU32U8(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U32[1], value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 3441c5a23..daf1b973e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -69,6 +69,7 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); @@ -80,6 +81,7 @@ void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -87,12 +89,13 @@ void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); -Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); -Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id 
EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -136,8 +139,8 @@ Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value); -Id EmitSharedAtomicIIncrement32(EmitContext& ctx, Id offset); -Id EmitSharedAtomicIDecrement32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset); Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value); Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); @@ -461,6 +464,8 @@ Id EmitConvertF64U32(EmitContext& ctx, Id value); Id EmitConvertF64U64(EmitContext& ctx, Id value); Id EmitConvertU16U32(EmitContext& ctx, Id value); Id EmitConvertU32U16(EmitContext& ctx, Id value); +Id EmitConvertU8U32(EmitContext& ctx, Id value); +Id EmitConvertU32U8(EmitContext& ctx, Id value); Id EmitImageSampleRaw(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address1, Id address2, Id address3, Id address4); diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 4b6a58fd0..8ead93f78 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -216,34 +216,38 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid if (is_pair) { const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 
64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); - if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data0), addr0); - } else { + if (bit_size == 64) { ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1))), addr0); + } else if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + } else if (bit_size == 16) { + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); - if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data1), addr1); - } else { + if (bit_size == 64) { ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1))), addr1); + } else if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + } else if (bit_size == 16) { + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1); } - } else if (bit_size == 64) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = - ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); - } else if (bit_size == 16) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + if (bit_size == 64) { + const IR::Value data = + ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); + ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); + } else if (bit_size == 32) { + ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + } else if (bit_size == 16) { + ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); + } } } @@ -264,7 +268,7 @@ void Translator::DS_INC_U32(const GcnInst& inst, bool rtn) { const IR::U32 offset = ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIIncrement(addr_offset); + const IR::Value original_val = ir.SharedAtomicInc(addr_offset); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); } @@ -275,7 +279,7 @@ void Translator::DS_DEC_U32(const GcnInst& inst, bool rtn) { const IR::U32 offset = ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIDecrement(addr_offset); + const IR::Value original_val = ir.SharedAtomicDec(addr_offset); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); } @@ -309,36 +313,38 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 
64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data0}); - } else { + if (bit_size == 64) { const auto vector = ir.UnpackUint2x32(IR::U64{data0}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data0}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); - if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data1}); - } else { + if (bit_size == 64) { const auto vector = ir.UnpackUint2x32(IR::U64{data1}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data1}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); } - } else if (bit_size == 64) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - const auto vector = ir.UnpackUint2x32(IR::U64{data}); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 16) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::U16 data = IR::U16{ir.LoadShared(bit_size, is_signed, addr0)}; - ir.SetVectorReg(dst_reg, ir.UConvert(32, data)); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)}; - ir.SetVectorReg(dst_reg, data); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data}); + ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg, IR::U32{data}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); + } } } diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 5eb2079a4..54e8b8ee8 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -354,9 +354,9 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) { case AtomicOp::Xor: return ir.BufferAtomicXor(handle, address, vdata_val, buffer_info); case AtomicOp::Inc: - return ir.BufferAtomicInc(handle, address, vdata_val, buffer_info); + return ir.BufferAtomicInc(handle, address, buffer_info); case AtomicOp::Dec: - return ir.BufferAtomicDec(handle, address, vdata_val, buffer_info); + return ir.BufferAtomicDec(handle, address, buffer_info); default: UNREACHABLE(); } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 2c37c8099..3d7cf71dc 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -353,12 +353,12 @@ U32 
IREmitter::SharedAtomicXor(const U32& address, const U32& data) { return Inst(Opcode::SharedAtomicXor32, address, data); } -U32 IREmitter::SharedAtomicIIncrement(const U32& address) { - return Inst(Opcode::SharedAtomicIIncrement32, address); +U32 IREmitter::SharedAtomicInc(const U32& address) { + return Inst(Opcode::SharedAtomicInc32, address); } -U32 IREmitter::SharedAtomicIDecrement(const U32& address) { - return Inst(Opcode::SharedAtomicIDecrement32, address); +U32 IREmitter::SharedAtomicDec(const U32& address) { + return Inst(Opcode::SharedAtomicDec32, address); } U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) { @@ -373,12 +373,12 @@ U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) { return Inst(Opcode::ReadConstBuffer, handle, index); } -U32 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) { - return Inst(Opcode::LoadBufferU8, Flags{info}, handle, address); +U8 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU8, Flags{info}, handle, address); } -U32 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) { - return Inst(Opcode::LoadBufferU16, Flags{info}, handle, address); +U16 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU16, Flags{info}, handle, address); } Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& address, @@ -397,6 +397,10 @@ Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& } } +U64 IREmitter::LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU64, Flags{info}, handle, address); +} + Value IREmitter::LoadBufferF32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info) { switch (num_dwords) { @@ -417,12 +421,12 @@ Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, Buf return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address); } -void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U32& data, +void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U8& data, BufferInstInfo info) { Inst(Opcode::StoreBufferU8, Flags{info}, handle, address, data); } -void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U32& data, +void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U16& data, BufferInstInfo info) { Inst(Opcode::StoreBufferU16, Flags{info}, handle, address, data); } @@ -447,6 +451,11 @@ void IREmitter::StoreBufferU32(int num_dwords, const Value& handle, const Value& } } +void IREmitter::StoreBufferU64(const Value& handle, const Value& address, const U64& data, + BufferInstInfo info) { + Inst(Opcode::StoreBufferU64, Flags{info}, handle, address, data); +} + void IREmitter::StoreBufferF32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info) { switch (num_dwords) { @@ -474,7 +483,19 @@ void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, con Value IREmitter::BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info) { - return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value); + switch (value.Type()) { + case Type::U32: + return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value); + 
case Type::U64: + return Inst(Opcode::BufferAtomicIAdd64, Flags{info}, handle, address, value); + default: + ThrowInvalidType(value.Type()); + } +} + +Value IREmitter::BufferAtomicISub(const Value& handle, const Value& address, const Value& value, + BufferInstInfo info) { + return Inst(Opcode::BufferAtomicISub32, Flags{info}, handle, address, value); } Value IREmitter::BufferAtomicIMin(const Value& handle, const Value& address, const Value& value, @@ -489,14 +510,12 @@ Value IREmitter::BufferAtomicIMax(const Value& handle, const Value& address, con : Inst(Opcode::BufferAtomicUMax32, Flags{info}, handle, address, value); } -Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, const Value& value, - BufferInstInfo info) { - return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address, value); +Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address); } -Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, const Value& value, - BufferInstInfo info) { - return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address, value); +Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address); } Value IREmitter::BufferAtomicAnd(const Value& handle, const Value& address, const Value& value, @@ -1804,8 +1823,15 @@ F32F64 IREmitter::ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_s : ConvertUToF(dest_bitsize, src_bitsize, value); } -U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { +U8U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U8U16U32U64& value) { switch (result_bitsize) { + case 8: + switch (value.Type()) { + case Type::U32: + return Inst(Opcode::ConvertU8U32, value); + default: + break; + } case 16: switch (value.Type()) { case Type::U32: @@ -1815,6 +1841,8 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { } case 32: switch (value.Type()) { + case Type::U8: + return Inst(Opcode::ConvertU32U8, value); case Type::U16: return Inst(Opcode::ConvertU32U16, value); default: diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index eae44ed04..215a35ee9 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -100,33 +100,35 @@ public: void WriteShared(int bit_size, const Value& value, const U32& offset); [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data); + [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data); [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed); [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed); + [[nodiscard]] U32 SharedAtomicInc(const U32& address); + [[nodiscard]] U32 SharedAtomicDec(const U32& address); [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data); [[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data); [[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicIIncrement(const U32& address); - [[nodiscard]] U32 SharedAtomicIDecrement(const U32& address); - [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data); - [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] U32 ReadConstBuffer(const Value& 
handle, const U32& index); - [[nodiscard]] U32 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info); - [[nodiscard]] U32 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U8 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U16 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferU32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U64 LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferF32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info); - void StoreBufferU8(const Value& handle, const Value& address, const U32& data, + void StoreBufferU8(const Value& handle, const Value& address, const U8& data, BufferInstInfo info); - void StoreBufferU16(const Value& handle, const Value& address, const U32& data, + void StoreBufferU16(const Value& handle, const Value& address, const U16& data, BufferInstInfo info); void StoreBufferU32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); + void StoreBufferU64(const Value& handle, const Value& address, const U64& data, + BufferInstInfo info); void StoreBufferF32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); void StoreBufferFormat(const Value& handle, const Value& address, const Value& data, @@ -134,14 +136,16 @@ public: [[nodiscard]] Value BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); + [[nodiscard]] Value BufferAtomicISub(const Value& handle, const Value& address, + const Value& value, BufferInstInfo info); [[nodiscard]] Value BufferAtomicIMin(const Value& handle, const Value& address, const Value& value, bool is_signed, BufferInstInfo info); [[nodiscard]] Value BufferAtomicIMax(const Value& handle, const Value& address, const Value& value, bool is_signed, BufferInstInfo info); [[nodiscard]] Value BufferAtomicInc(const Value& handle, const Value& address, - const Value& value, BufferInstInfo info); + BufferInstInfo info); [[nodiscard]] Value BufferAtomicDec(const Value& handle, const Value& address, - const Value& value, BufferInstInfo info); + BufferInstInfo info); [[nodiscard]] Value BufferAtomicAnd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); [[nodiscard]] Value BufferAtomicOr(const Value& handle, const Value& address, @@ -309,7 +313,7 @@ public: [[nodiscard]] F32F64 ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_signed, const Value& value); - [[nodiscard]] U16U32U64 UConvert(size_t result_bitsize, const U16U32U64& value); + [[nodiscard]] U8U16U32U64 UConvert(size_t result_bitsize, const U8U16U32U64& value); [[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value); [[nodiscard]] Value ImageAtomicIAdd(const Value& handle, const Value& coords, diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index a57310fb9..c2311afea 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -60,12 +60,15 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferU32x2: case Opcode::StoreBufferU32x3: case 
Opcode::StoreBufferU32x4: + case Opcode::StoreBufferU64: case Opcode::StoreBufferF32: case Opcode::StoreBufferF32x2: case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: case Opcode::StoreBufferFormatF32: case Opcode::BufferAtomicIAdd32: + case Opcode::BufferAtomicIAdd64: + case Opcode::BufferAtomicISub32: case Opcode::BufferAtomicSMin32: case Opcode::BufferAtomicUMin32: case Opcode::BufferAtomicSMax32: @@ -76,15 +79,21 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::BufferAtomicOr32: case Opcode::BufferAtomicXor32: case Opcode::BufferAtomicSwap32: + case Opcode::BufferAtomicCmpSwap32: case Opcode::DataAppend: case Opcode::DataConsume: - case Opcode::WriteSharedU64: + case Opcode::WriteSharedU16: case Opcode::WriteSharedU32: + case Opcode::WriteSharedU64: case Opcode::SharedAtomicIAdd32: + case Opcode::SharedAtomicIAdd64: + case Opcode::SharedAtomicISub32: case Opcode::SharedAtomicSMin32: case Opcode::SharedAtomicUMin32: case Opcode::SharedAtomicSMax32: case Opcode::SharedAtomicUMax32: + case Opcode::SharedAtomicInc32: + case Opcode::SharedAtomicDec32: case Opcode::SharedAtomicAnd32: case Opcode::SharedAtomicOr32: case Opcode::SharedAtomicXor32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index e96e32297..1621d2acf 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -35,21 +35,21 @@ OPCODE(LoadSharedU32, U32, U32, OPCODE(LoadSharedU64, U64, U32, ) OPCODE(WriteSharedU16, Void, U32, U16, ) OPCODE(WriteSharedU32, Void, U32, U32, ) -OPCODE(WriteSharedU64, Void, U32, U64, ) +OPCODE(WriteSharedU64, Void, U32, U64, ) // Shared atomic operations OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) +OPCODE(SharedAtomicISub32, U32, U32, U32, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) OPCODE(SharedAtomicUMax32, U32, U32, U32, ) +OPCODE(SharedAtomicInc32, U32, U32, ) +OPCODE(SharedAtomicDec32, U32, U32, ) OPCODE(SharedAtomicAnd32, U32, U32, U32, ) OPCODE(SharedAtomicOr32, U32, U32, U32, ) OPCODE(SharedAtomicXor32, U32, U32, U32, ) -OPCODE(SharedAtomicISub32, U32, U32, U32, ) -OPCODE(SharedAtomicIIncrement32, U32, U32, ) -OPCODE(SharedAtomicIDecrement32, U32, U32, ) // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) @@ -94,23 +94,25 @@ OPCODE(UndefU32, U32, OPCODE(UndefU64, U64, ) // Buffer operations -OPCODE(LoadBufferU8, U32, Opaque, Opaque, ) -OPCODE(LoadBufferU16, U32, Opaque, Opaque, ) +OPCODE(LoadBufferU8, U8, Opaque, Opaque, ) +OPCODE(LoadBufferU16, U16, Opaque, Opaque, ) OPCODE(LoadBufferU32, U32, Opaque, Opaque, ) OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, ) OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, ) OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, ) +OPCODE(LoadBufferU64, U64, Opaque, Opaque, ) OPCODE(LoadBufferF32, F32, Opaque, Opaque, ) OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, ) OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, ) OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, ) OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, ) -OPCODE(StoreBufferU8, Void, Opaque, Opaque, U32, ) -OPCODE(StoreBufferU16, Void, Opaque, Opaque, U32, ) +OPCODE(StoreBufferU8, Void, Opaque, Opaque, U8, ) +OPCODE(StoreBufferU16, Void, Opaque, Opaque, U16, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, ) OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, ) OPCODE(StoreBufferU32x4, Void, 
Opaque, Opaque, U32x4, ) +OPCODE(StoreBufferU64, Void, Opaque, Opaque, U64, ) OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, ) OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) @@ -120,12 +122,13 @@ OPCODE(StoreBufferFormatF32, Void, Opaq // Buffer atomic operations OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 ) +OPCODE(BufferAtomicISub32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMax32, U32, Opaque, Opaque, U32 ) -OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, U32, ) -OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, U32, ) +OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, ) +OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, ) OPCODE(BufferAtomicAnd32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicOr32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicXor32, U32, Opaque, Opaque, U32, ) @@ -405,6 +408,8 @@ OPCODE(ConvertF64U32, F64, U32, OPCODE(ConvertF32U16, F32, U16, ) OPCODE(ConvertU16U32, U16, U32, ) OPCODE(ConvertU32U16, U32, U16, ) +OPCODE(ConvertU8U32, U8, U32, ) +OPCODE(ConvertU32U8, U32, U8, ) // Image operations OPCODE(ImageSampleRaw, F32x4, Opaque, F32x4, F32x4, F32x4, F32, ) diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp index 5cf8a1525..156cb6628 100644 --- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp +++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp @@ -438,7 +438,9 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; const IR::U32 addr{inst.Arg(0)}; - const IR::U32 data{inst.Arg(1).Resolve()}; + const IR::Value data = num_dwords == 2 + ? 
ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) + : inst.Arg(1).Resolve(); const auto SetOutput = [&](IR::U32 addr, IR::U32 value, AttributeRegion output_kind, u32 off_dw) { @@ -466,10 +468,10 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info); if (num_dwords == 1) { - SetOutput(addr, data, region, 0); + SetOutput(addr, IR::U32{data}, region, 0); } else { for (auto i = 0; i < num_dwords; i++) { - SetOutput(addr, IR::U32{data.Inst()->Arg(i)}, region, i); + SetOutput(addr, IR::U32{ir.CompositeExtract(data, i)}, region, i); } } inst.Invalidate(); @@ -499,7 +501,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { ReadTessControlPointAttribute(addr, stride, ir, i, is_tcs_output_read); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; @@ -578,7 +580,7 @@ void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { const IR::F32 component = GetInput(addr, i); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; diff --git a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp index fcb86e3fb..bb36e2748 100644 --- a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp +++ b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp @@ -34,13 +34,13 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con interpreted = ir.Imm32(0.f); break; case AmdGpu::DataFormat::Format8: { - const auto unpacked = - ir.Unpack4x8(format_info.num_format, ir.LoadBufferU8(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU8(handle, address, info)); + const auto unpacked = ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } case AmdGpu::DataFormat::Format8_8: { - const auto raw = ir.LoadBufferU16(handle, address, info); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); const auto unpacked = ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0), ir.CompositeExtract(unpacked, 1)); @@ -51,8 +51,8 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con IR::U32{ir.LoadBufferU32(1, handle, address, info)}); break; case AmdGpu::DataFormat::Format16: { - const auto unpacked = - ir.Unpack2x16(format_info.num_format, ir.LoadBufferU16(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); + const auto unpacked = ir.Unpack2x16(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } @@ -126,7 +126,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I const auto packed = ir.Pack4x8(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU8(handle, address, packed, info); + ir.StoreBufferU8(handle, address, ir.UConvert(8, packed), info); break; } case AmdGpu::DataFormat::Format8_8: { @@ -134,7 +134,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value 
handle, const I ir.CompositeConstruct(ir.CompositeExtract(real_value, 0), ir.CompositeExtract(real_value, 1), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format8_8_8_8: { @@ -145,7 +145,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I case AmdGpu::DataFormat::Format16: { const auto packed = ir.Pack2x16(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format16_16: { diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 18c77e600..ba96d1034 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -17,6 +17,8 @@ using SharpLocation = u32; bool IsBufferAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::BufferAtomicIAdd32: + case IR::Opcode::BufferAtomicIAdd64: + case IR::Opcode::BufferAtomicISub32: case IR::Opcode::BufferAtomicSMin32: case IR::Opcode::BufferAtomicUMin32: case IR::Opcode::BufferAtomicSMax32: @@ -27,6 +29,7 @@ bool IsBufferAtomic(const IR::Inst& inst) { case IR::Opcode::BufferAtomicOr32: case IR::Opcode::BufferAtomicXor32: case IR::Opcode::BufferAtomicSwap32: + case IR::Opcode::BufferAtomicCmpSwap32: return true; default: return false; @@ -41,6 +44,7 @@ bool IsBufferStore(const IR::Inst& inst) { case IR::Opcode::StoreBufferU32x2: case IR::Opcode::StoreBufferU32x3: case IR::Opcode::StoreBufferU32x4: + case IR::Opcode::StoreBufferU64: case IR::Opcode::StoreBufferF32: case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: @@ -60,6 +64,7 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::LoadBufferU32x2: case IR::Opcode::LoadBufferU32x3: case IR::Opcode::LoadBufferU32x4: + case IR::Opcode::LoadBufferU64: case IR::Opcode::LoadBufferF32: case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: @@ -85,6 +90,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { case IR::Opcode::LoadBufferU16: case IR::Opcode::StoreBufferU16: return IR::Type::U16; + case IR::Opcode::LoadBufferU64: + case IR::Opcode::StoreBufferU64: + case IR::Opcode::BufferAtomicIAdd64: + return IR::Type::U64; case IR::Opcode::LoadBufferFormatF32: case IR::Opcode::StoreBufferFormatF32: // Formatted buffer loads can use a variety of types. 
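Usage note (an editorial sketch, not part of the patch; it reuses only emitter calls that appear in the hunks above, with illustrative variable names): the typed 8/16-bit buffer opcodes make width changes explicit, so callers widen narrow loads before 32-bit arithmetic and truncate packed values before narrow stores.

    // Narrow loads now yield U8/U16 and are widened explicitly before unpacking:
    const IR::U32 raw = IR::U32{ir.UConvert(32, ir.LoadBufferU8(handle, address, info))};
    // Narrow stores take U8/U16, so a packed 32-bit value is truncated first:
    ir.StoreBufferU8(handle, address, ir.UConvert(8, packed), info);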
diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp index baf6ad0d1..10d6a285c 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp @@ -9,12 +9,14 @@ namespace Shader::Optimization { static bool IsLoadShared(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::LoadSharedU32 || + return inst.GetOpcode() == IR::Opcode::LoadSharedU16 || + inst.GetOpcode() == IR::Opcode::LoadSharedU32 || inst.GetOpcode() == IR::Opcode::LoadSharedU64; } static bool IsWriteShared(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::WriteSharedU32 || + return inst.GetOpcode() == IR::Opcode::WriteSharedU16 || + inst.GetOpcode() == IR::Opcode::WriteSharedU32 || inst.GetOpcode() == IR::Opcode::WriteSharedU64; } diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index 12d4d0659..839a8ddc5 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -10,18 +10,23 @@ namespace Shader::Optimization { static bool IsSharedAccess(const IR::Inst& inst) { const auto opcode = inst.GetOpcode(); switch (opcode) { + case IR::Opcode::LoadSharedU16: case IR::Opcode::LoadSharedU32: case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: - case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicIAdd32: case IR::Opcode::SharedAtomicIAdd64: - case IR::Opcode::SharedAtomicOr32: - case IR::Opcode::SharedAtomicSMax32: - case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicISub32: case IR::Opcode::SharedAtomicSMin32: case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicInc32: + case IR::Opcode::SharedAtomicDec32: + case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicXor32: return true; default: @@ -41,14 +46,8 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ profile.supports_workgroup_explicit_memory_layout)) { return; } - // Add buffer binding for shared memory storage buffer. 
const u32 binding = static_cast<u32>(program.info.buffers.size()); - program.info.buffers.push_back({ - .used_types = IR::Type::U32, - .inline_cbuf = AmdGpu::Buffer::Null(), - .buffer_type = BufferType::SharedMemory, - .is_written = true, - }); + IR::Type used_types{}; for (IR::Block* const block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { if (!IsSharedAccess(inst)) { @@ -56,73 +55,106 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ } IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; const IR::U32 handle = ir.Imm32(binding); + const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex), + ir.Imm32(shared_memory_size)); + const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset); // Replace shared atomics first switch (inst.GetOpcode()) { - case IR::Opcode::SharedAtomicAnd32: - inst.ReplaceUsesWithAndRemove( - ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {})); - continue; case IR::Opcode::SharedAtomicIAdd32: + inst.ReplaceUsesWithAndRemove( + ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; + continue; case IR::Opcode::SharedAtomicIAdd64: inst.ReplaceUsesWithAndRemove( - ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {})); + ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U64; continue; - case IR::Opcode::SharedAtomicOr32: + case IR::Opcode::SharedAtomicISub32: inst.ReplaceUsesWithAndRemove( - ir.BufferAtomicOr(handle, inst.Arg(0), inst.Arg(1), {})); + ir.BufferAtomicISub(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; continue; - case IR::Opcode::SharedAtomicSMax32: - case IR::Opcode::SharedAtomicUMax32: { - const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; - inst.ReplaceUsesWithAndRemove( - ir.BufferAtomicIMax(handle, inst.Arg(0), inst.Arg(1), is_signed, {})); - continue; - } case IR::Opcode::SharedAtomicSMin32: case IR::Opcode::SharedAtomicUMin32: { const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; inst.ReplaceUsesWithAndRemove( - ir.BufferAtomicIMin(handle, inst.Arg(0), inst.Arg(1), is_signed, {})); + ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {})); + used_types |= IR::Type::U32; continue; } - case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicUMax32: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; inst.ReplaceUsesWithAndRemove( - ir.BufferAtomicXor(handle, inst.Arg(0), inst.Arg(1), {})); + ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {})); + used_types |= IR::Type::U32; + continue; + } + case IR::Opcode::SharedAtomicInc32: + inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {})); + used_types |= IR::Type::U32; + continue; + case IR::Opcode::SharedAtomicDec32: + inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {})); + used_types |= IR::Type::U32; + continue; + case IR::Opcode::SharedAtomicAnd32: + inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; + continue; + case IR::Opcode::SharedAtomicOr32: + inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; + continue; + case IR::Opcode::SharedAtomicXor32: + inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {})); + used_types |= IR::Type::U32; continue; default: break; } // Replace 
shared operations. - const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex), - ir.Imm32(shared_memory_size)); - const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset); switch (inst.GetOpcode()) { case IR::Opcode::LoadSharedU16: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {})); + used_types |= IR::Type::U16; break; case IR::Opcode::LoadSharedU32: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {})); + used_types |= IR::Type::U32; break; case IR::Opcode::LoadSharedU64: - inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {})); + inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {})); + used_types |= IR::Type::U64; break; case IR::Opcode::WriteSharedU16: - ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {}); + ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {}); inst.Invalidate(); + used_types |= IR::Type::U16; break; case IR::Opcode::WriteSharedU32: ir.StoreBufferU32(1, handle, address, inst.Arg(1), {}); inst.Invalidate(); + used_types |= IR::Type::U32; break; case IR::Opcode::WriteSharedU64: - ir.StoreBufferU32(2, handle, address, inst.Arg(1), {}); + ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {}); inst.Invalidate(); + used_types |= IR::Type::U64; break; default: break; } } } + // Add buffer binding for shared memory storage buffer. + program.info.buffers.push_back({ + .used_types = used_types, + .inline_cbuf = AmdGpu::Buffer::Null(), + .buffer_type = BufferType::SharedMemory, + .is_written = true, + }); } } // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index ed1e5536a..b92c5d555 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -265,6 +265,7 @@ using U32F32 = TypedValue<Type::U32 | Type::F32>; using U64F64 = TypedValue<Type::U64 | Type::F64>; using U32U64 = TypedValue<Type::U32 | Type::U64>; using U16U32U64 = TypedValue<Type::U16 | Type::U32 | Type::U64>; +using U8U16U32U64 = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>; using F32F64 = TypedValue<Type::F32 | Type::F64>; using F16F32F64 = TypedValue<Type::F16 | Type::F32 | Type::F64>; using UAny = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>; From fc4fd0107d5bde21e2a4fbdff6b502e6b39e9b7f Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Tue, 10 Jun 2025 17:43:11 -0500 Subject: [PATCH 5/7] libSceNpTrophy: Change initial context and handle values (#3080) * Change default context and handle values libSceNpToolkit internally uses context/handle values of zero to indicate that NpTrophy calls failed. This PR returns handles and contexts as index + 1 instead, avoiding the issue.
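A rough sketch of the resulting mapping (helper functions are hypothetical; the patch inlines this arithmetic at each call site):

    // Zero stays reserved as the failure sentinel that libSceNpToolkit checks for,
    // so every valid handle/context is its slot index + 1.
    OrbisNpTrophyHandle MakeHandle(u32 slot_index) {
        return static_cast<OrbisNpTrophyHandle>(slot_index + 1);
    }
    u32 ToSlotIndex(OrbisNpTrophyHandle handle) {
        return static_cast<u32>(handle) - 1; // callers reject the invalid handle 0 first
    }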
* Fix log message --- src/core/libraries/np_trophy/np_trophy.cpp | 30 +++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/core/libraries/np_trophy/np_trophy.cpp b/src/core/libraries/np_trophy/np_trophy.cpp index 6de84bd93..e3c5ce35e 100644 --- a/src/core/libraries/np_trophy/np_trophy.cpp +++ b/src/core/libraries/np_trophy/np_trophy.cpp @@ -164,10 +164,12 @@ s32 PS4_SYSV_ABI sceNpTrophyCreateContext(OrbisNpTrophyContext* context, int32_t } const auto ctx_id = trophy_contexts.insert(user_id, service_label); - contexts_internal[key].context_id = ctx_id.index; - LOG_INFO(Lib_NpTrophy, "New context = {}, user_id = {} service label = {}", ctx_id.index, - user_id, service_label); - *context = ctx_id.index; + + *context = ctx_id.index + 1; + contexts_internal[key].context_id = *context; + LOG_INFO(Lib_NpTrophy, "New context = {}, user_id = {} service label = {}", *context, user_id, + service_label); + return ORBIS_OK; } @@ -179,21 +181,23 @@ s32 PS4_SYSV_ABI sceNpTrophyCreateHandle(OrbisNpTrophyHandle* handle) { if (trophy_handles.size() >= MaxTrophyHandles) { return ORBIS_NP_TROPHY_ERROR_HANDLE_EXCEEDS_MAX; } - const auto handle_id = trophy_handles.insert(); - LOG_INFO(Lib_NpTrophy, "New handle = {}", handle_id.index); - *handle = handle_id.index; + const auto handle_id = trophy_handles.insert(); + + *handle = handle_id.index + 1; + LOG_INFO(Lib_NpTrophy, "New handle = {}", *handle); return ORBIS_OK; } int PS4_SYSV_ABI sceNpTrophyDestroyContext(OrbisNpTrophyContext context) { LOG_INFO(Lib_NpTrophy, "Destroyed Context {}", context); - if (context == ORBIS_NP_TROPHY_INVALID_CONTEXT) return ORBIS_NP_TROPHY_ERROR_INVALID_CONTEXT; + if (context == ORBIS_NP_TROPHY_INVALID_CONTEXT) { return ORBIS_NP_TROPHY_ERROR_INVALID_CONTEXT; + } Common::SlotId contextId; - contextId.index = context; + contextId.index = context - 1; ContextKey contextkey = trophy_contexts[contextId]; trophy_contexts.erase(contextId); @@ -206,15 +210,17 @@ s32 PS4_SYSV_ABI sceNpTrophyDestroyHandle(OrbisNpTrophyHandle handle) { if (handle == ORBIS_NP_TROPHY_INVALID_HANDLE) return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE; - if (handle >= trophy_handles.size()) { + s32 handle_index = handle - 1; + if (handle_index >= trophy_handles.size()) { LOG_ERROR(Lib_NpTrophy, "Invalid handle {}", handle); return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE; } - if (!trophy_handles.is_allocated({static_cast<u32>(handle)})) { + + if (!trophy_handles.is_allocated({static_cast<u32>(handle_index)})) { return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE; } - trophy_handles.erase({static_cast<u32>(handle)}); + trophy_handles.erase({static_cast<u32>(handle_index)}); LOG_INFO(Lib_NpTrophy, "Handle {} destroyed", handle); return ORBIS_OK; } From dedf6de2ac13b6543339ee5cdedc44ee0efd963c Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Wed, 11 Jun 2025 11:34:37 +0300 Subject: [PATCH 6/7] texture_cache: Implement color<->depth copies (#3079) * texture_cache: Implement color to depth copies and vice versa * ir_passes: Adjust shared memory barrier pass to cover more cases * texture_cache: Remove unused code * review comment --- .../ir/passes/shared_memory_barrier_pass.cpp | 35 ++++-- src/video_core/buffer_cache/buffer_cache.cpp | 10 +- src/video_core/buffer_cache/buffer_cache.h | 23 ++-- .../renderer_vulkan/vk_rasterizer.cpp | 4 +- src/video_core/texture_cache/image.cpp | 113 +++++++++++++++--- src/video_core/texture_cache/image.h | 3 +- .../texture_cache/texture_cache.cpp | 23 ++-- 7 files changed, 157 insertions(+), 54 deletions(-) diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp 
b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp index 10d6a285c..11713d099 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include <unordered_set> #include "shader_recompiler/ir/breadth_first_search.h" #include "shader_recompiler/ir/ir_emitter.h" #include "shader_recompiler/ir/program.h" @@ -51,11 +52,14 @@ static void EmitBarrierInBlock(IR::Block* block) { } } +using NodeSet = std::unordered_set<IR::Block*>; + // Inserts a barrier after divergent conditional blocks to avoid undefined // behavior when some threads write and others read from shared memory. -static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) { +static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data, + NodeSet& divergence_end, u32& divergence_depth) { const IR::U1 cond = data.if_node.cond; - const auto insert_barrier = + const auto is_divergent_cond = IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> { if (inst->GetOpcode() == IR::Opcode::GetAttributeU32 && inst->Arg(0).Attribute() == IR::Attribute::LocalInvocationId) { @@ -63,11 +67,15 @@ static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) { } return std::nullopt; }); - if (insert_barrier) { - IR::Block* const merge = data.if_node.merge; - auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi); - IR::IREmitter ir{*merge, insert_point}; - ir.Barrier(); + if (is_divergent_cond) { + if (divergence_depth == 0) { + IR::Block* const merge = data.if_node.merge; + auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi); + IR::IREmitter ir{*merge, insert_point}; + ir.Barrier(); + } + ++divergence_depth; + divergence_end.emplace(data.if_node.merge); } } @@ -89,19 +97,22 @@ void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_in return; } using Type = IR::AbstractSyntaxNode::Type; - u32 branch_depth{}; + u32 divergence_depth{}; + NodeSet divergence_end; for (const IR::AbstractSyntaxNode& node : program.syntax_list) { if (node.type == Type::EndIf) { - --branch_depth; + if (divergence_end.contains(node.data.end_if.merge)) { + --divergence_depth; + } continue; } // Check if branch depth is zero, we don't want to insert barrier in potentially divergent // code. 
- if (node.type == Type::If && branch_depth++ == 0) { - EmitBarrierInMergeBlock(node.data); + if (node.type == Type::If) { + EmitBarrierInMergeBlock(node.data, divergence_end, divergence_depth); continue; } - if (node.type == Type::Block && branch_depth == 0) { + if (node.type == Type::Block && divergence_depth == 0) { EmitBarrierInBlock(node.data.block); } } diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index e470f8e77..ffa744b31 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -23,6 +23,7 @@ static constexpr size_t DataShareBufferSize = 64_KB; static constexpr size_t StagingBufferSize = 512_MB; static constexpr size_t UboStreamBufferSize = 128_MB; static constexpr size_t DownloadBufferSize = 128_MB; +static constexpr size_t DeviceBufferSize = 16_MB; static constexpr size_t MaxPageFaults = 1024; BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, @@ -32,7 +33,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_}, staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize}, stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, - download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize), + download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize}, + device_buffer{instance, scheduler, MemoryUsage::DeviceLocal, DeviceBufferSize}, gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize}, bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, BDA_PAGETABLE_SIZE}, @@ -348,7 +350,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b return {&buffer, buffer.Offset(device_addr)}; } -std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) { +std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) { // Check if any buffer contains the full requested range. const u64 page = gpu_addr >> CACHING_PAGEBITS; const BufferId buffer_id = page_table[page].buffer_id; @@ -361,10 +363,10 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, } // If no buffer contains the full requested range but some buffer within was GPU-modified, // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications. - // This is only done if the request prefers to use GPU memory, otherwise we can skip it. - if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) { + if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) { return ObtainBuffer(gpu_addr, size, false, false); } + // In all other cases, just do a CPU copy to the staging buffer. const auto [data, offset] = staging_buffer.Map(size, 16); memory->CopySparseMemory(gpu_addr, data, size); diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index c2faf12c8..d7d753213 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -80,11 +80,6 @@ public: return &gds_buffer; } - /// Retrieves the host visible device local stream buffer. - [[nodiscard]] StreamBuffer& GetStreamBuffer() noexcept { - return stream_buffer; - } - /// Retrieves the device local DBA page table buffer. 
[[nodiscard]] Buffer* GetBdaPageTableBuffer() noexcept { return &bda_pagetable_buffer; @@ -100,6 +95,20 @@ public: return slot_buffers[id]; } + /// Retrieves a utility buffer optimized for specified memory usage. + StreamBuffer& GetUtilityBuffer(MemoryUsage usage) noexcept { + switch (usage) { + case MemoryUsage::Stream: + return stream_buffer; + case MemoryUsage::Download: + return download_buffer; + case MemoryUsage::Upload: + return staging_buffer; + case MemoryUsage::DeviceLocal: + return device_buffer; + } + } + /// Invalidates any buffer in the logical page range. void InvalidateMemory(VAddr device_addr, u64 size, bool unmap); @@ -121,8 +130,7 @@ public: BufferId buffer_id = {}); /// Attempts to obtain a buffer without modifying the cache contents. - [[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size, - bool prefer_gpu); + [[nodiscard]] std::pair<Buffer*, u32> ObtainBufferForImage(VAddr gpu_addr, u32 size); /// Return true when a region is registered on the cache [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); @@ -193,6 +201,7 @@ private: StreamBuffer staging_buffer; StreamBuffer stream_buffer; StreamBuffer download_buffer; + StreamBuffer device_buffer; Buffer gds_buffer; Buffer bda_pagetable_buffer; Buffer fault_buffer; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index dff4e5a5f..9dea5ceea 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -549,7 +549,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding const auto* gds_buf = buffer_cache.GetGdsBuffer(); buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes()); } else if (desc.buffer_type == Shader::BufferType::Flatbuf) { - auto& vk_buffer = buffer_cache.GetStreamBuffer(); + auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32); const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size, instance.UniformMinAlignment()); @@ -561,7 +561,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding const auto* fault_buffer = buffer_cache.GetFaultBuffer(); buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes()); } else if (desc.buffer_type == Shader::BufferType::SharedMemory) { - auto& lds_buffer = buffer_cache.GetStreamBuffer(); + auto& lds_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); const auto& cs_program = liverpool->GetCsRegs(); const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups(); const auto [data, offset] = diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 6241100a0..ab9111e6b 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -312,43 +312,121 @@ void Image::Upload(vk::Buffer buffer, u64 offset) { vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); } -void Image::CopyImage(const Image& image) { +void Image::CopyImage(const Image& src_image) { scheduler->EndRendering(); Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); auto cmdbuf = scheduler->CommandBuffer(); + const auto& src_info = src_image.info; boost::container::small_vector image_copy{}; - const u32 num_mips = 
std::min(src_info.resources.levels, info.resources.levels); for (u32 m = 0; m < num_mips; ++m) { - const auto mip_w = std::max(image.info.size.width >> m, 1u); - const auto mip_h = std::max(image.info.size.height >> m, 1u); - const auto mip_d = std::max(image.info.size.depth >> m, 1u); + const auto mip_w = std::max(src_info.size.width >> m, 1u); + const auto mip_h = std::max(src_info.size.height >> m, 1u); + const auto mip_d = std::max(src_info.size.depth >> m, 1u); image_copy.emplace_back(vk::ImageCopy{ .srcSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = m, .baseArrayLayer = 0, - .layerCount = image.info.resources.layers, + .layerCount = src_info.resources.layers, }, .dstSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = m, .baseArrayLayer = 0, - .layerCount = image.info.resources.layers, + .layerCount = src_info.resources.layers, }, .extent = {mip_w, mip_h, mip_d}, }); } - cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout, + cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, image_copy); Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); } -void Image::CopyMip(const Image& image, u32 mip, u32 slice) { +void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) { + const auto& src_info = src_image.info; + + vk::BufferImageCopy buffer_image_copy = { + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = + { + .aspectMask = src_info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = + { + .x = 0, + .y = 0, + .z = 0, + }, + .imageExtent = + { + .width = src_info.size.width, + .height = src_info.size.height, + .depth = src_info.size.depth, + }, + }; + + const vk::BufferMemoryBarrier2 pre_copy_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferRead, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer, + .offset = offset, + .size = VK_WHOLE_SIZE, + }; + + const vk::BufferMemoryBarrier2 post_copy_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferRead, + .buffer = buffer, + .offset = offset, + .size = VK_WHOLE_SIZE, + }; + + scheduler->EndRendering(); + src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); + + auto cmdbuf = scheduler->CommandBuffer(); + + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_copy_barrier, + }); + + cmdbuf.copyImageToBuffer(src_image.image, vk::ImageLayout::eTransferSrcOptimal, buffer, + buffer_image_copy); + + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_copy_barrier, + }); + + buffer_image_copy.imageSubresource.aspectMask = + info.IsDepthStencil() ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor; + + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + buffer_image_copy); +} + +void Image::CopyMip(const Image& src_image, u32 mip, u32 slice) { scheduler->EndRendering(); Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); @@ -358,26 +436,27 @@ void Image::CopyMip(const Image& image, u32 mip, u32 slice) { const auto mip_h = std::max(info.size.height >> mip, 1u); const auto mip_d = std::max(info.size.depth >> mip, 1u); - ASSERT(mip_w == image.info.size.width); - ASSERT(mip_h == image.info.size.height); + const auto& src_info = src_image.info; + ASSERT(mip_w == src_info.size.width); + ASSERT(mip_h == src_info.size.height); - const u32 num_layers = std::min(image.info.resources.layers, info.resources.layers); + const u32 num_layers = std::min(src_info.resources.layers, info.resources.layers); const vk::ImageCopy image_copy{ .srcSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = num_layers, }, .dstSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = mip, .baseArrayLayer = slice, .layerCount = num_layers, }, .extent = {mip_w, mip_h, mip_d}, }; - cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout, + cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, image_copy); Transit(vk::ImageLayout::eGeneral, diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index 404e25e88..31b67e021 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -104,7 +104,8 @@ struct Image { std::optional range, vk::CommandBuffer cmdbuf = {}); void Upload(vk::Buffer buffer, u64 offset); - void CopyImage(const Image& image); + void CopyImage(const Image& src_image); + void CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset); void CopyMip(const Image& src_image, u32 mip, u32 slice); bool IsTracked() { diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index cc244eb6b..a47e858ab 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -8,7 +8,6 @@ #include "common/debug.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" -#include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/texture_cache/host_compatibility.h" @@ -126,7 +125,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) { ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, BindingType binding, ImageId cache_image_id) { - const auto& cache_image = slot_images[cache_image_id]; + auto& cache_image = slot_images[cache_image_id]; if (!cache_image.info.IsDepthStencil() && !requested_info.IsDepthStencil()) { return {}; @@ -169,18 +168,21 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi } if (recreate) { - auto new_info{requested_info}; - new_info.resources = std::max(requested_info.resources, cache_image.info.resources); - new_info.UpdateSize(); + auto new_info = requested_info; + new_info.resources = std::min(requested_info.resources, cache_image.info.resources); const auto new_image_id = 
slot_images.insert(instance, scheduler, new_info); RegisterImage(new_image_id); // Inherit image usage - auto& new_image = GetImage(new_image_id); + auto& new_image = slot_images[new_image_id]; new_image.usage = cache_image.usage; + new_image.flags &= ~ImageFlagBits::Dirty; - // TODO: perform a depth copy here + // Perform depth<->color copy using the intermediate copy buffer. + const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal); + new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0); + // Free the cache image. FreeImage(cache_image_id); return new_image_id; } @@ -584,12 +586,11 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const VAddr image_addr = image.info.guest_address; const size_t image_size = image.info.guest_size; - const auto [vk_buffer, buf_offset] = - buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty); + const auto [vk_buffer, buf_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size); const auto cmdbuf = sched_ptr->CommandBuffer(); - // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW - // hazard + + // The obtained buffer may be GPU modified so we need to emit a barrier to prevent RAW hazard if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead, vk::PipelineStageFlagBits2::eTransfer)) { cmdbuf.pipelineBarrier2(vk::DependencyInfo{ From 274182954551d429c77e8b88ec395ae8726a0127 Mon Sep 17 00:00:00 2001 From: georgemoralis Date: Wed, 11 Jun 2025 12:02:59 +0300 Subject: [PATCH 7/7] New translations en_us.ts (Arabic) (#3081) --- src/qt_gui/translations/ar_SA.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qt_gui/translations/ar_SA.ts b/src/qt_gui/translations/ar_SA.ts index 26e768720..7d0c15e6b 100644 --- a/src/qt_gui/translations/ar_SA.ts +++ b/src/qt_gui/translations/ar_SA.ts @@ -2049,7 +2049,7 @@ Nightly: نُسخ تحتوي على أحدث الميزات، لكنها أقل <source>* Unsupported Vulkan Version</source> - <translation>* Unsupported Vulkan Version</translation> + <translation>نسخ Vulkan غير مدعومة</translation>