From 64bbedeb82ce95b4ba328fb3b4597a6b6f3b2dd0 Mon Sep 17 00:00:00 2001 From: Connor Garey Date: Mon, 9 Jun 2025 23:25:57 +0100 Subject: [PATCH 01/14] changed package name to openal-soft-devel reflecting the fedora name package change (#3069) --- documents/building-linux.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documents/building-linux.md b/documents/building-linux.md index bd07b2eff..61d067881 100644 --- a/documents/building-linux.md +++ b/documents/building-linux.md @@ -25,7 +25,7 @@ sudo apt install build-essential clang git cmake libasound2-dev \ ```bash sudo dnf install clang git cmake libatomic alsa-lib-devel \ - pipewire-jack-audio-connection-kit-devel openal-devel \ + pipewire-jack-audio-connection-kit-devel openal-soft-devel \ openssl-devel libevdev-devel libudev-devel libXext-devel \ qt6-qtbase-devel qt6-qtbase-private-devel \ qt6-qtmultimedia-devel qt6-qtsvg-devel qt6-qttools-devel \ From 0444e590e071c565f548b1fdb48d4e06f0eba3b7 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Mon, 9 Jun 2025 19:29:15 -0700 Subject: [PATCH 02/14] mac: Fix building on macOS 26. (#3073) --- CMakeLists.txt | 13 +++++-------- externals/MoltenVK/MoltenVK | 2 +- externals/MoltenVK/SPIRV-Cross | 2 +- src/core/signals.cpp | 1 + src/core/tls.cpp | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dfe9348a..7c2739d22 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1119,6 +1119,10 @@ if (APPLE) set(MVK_BUNDLE_PATH "Resources/vulkan/icd.d") set_property(TARGET shadps4 APPEND PROPERTY BUILD_RPATH "@executable_path/../${MVK_BUNDLE_PATH}") set(MVK_DST ${CMAKE_CURRENT_BINARY_DIR}/shadps4.app/Contents/${MVK_BUNDLE_PATH}) + + add_custom_command( + OUTPUT ${MVK_DST} + COMMAND ${CMAKE_COMMAND} -E make_directory ${MVK_DST}) else() set_property(TARGET shadps4 APPEND PROPERTY BUILD_RPATH "@executable_path") set(MVK_DST ${CMAKE_CURRENT_BINARY_DIR}) @@ -1129,9 +1133,6 @@ if (APPLE) set(MVK_ICD_SRC ${CMAKE_CURRENT_SOURCE_DIR}/externals/MoltenVK/MoltenVK/MoltenVK/icd/MoltenVK_icd.json) set(MVK_ICD_DST ${MVK_DST}/MoltenVK_icd.json) - add_custom_command( - OUTPUT ${MVK_DST} - COMMAND ${CMAKE_COMMAND} -E make_directory ${MVK_DST}) add_custom_command( OUTPUT ${MVK_ICD_DST} DEPENDS ${MVK_ICD_SRC} ${MVK_DST} @@ -1146,17 +1147,13 @@ if (APPLE) if (ARCHITECTURE STREQUAL "x86_64") # Reserve system-managed memory space. 
- target_link_options(shadps4 PRIVATE -Wl,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x4000,-segaddr,TCB_SPACE,0x4000,-segaddr,SYSTEM_MANAGED,0x400000,-segaddr,SYSTEM_RESERVED,0x7FFFFC000,-image_base,0x20000000000) + target_link_options(shadps4 PRIVATE -Wl,-ld_classic,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x4000,-segaddr,TCB_SPACE,0x4000,-segaddr,SYSTEM_MANAGED,0x400000,-segaddr,SYSTEM_RESERVED,0x7FFFFC000,-image_base,0x20000000000) endif() # Replacement for std::chrono::time_zone target_link_libraries(shadps4 PRIVATE date::date-tz) endif() -if (NOT ENABLE_QT_GUI) - target_link_libraries(shadps4 PRIVATE SDL3::SDL3) -endif() - if (ENABLE_QT_GUI) target_link_libraries(shadps4 PRIVATE Qt6::Widgets Qt6::Concurrent Qt6::Network Qt6::Multimedia) add_definitions(-DENABLE_QT_GUI) diff --git a/externals/MoltenVK/MoltenVK b/externals/MoltenVK/MoltenVK index 3a0b07a24..00abd384c 160000 --- a/externals/MoltenVK/MoltenVK +++ b/externals/MoltenVK/MoltenVK @@ -1 +1 @@ -Subproject commit 3a0b07a24a4a681ffe70b461b1f4333b2729e2ef +Subproject commit 00abd384ce01cbd439045905d2fa6cf799dfa2f6 diff --git a/externals/MoltenVK/SPIRV-Cross b/externals/MoltenVK/SPIRV-Cross index 969e75f7c..1a69a919f 160000 --- a/externals/MoltenVK/SPIRV-Cross +++ b/externals/MoltenVK/SPIRV-Cross @@ -1 +1 @@ -Subproject commit 969e75f7cc0718774231d029f9d52fa87d4ae1b2 +Subproject commit 1a69a919fa302e92b337594bd0a8aaea61037d91 diff --git a/src/core/signals.cpp b/src/core/signals.cpp index e47a78cd2..4099ac237 100644 --- a/src/core/signals.cpp +++ b/src/core/signals.cpp @@ -11,6 +11,7 @@ #include #else #include +#include #ifdef ARCH_X86_64 #include #endif diff --git a/src/core/tls.cpp b/src/core/tls.cpp index e13c683e1..0d1d514cf 100644 --- a/src/core/tls.cpp +++ b/src/core/tls.cpp @@ -51,7 +51,7 @@ Tcb* GetTcbBase() { // Apple x86_64 // Reserve space in the 32-bit address range for allocating TCB pages. -asm(".zerofill TCB_SPACE,TCB_SPACE,__guest_system,0x3FC000"); +asm(".zerofill TCB_SPACE,TCB_SPACE,__tcb_space,0x3FC000"); struct LdtPage { void* tcb; From e2b726382ea4a156eefc52bcb7cac06713563e5e Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Mon, 9 Jun 2025 19:48:20 -0700 Subject: [PATCH 03/14] vulkan: Fix two validation errors introduced by shared memory changes. 
(#3074) --- .../passes/shared_memory_to_storage_pass.cpp | 8 ++++---- .../renderer_vulkan/vk_instance.cpp | 20 ++++++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index 409c05940..12d4d0659 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -34,11 +34,11 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ if (program.info.stage != Stage::Compute) { return; } - // Only perform the transform if the host shared memory is insufficient - // or the device does not support VK_KHR_workgroup_memory_explicit_layout + // Only perform the transform if there is shared memory and either host shared memory is + // insufficient or the device does not support VK_KHR_workgroup_memory_explicit_layout const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size; - if (shared_memory_size <= profile.max_shared_memory_size && - profile.supports_workgroup_explicit_memory_layout) { + if (shared_memory_size == 0 || (shared_memory_size <= profile.max_shared_memory_size && + profile.supports_workgroup_explicit_memory_layout)) { return; } // Add buffer binding for shared memory storage buffer. diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 0591e06ce..63c0a38d6 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -445,7 +445,25 @@ bool Instance::CreateDevice() { workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess, }, #ifdef __APPLE__ - portability_features, + vk::PhysicalDevicePortabilitySubsetFeaturesKHR{ + .constantAlphaColorBlendFactors = portability_features.constantAlphaColorBlendFactors, + .events = portability_features.events, + .imageViewFormatReinterpretation = portability_features.imageViewFormatReinterpretation, + .imageViewFormatSwizzle = portability_features.imageViewFormatSwizzle, + .imageView2DOn3DImage = portability_features.imageView2DOn3DImage, + .multisampleArrayImage = portability_features.multisampleArrayImage, + .mutableComparisonSamplers = portability_features.mutableComparisonSamplers, + .pointPolygons = portability_features.pointPolygons, + .samplerMipLodBias = portability_features.samplerMipLodBias, + .separateStencilMaskRef = portability_features.separateStencilMaskRef, + .shaderSampleRateInterpolationFunctions = + portability_features.shaderSampleRateInterpolationFunctions, + .tessellationIsolines = portability_features.tessellationIsolines, + .tessellationPointMode = portability_features.tessellationPointMode, + .triangleFans = portability_features.triangleFans, + .vertexAttributeAccessBeyondStride = + portability_features.vertexAttributeAccessBeyondStride, + }, #endif }; From e0c930f2d801e0d2998202760b785a76f9346ecd Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Tue, 10 Jun 2025 18:57:16 +0300 Subject: [PATCH 04/14] shader_recompiler: Cleanup fragment attribute handling (#3076) * image: Take minimum of mip levels Avoids validation error * texture_cache: Update depth target image Avoids using undefined depth target in rendering * shader_recompiler: Cleanup fragment attribute handling --- .../backend/spirv/spirv_emit_context.cpp | 28 ++++++++++--------- .../translate/vector_interpolation.cpp | 12 ++++---- 
src/video_core/texture_cache/image.cpp | 3 +- .../texture_cache/texture_cache.cpp | 2 +- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 672856397..c47a75739 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -299,8 +299,7 @@ void EmitContext::DefineInterpolatedAttribs() { // Iterate all input attributes, load them and manually interpolate. for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) { const auto& input = runtime_info.fs_info.inputs[i]; - const u32 semantic = input.param_index; - auto& params = input_params[semantic]; + auto& params = input_params[i]; if (input.is_flat || params.is_loaded) { continue; } @@ -318,7 +317,7 @@ void EmitContext::DefineInterpolatedAttribs() { const Id p10_y{OpVectorTimesScalar(F32[4], p10, bary_coord_y)}; const Id p20_z{OpVectorTimesScalar(F32[4], p20, bary_coord_z)}; params.id = OpFAdd(F32[4], p0, OpFAdd(F32[4], p10_y, p20_z)); - Name(params.id, fmt::format("fs_in_attr{}", semantic)); + Name(params.id, fmt::format("fs_in_attr{}", i)); params.is_loaded = true; } } @@ -427,25 +426,28 @@ void EmitContext::DefineInputs() { } for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) { const auto& input = runtime_info.fs_info.inputs[i]; - const u32 semantic = input.param_index; - ASSERT(semantic < IR::NumParams); if (input.IsDefault()) { - input_params[semantic] = { - MakeDefaultValue(*this, input.default_value), input_f32, F32[1], 4, false, true, + input_params[i] = { + .id = MakeDefaultValue(*this, input.default_value), + .pointer_type = input_f32, + .component_type = F32[1], + .num_components = 4, + .is_integer = false, + .is_loaded = true, }; continue; } - const IR::Attribute param{IR::Attribute::Param0 + input.param_index}; + const IR::Attribute param{IR::Attribute::Param0 + i}; const u32 num_components = info.loads.NumComponents(param); const Id type{F32[num_components]}; Id attr_id{}; if (profile.needs_manual_interpolation && !input.is_flat) { - attr_id = DefineInput(TypeArray(type, ConstU32(3U)), semantic); + attr_id = DefineInput(TypeArray(type, ConstU32(3U)), input.param_index); Decorate(attr_id, spv::Decoration::PerVertexKHR); - Name(attr_id, fmt::format("fs_in_attr{}_p", semantic)); + Name(attr_id, fmt::format("fs_in_attr{}_p", i)); } else { - attr_id = DefineInput(type, semantic); - Name(attr_id, fmt::format("fs_in_attr{}", semantic)); + attr_id = DefineInput(type, input.param_index); + Name(attr_id, fmt::format("fs_in_attr{}", i)); if (input.is_flat) { Decorate(attr_id, spv::Decoration::Flat); @@ -453,7 +455,7 @@ void EmitContext::DefineInputs() { Decorate(attr_id, spv::Decoration::NoPerspective); } } - input_params[semantic] = + input_params[i] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components, false); } break; diff --git a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp index 2d7297c12..5a287dbe2 100644 --- a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp +++ b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp @@ -22,15 +22,17 @@ void Translator::EmitVectorInterpolation(const GcnInst& inst) { // VINTRP void Translator::V_INTERP_P2_F32(const GcnInst& inst) { - const auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); - 
info.interp_qualifiers[attr.param_index] = vgpr_to_interp[inst.src[0].code]; - const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; + const u32 attr_index = inst.control.vintrp.attr; + const auto& attr = runtime_info.fs_info.inputs.at(attr_index); + info.interp_qualifiers[attr_index] = vgpr_to_interp[inst.src[0].code]; + const IR::Attribute attrib{IR::Attribute::Param0 + attr_index}; SetDst(inst.dst[0], ir.GetAttribute(attrib, inst.control.vintrp.chan)); } void Translator::V_INTERP_MOV_F32(const GcnInst& inst) { - const auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); - const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; + const u32 attr_index = inst.control.vintrp.attr; + const auto& attr = runtime_info.fs_info.inputs.at(attr_index); + const IR::Attribute attrib{IR::Attribute::Param0 + attr_index}; SetDst(inst.dst[0], ir.GetAttribute(attrib, inst.control.vintrp.chan)); } diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index d8070da61..6241100a0 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -319,7 +319,8 @@ void Image::CopyImage(const Image& image) { auto cmdbuf = scheduler->CommandBuffer(); boost::container::small_vector image_copy{}; - for (u32 m = 0; m < image.info.resources.levels; ++m) { + const u32 num_mips = std::min(image.info.resources.levels, info.resources.levels); + for (u32 m = 0; m < num_mips; ++m) { const auto mip_w = std::max(image.info.size.width >> m, 1u); const auto mip_h = std::max(image.info.size.height >> m, 1u); const auto mip_d = std::max(image.info.size.depth >> m, 1u); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index f070b9132..cc244eb6b 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -461,9 +461,9 @@ ImageView& TextureCache::FindDepthTarget(BaseDesc& desc) { const ImageId image_id = FindImage(desc); Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; - image.flags &= ~ImageFlagBits::Dirty; image.usage.depth_target = 1u; image.usage.stencil = image.info.HasStencil(); + UpdateImage(image_id); // Register meta data for this depth buffer if (!(image.flags & ImageFlagBits::MetaRegistered)) { From 9981c8df03dcaf00d3e3d6b59f731961424397d5 Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Tue, 10 Jun 2025 21:30:45 +0200 Subject: [PATCH 05/14] Add option to ignore game patch (#3039) * impl * fix * cleanup * more * clang + * why --- src/core/file_sys/fs.cpp | 6 ++++-- src/core/file_sys/fs.h | 1 + src/emulator.cpp | 2 +- src/main.cpp | 26 +++++++++++++++----------- src/qt_gui/main.cpp | 32 ++++++++++++++++++-------------- 5 files changed, 39 insertions(+), 28 deletions(-) diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 4dad44874..b237ab7d9 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp @@ -10,6 +10,8 @@ namespace Core::FileSys { +bool MntPoints::ignore_game_patches = false; + std::string RemoveTrailingSlashes(const std::string& path) { // Remove trailing slashes to make comparisons simpler. 
std::string path_sanitized = path; @@ -77,7 +79,7 @@ std::filesystem::path MntPoints::GetHostPath(std::string_view path, bool* is_rea patch_path /= rel_path; if ((corrected_path.starts_with("/app0") || corrected_path.starts_with("/hostapp")) && - !force_base_path && std::filesystem::exists(patch_path)) { + !force_base_path && !ignore_game_patches && std::filesystem::exists(patch_path)) { return patch_path; } @@ -137,7 +139,7 @@ std::filesystem::path MntPoints::GetHostPath(std::string_view path, bool* is_rea return std::optional(current_path); }; - if (!force_base_path) { + if (!force_base_path && !ignore_game_patches) { if (const auto path = search(patch_path)) { return *path; } diff --git a/src/core/file_sys/fs.h b/src/core/file_sys/fs.h index 6638b48e8..4a2aa56c1 100644 --- a/src/core/file_sys/fs.h +++ b/src/core/file_sys/fs.h @@ -21,6 +21,7 @@ class MntPoints { static constexpr bool NeedsCaseInsensitiveSearch = true; #endif public: + static bool ignore_game_patches; struct MntPair { std::filesystem::path host_path; std::string mount; // e.g /app0 diff --git a/src/emulator.cpp b/src/emulator.cpp index bb50b8686..f50147818 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -75,7 +75,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector ar game_folder_name.ends_with("-UPDATE") || game_folder_name.ends_with("-patch")) { // If an executable was launched from a separate update directory, // use the base game directory as the game folder. - const auto base_name = game_folder_name.substr(0, game_folder_name.size() - 7); + const std::string base_name = game_folder_name.substr(0, game_folder_name.rfind('-')); const auto base_path = game_folder.parent_path() / base_name; if (std::filesystem::is_directory(base_path)) { game_folder = base_path; diff --git a/src/main.cpp b/src/main.cpp index 85581774b..8a251c55a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -35,17 +35,19 @@ int main(int argc, char* argv[]) { std::unordered_map> arg_map = { {"-h", [&](int&) { - std::cout << "Usage: shadps4 [options] \n" - "Options:\n" - " -g, --game Specify game path to launch\n" - " -- ... Parameters passed to the game ELF. " - "Needs to be at the end of the line, and everything after \"--\" is a " - "game argument.\n" - " -p, --patch Apply specified patch file\n" - " -f, --fullscreen Specify window initial fullscreen " - "state. Does not overwrite the config file.\n" - " --add-game-folder Adds a new game folder to the config.\n" - " -h, --help Display this help message\n"; + std::cout + << "Usage: shadps4 [options] \n" + "Options:\n" + " -g, --game Specify game path to launch\n" + " -- ... Parameters passed to the game ELF. " + "Needs to be at the end of the line, and everything after \"--\" is a " + "game argument.\n" + " -p, --patch Apply specified patch file\n" + " -i, --ignore-game-patch Disable automatic loading of game patch\n" + " -f, --fullscreen Specify window initial fullscreen " + "state. 
Does not overwrite the config file.\n" + " --add-game-folder Adds a new game folder to the config.\n" + " -h, --help Display this help message\n"; exit(0); }}, {"--help", [&](int& i) { arg_map["-h"](i); }}, @@ -72,6 +74,8 @@ int main(int argc, char* argv[]) { } }}, {"--patch", [&](int& i) { arg_map["-p"](i); }}, + {"-i", [&](int&) { Core::FileSys::MntPoints::ignore_game_patches = true; }}, + {"--ignore-game-patch", [&](int& i) { arg_map["-i"](i); }}, {"-f", [&](int& i) { if (++i >= argc) { diff --git a/src/qt_gui/main.cpp b/src/qt_gui/main.cpp index bd9dca6ce..b7de517e8 100644 --- a/src/qt_gui/main.cpp +++ b/src/qt_gui/main.cpp @@ -41,20 +41,22 @@ int main(int argc, char* argv[]) { std::unordered_map> arg_map = { {"-h", [&](int&) { - std::cout << "Usage: shadps4 [options]\n" - "Options:\n" - " No arguments: Opens the GUI.\n" - " -g, --game Specify or " - " to launch\n" - " -- ... Parameters passed to the game ELF. " - "Needs to be at the end of the line, and everything after \"--\" is a " - "game argument.\n" - " -p, --patch Apply specified patch file\n" - " -s, --show-gui Show the GUI\n" - " -f, --fullscreen Specify window initial fullscreen " - "state. Does not overwrite the config file.\n" - " --add-game-folder Adds a new game folder to the config.\n" - " -h, --help Display this help message\n"; + std::cout + << "Usage: shadps4 [options]\n" + "Options:\n" + " No arguments: Opens the GUI.\n" + " -g, --game Specify or " + " to launch\n" + " -- ... Parameters passed to the game ELF. " + "Needs to be at the end of the line, and everything after \"--\" is a " + "game argument.\n" + " -p, --patch Apply specified patch file\n" + " -i, --ignore-game-patch Disable automatic loading of game patch\n" + " -s, --show-gui Show the GUI\n" + " -f, --fullscreen Specify window initial fullscreen " + "state. Does not overwrite the config file.\n" + " --add-game-folder Adds a new game folder to the config.\n" + " -h, --help Display this help message\n"; exit(0); }}, {"--help", [&](int& i) { arg_map["-h"](i); }}, // Redirect --help to -h @@ -84,6 +86,8 @@ int main(int argc, char* argv[]) { } }}, {"--patch", [&](int& i) { arg_map["-p"](i); }}, + {"-i", [&](int&) { Core::FileSys::MntPoints::ignore_game_patches = true; }}, + {"--ignore-game-patch", [&](int& i) { arg_map["-i"](i); }}, {"-f", [&](int& i) { if (++i >= argc) { From b49340dff8e28abcf96fe07ad0e90c4dda0bcaf2 Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:22:50 -0500 Subject: [PATCH 06/14] libSceVideodec2: Update structs to match newer firmwares (#3077) * Update file_system.cpp * libSceVideodec2 struct fixes Our code was based on an old version of the libSceVideodec2 library. Based on what I've decompiled, these structs changed somewhere around firmware 6.50, and newer versions of the library have these flexible checks to accommodate both variants of the structs. * Static assert for AvcPictureInfo struct All the other Videodec2 structs have static asserts, might as well use one here too. * Initialize new values Set proper values for frameFormat and framePitchInBytes. `frame->linesize[0]` appears to be in bytes already, I'm not sure if that means framePitch is being set wrong though. 
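For reference, the relaxed checks this patch introduces accept both struct variants
because OR-ing the pre-6.50 thisSize with the size delta yields the current size,
while any other size still fails the comparison. Illustrative asserts, not part of
the change itself (0x30 and 0x38/0x78 come from the static_asserts in this diff;
0x68 is the old picture info size implied by the `| 16` check):

    static_assert((0x30 | 8) == 0x38);  // old OrbisVideodec2OutputInfo (0x30) passes
    static_assert((0x38 | 8) == 0x38);  // new OrbisVideodec2OutputInfo (0x38) passes
    static_assert((0x68 | 16) == 0x78); // both OrbisVideodec2AvcPictureInfo variants pass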
--- src/core/libraries/kernel/file_system.cpp | 1 + src/core/libraries/videodec/videodec2.cpp | 6 +++--- src/core/libraries/videodec/videodec2.h | 4 +++- src/core/libraries/videodec/videodec2_avc.h | 17 +++++++++++++++++ src/core/libraries/videodec/videodec2_impl.cpp | 4 ++++ 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/core/libraries/kernel/file_system.cpp b/src/core/libraries/kernel/file_system.cpp index ad372325c..fecc606fd 100644 --- a/src/core/libraries/kernel/file_system.cpp +++ b/src/core/libraries/kernel/file_system.cpp @@ -1050,6 +1050,7 @@ void RegisterFileSystem(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("4wSze92BhLI", "libkernel", 1, "libkernel", 1, 1, sceKernelWrite); LIB_FUNCTION("+WRlkKjZvag", "libkernel", 1, "libkernel", 1, 1, readv); LIB_FUNCTION("YSHRBRLn2pI", "libkernel", 1, "libkernel", 1, 1, writev); + LIB_FUNCTION("kAt6VDbHmro", "libkernel", 1, "libkernel", 1, 1, sceKernelWritev); LIB_FUNCTION("Oy6IpwgtYOk", "libScePosix", 1, "libkernel", 1, 1, posix_lseek); LIB_FUNCTION("Oy6IpwgtYOk", "libkernel", 1, "libkernel", 1, 1, posix_lseek); LIB_FUNCTION("oib76F-12fk", "libkernel", 1, "libkernel", 1, 1, sceKernelLseek); diff --git a/src/core/libraries/videodec/videodec2.cpp b/src/core/libraries/videodec/videodec2.cpp index 4f9379151..1c6044fe2 100644 --- a/src/core/libraries/videodec/videodec2.cpp +++ b/src/core/libraries/videodec/videodec2.cpp @@ -140,7 +140,7 @@ s32 PS4_SYSV_ABI sceVideodec2Flush(OrbisVideodec2Decoder decoder, return ORBIS_VIDEODEC2_ERROR_ARGUMENT_POINTER; } if (frameBuffer->thisSize != sizeof(OrbisVideodec2FrameBuffer) || - outputInfo->thisSize != sizeof(OrbisVideodec2OutputInfo)) { + (outputInfo->thisSize | 8) != sizeof(OrbisVideodec2OutputInfo)) { LOG_ERROR(Lib_Vdec2, "Invalid struct size"); return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE; } @@ -167,7 +167,7 @@ s32 PS4_SYSV_ABI sceVideodec2GetPictureInfo(const OrbisVideodec2OutputInfo* outp LOG_ERROR(Lib_Vdec2, "Invalid arguments"); return ORBIS_VIDEODEC2_ERROR_ARGUMENT_POINTER; } - if (outputInfo->thisSize != sizeof(OrbisVideodec2OutputInfo)) { + if ((outputInfo->thisSize | 8) != sizeof(OrbisVideodec2OutputInfo)) { LOG_ERROR(Lib_Vdec2, "Invalid struct size"); return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE; } @@ -179,7 +179,7 @@ s32 PS4_SYSV_ABI sceVideodec2GetPictureInfo(const OrbisVideodec2OutputInfo* outp if (p1stPictureInfoOut) { OrbisVideodec2AvcPictureInfo* picInfo = static_cast(p1stPictureInfoOut); - if (picInfo->thisSize != sizeof(OrbisVideodec2AvcPictureInfo)) { + if ((picInfo->thisSize | 16) != sizeof(OrbisVideodec2AvcPictureInfo)) { LOG_ERROR(Lib_Vdec2, "Invalid struct size"); return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE; } diff --git a/src/core/libraries/videodec/videodec2.h b/src/core/libraries/videodec/videodec2.h index abc8f8ab5..410ee8ea6 100644 --- a/src/core/libraries/videodec/videodec2.h +++ b/src/core/libraries/videodec/videodec2.h @@ -73,8 +73,10 @@ struct OrbisVideodec2OutputInfo { u32 frameHeight; void* frameBuffer; u64 frameBufferSize; + u32 frameFormat; + u32 framePitchInBytes; }; -static_assert(sizeof(OrbisVideodec2OutputInfo) == 0x30); +static_assert(sizeof(OrbisVideodec2OutputInfo) == 0x38); struct OrbisVideodec2FrameBuffer { u64 thisSize; diff --git a/src/core/libraries/videodec/videodec2_avc.h b/src/core/libraries/videodec/videodec2_avc.h index 22293ee93..1975209cb 100644 --- a/src/core/libraries/videodec/videodec2_avc.h +++ b/src/core/libraries/videodec/videodec2_avc.h @@ -55,6 +55,23 @@ struct OrbisVideodec2AvcPictureInfo { u8 pic_struct; u8 field_pic_flag; u8 
bottom_field_flag; + + u8 sequenceParameterSetPresentFlag; + u8 pictureParameterSetPresentFlag; + u8 auDelimiterPresentFlag; + u8 endOfSequencePresentFlag; + u8 endOfStreamPresentFlag; + u8 fillerDataPresentFlag; + u8 pictureTimingSeiPresentFlag; + u8 bufferingPeriodSeiPresentFlag; + + u8 constraint_set0_flag; + u8 constraint_set1_flag; + u8 constraint_set2_flag; + u8 constraint_set3_flag; + u8 constraint_set4_flag; + u8 constraint_set5_flag; }; +static_assert(sizeof(OrbisVideodec2AvcPictureInfo) == 0x78); } // namespace Libraries::Vdec2 \ No newline at end of file diff --git a/src/core/libraries/videodec/videodec2_impl.cpp b/src/core/libraries/videodec/videodec2_impl.cpp index 22b17c86c..a643239a3 100644 --- a/src/core/libraries/videodec/videodec2_impl.cpp +++ b/src/core/libraries/videodec/videodec2_impl.cpp @@ -48,6 +48,7 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData, outputInfo.isValid = false; outputInfo.isErrorFrame = true; outputInfo.pictureCount = 0; + outputInfo.frameFormat = 0; if (!inputData.auData) { return ORBIS_VIDEODEC2_ERROR_ACCESS_UNIT_POINTER; @@ -106,6 +107,7 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData, outputInfo.frameWidth = frame->width; outputInfo.frameHeight = frame->height; outputInfo.framePitch = frame->linesize[0]; + outputInfo.framePitchInBytes = frame->linesize[0]; outputInfo.frameBufferSize = frameBuffer.frameBufferSize; outputInfo.frameBuffer = frameBuffer.frameBuffer; @@ -144,6 +146,7 @@ s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer, outputInfo.isValid = false; outputInfo.isErrorFrame = true; outputInfo.pictureCount = 0; + outputInfo.frameFormat = 0; AVFrame* frame = av_frame_alloc(); if (!frame) { @@ -175,6 +178,7 @@ s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer, outputInfo.frameWidth = frame->width; outputInfo.frameHeight = frame->height; outputInfo.framePitch = frame->linesize[0]; + outputInfo.framePitchInBytes = frame->linesize[0]; outputInfo.frameBufferSize = frameBuffer.frameBufferSize; outputInfo.frameBuffer = frameBuffer.frameBuffer; From ca92e72efe6a041ce27c5e7473b62abf99e8f4c2 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:41:58 -0700 Subject: [PATCH 07/14] shader_recompiler: Various fixes to shared memory and atomics. (#3075) * shader_recompiler: Various fixes to shared memory and atomics. * shader_recompiler: Re-type non-32bit load/stores. 
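All of the reworked bounds-check helpers share one shape: compare the index of the
last accessed component against the buffer size, then either branch around the
operation (stores and atomics, which must not be executed when out of bounds) or
select the loaded value against a typed zero (loads). Roughly, as pseudocode rather
than the emitted SPIR-V:

    in_bounds = index + (num_components - 1) < buffer_size;
    // stores/atomics: OpBranchConditional + OpPhi around the operation
    result = in_bounds ? op() : zero_of(result_type);
    // loads: a plain OpSelect on the already-loaded value
    result = OpSelect(in_bounds, loaded_value, zero_of(result_type));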
--- .../backend/spirv/emit_spirv_atomic.cpp | 63 ++++++---- .../backend/spirv/emit_spirv_bounds.h | 66 ++++++++-- .../spirv/emit_spirv_context_get_set.cpp | 118 +++++++++--------- .../backend/spirv/emit_spirv_convert.cpp | 8 ++ .../backend/spirv/emit_spirv_instructions.h | 13 +- .../frontend/translate/data_share.cpp | 76 +++++------ .../frontend/translate/vector_memory.cpp | 4 +- src/shader_recompiler/ir/ir_emitter.cpp | 64 +++++++--- src/shader_recompiler/ir/ir_emitter.h | 26 ++-- src/shader_recompiler/ir/microinstruction.cpp | 11 +- src/shader_recompiler/ir/opcodes.inc | 25 ++-- .../ir/passes/hull_shader_transform.cpp | 12 +- .../ir/passes/lower_buffer_format_to_raw.cpp | 16 +-- .../ir/passes/resource_tracking_pass.cpp | 9 ++ .../ir/passes/shared_memory_barrier_pass.cpp | 6 +- .../passes/shared_memory_to_storage_pass.cpp | 100 ++++++++++----- src/shader_recompiler/ir/value.h | 1 + 17 files changed, 391 insertions(+), 227 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index 13fd8e180..47290e7e8 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -27,6 +27,19 @@ Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value, }); } +Id SharedAtomicU32IncDec(EmitContext& ctx, Id offset, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { + const Id shift_id{ctx.ConstU32(2U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); + }); +} + Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const Id shift_id{ctx.ConstU32(3U)}; @@ -40,19 +53,6 @@ Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, }); } -Id SharedAtomicU32_IncDec(EmitContext& ctx, Id offset, - Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { - const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; - const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; - const Id pointer{ - ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; - const auto [scope, semantics]{AtomicArgs(ctx)}; - return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { - return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); - }); -} - Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& buffer = ctx.buffers[handle]; @@ -68,6 +68,21 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id }); } +Id BufferAtomicU32IncDec(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { + const auto& buffer = ctx.buffers[handle]; + if (Sirit::ValidId(buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + } + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); + const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; + const 
Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { + return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics); + }); +} + Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id cmp_value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id, Id, Id)) { @@ -156,12 +171,12 @@ Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicISub); } -Id EmitSharedAtomicIIncrement32(EmitContext& ctx, Id offset) { - return SharedAtomicU32_IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); +Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset) { + return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); } -Id EmitSharedAtomicIDecrement32(EmitContext& ctx, Id offset) { - return SharedAtomicU32_IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); +Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset) { + return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); } Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -172,6 +187,10 @@ Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); } +Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicISub); +} + Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin); } @@ -188,14 +207,12 @@ Id EmitBufferAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicUMax); } -Id EmitBufferAtomicInc32(EmitContext&, IR::Inst*, u32, Id, Id) { - // TODO - UNREACHABLE_MSG("Unsupported BUFFER_ATOMIC opcode: ", IR::Opcode::BufferAtomicInc32); +Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return BufferAtomicU32IncDec(ctx, inst, handle, address, &Sirit::Module::OpAtomicIIncrement); } -Id EmitBufferAtomicDec32(EmitContext&, IR::Inst*, u32, Id, Id) { - // TODO - UNREACHABLE_MSG("Unsupported BUFFER_ATOMIC opcode: ", IR::Opcode::BufferAtomicDec32); +Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + return BufferAtomicU32IncDec(ctx, inst, handle, address, &Sirit::Module::OpAtomicIDecrement); } Id EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h index 41e70c8c3..e66467c6b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h @@ -1,31 +1,54 @@ // SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#pragma once + #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { -template -auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - Id zero_value{}; +template 
+std::tuple ResolveTypeAndZero(EmitContext& ctx) { Id result_type{}; - if constexpr (bit_size == 64) { - zero_value = ctx.u64_zero_value; + Id zero_value{}; + if constexpr (bit_size == 64 && num_components == 1 && !is_float) { result_type = ctx.U64; + zero_value = ctx.u64_zero_value; } else if constexpr (bit_size == 32) { - zero_value = ctx.u32_zero_value; - result_type = ctx.U32[1]; - } else if constexpr (bit_size == 16) { - zero_value = ctx.u16_zero_value; + if (is_float) { + result_type = ctx.F32[num_components]; + zero_value = ctx.f32_zero_value; + } else { + result_type = ctx.U32[num_components]; + zero_value = ctx.u32_zero_value; + } + } else if constexpr (bit_size == 16 && num_components == 1 && !is_float) { result_type = ctx.U16; + zero_value = ctx.u16_zero_value; + } else if constexpr (bit_size == 8 && num_components == 1 && !is_float) { + result_type = ctx.U8; + zero_value = ctx.u8_zero_value; } else { - static_assert(false, "type not supported"); + static_assert(false, "Type not supported."); } + if (num_components > 1) { + std::array zero_ids; + zero_ids.fill(zero_value); + zero_value = ctx.ConstantComposite(result_type, zero_ids); + } + return {result_type, zero_value}; +} + +template +auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { if (Sirit::ValidId(buffer_size)) { // Bounds checking enabled, wrap in a conditional branch to make sure that // the atomic is not mistakenly executed when the index is out of bounds. - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size); + auto compare_index = index; + if (num_components > 1) { + compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1)); + } + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); const Id ib_label = ctx.OpLabel(); const Id end_label = ctx.OpLabel(); ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); @@ -36,6 +59,8 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun ctx.OpBranch(end_label); ctx.AddLabel(end_label); if (Sirit::ValidId(ib_result)) { + const auto [result_type, zero_value] = + ResolveTypeAndZero(ctx); return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, last_label); } else { return Id{0}; @@ -45,4 +70,21 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun return emit_func(); } +template +static Id LoadAccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result) { + if (Sirit::ValidId(buffer_size)) { + // Bounds checking enabled, wrap in a select. + auto compare_index = index; + if (num_components > 1) { + compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1)); + } + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); + const auto [result_type, zero_value] = + ResolveTypeAndZero(ctx); + return ctx.OpSelect(result_type, in_bounds, result, zero_value); + } + // Bounds checking not enabled, just return the plain value. 
+ return result; +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 658d4759f..ccbe54d0a 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -11,6 +11,8 @@ #include +#include "emit_spirv_bounds.h" + namespace Shader::Backend::SPIRV { namespace { @@ -239,8 +241,8 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { } if (IR::IsParam(attr)) { - const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; - const auto& param{ctx.input_params.at(index)}; + const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)}; + const auto& param{ctx.input_params.at(param_index)}; if (param.buffer_handle >= 0) { const auto step_rate = EmitReadStepRate(ctx, param.id.value); const auto offset = ctx.OpIAdd( @@ -415,27 +417,6 @@ void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value) { ctx.OpStore(pointer, value); } -template -static Id EmitLoadBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result, - bool is_float) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a select. - const auto result_type = is_float ? ctx.F32[N] : ctx.U32[N]; - auto compare_index = index; - auto zero_value = is_float ? ctx.f32_zero_value : ctx.u32_zero_value; - if (N > 1) { - compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1)); - std::array zero_ids; - zero_ids.fill(zero_value); - zero_value = ctx.ConstantComposite(result_type, zero_ids); - } - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); - return ctx.OpSelect(result_type, in_bounds, result, zero_value); - } - // Bounds checking not enabled, just return the plain value. - return result; -} - template static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto flags = inst->Flags(); @@ -454,8 +435,9 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a const Id result_i = ctx.OpLoad(data_types[1], ptr_i); if (!flags.typed) { // Untyped loads have bounds checking per-component. - ids.push_back(EmitLoadBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords, - result_i, alias == PointerType::F32)); + ids.push_back(LoadAccessBoundsCheck < 32, 1, + alias == + PointerType::F32 > (ctx, index_i, spv_buffer.size_dwords, result_i)); } else { ids.push_back(result_i); } @@ -464,8 +446,8 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a const Id result = N == 1 ? ids[0] : ctx.OpCompositeConstruct(data_types[N], ids); if (flags.typed) { // Typed loads have single bounds check for the whole load. 
- return EmitLoadBufferBoundsCheck(ctx, index, spv_buffer.size_dwords, result, - alias == PointerType::F32); + return LoadAccessBoundsCheck < 32, N, + alias == PointerType::F32 > (ctx, index, spv_buffer.size_dwords, result); } return result; } @@ -477,8 +459,8 @@ Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { } const auto [id, pointer_type] = spv_buffer[PointerType::U8]; const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; - const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr))}; - return EmitLoadBufferBoundsCheck<1>(ctx, address, spv_buffer.size, result, false); + const Id result{ctx.OpLoad(ctx.U8, ptr)}; + return LoadAccessBoundsCheck<8>(ctx, address, spv_buffer.size, result); } Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -489,8 +471,8 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto [id, pointer_type] = spv_buffer[PointerType::U16]; const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr))}; - return EmitLoadBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts, result, false); + const Id result{ctx.OpLoad(ctx.U16, ptr)}; + return LoadAccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, result); } Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { @@ -509,6 +491,18 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) return EmitLoadBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address); } +Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { + const auto& spv_buffer = ctx.buffers[handle]; + if (Sirit::ValidId(spv_buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + } + const auto [id, pointer_type] = spv_buffer[PointerType::U64]; + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; + const Id result{ctx.OpLoad(ctx.U64, ptr)}; + return LoadAccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, result); +} + Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { return EmitLoadBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address); } @@ -529,29 +523,6 @@ Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addr UNREACHABLE_MSG("SPIR-V instruction"); } -template -void EmitStoreBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a conditional branch. - auto compare_index = index; - if (N > 1) { - compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1)); - } - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size); - const Id in_bounds_label = ctx.OpLabel(); - const Id merge_label = ctx.OpLabel(); - ctx.OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone); - ctx.OpBranchConditional(in_bounds, in_bounds_label, merge_label); - ctx.AddLabel(in_bounds_label); - emit_func(); - ctx.OpBranch(merge_label); - ctx.AddLabel(merge_label); - return; - } - // Bounds checking not enabled, just perform the store. 
- emit_func(); -} - template static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -569,19 +540,25 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I const Id index_i = i == 0 ? index : ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i)); const Id ptr_i = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i); const Id value_i = N == 1 ? value : ctx.OpCompositeExtract(data_types[1], value, i); - auto store_i = [&]() { ctx.OpStore(ptr_i, value_i); }; + auto store_i = [&] { + ctx.OpStore(ptr_i, value_i); + return Id{}; + }; if (!flags.typed) { // Untyped stores have bounds checking per-component. - EmitStoreBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords, store_i); + AccessBoundsCheck<32, 1, alias == PointerType::F32>( + ctx, index_i, spv_buffer.size_dwords, store_i); } else { store_i(); } } + return Id{}; }; if (flags.typed) { // Typed stores have single bounds check for the whole store. - EmitStoreBufferBoundsCheck(ctx, index, spv_buffer.size_dwords, store); + AccessBoundsCheck<32, N, alias == PointerType::F32>(ctx, index, spv_buffer.size_dwords, + store); } else { store(); } @@ -594,8 +571,10 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v } const auto [id, pointer_type] = spv_buffer[PointerType::U8]; const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)}; - const Id result{ctx.OpUConvert(ctx.U8, value)}; - EmitStoreBufferBoundsCheck<1>(ctx, address, spv_buffer.size, [&] { ctx.OpStore(ptr, result); }); + AccessBoundsCheck<8>(ctx, address, spv_buffer.size, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); } void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { @@ -606,9 +585,10 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id const auto [id, pointer_type] = spv_buffer[PointerType::U16]; const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u)); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpUConvert(ctx.U16, value)}; - EmitStoreBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts, - [&] { ctx.OpStore(ptr, result); }); + AccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); } void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { @@ -627,6 +607,20 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre EmitStoreBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address, value); } +void EmitStoreBufferU64(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) { + const auto& spv_buffer = ctx.buffers[handle]; + if (Sirit::ValidId(spv_buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset); + } + const auto [id, pointer_type] = spv_buffer[PointerType::U64]; + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)}; + AccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, [&] { + ctx.OpStore(ptr, value); + return Id{}; + }); +} + void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { EmitStoreBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address, value); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp 
b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp index 945fa6877..c75f43393 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp @@ -263,4 +263,12 @@ Id EmitConvertU32U16(EmitContext& ctx, Id value) { return ctx.OpUConvert(ctx.U32[1], value); } +Id EmitConvertU8U32(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U8, value); +} + +Id EmitConvertU32U8(EmitContext& ctx, Id value) { + return ctx.OpUConvert(ctx.U32[1], value); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 3441c5a23..daf1b973e 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -69,6 +69,7 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); @@ -80,6 +81,7 @@ void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +void EmitStoreBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -87,12 +89,13 @@ void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); -Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); -Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); +Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id 
EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -136,8 +139,8 @@ Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value); -Id EmitSharedAtomicIIncrement32(EmitContext& ctx, Id offset); -Id EmitSharedAtomicIDecrement32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset); Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value); Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); @@ -461,6 +464,8 @@ Id EmitConvertF64U32(EmitContext& ctx, Id value); Id EmitConvertF64U64(EmitContext& ctx, Id value); Id EmitConvertU16U32(EmitContext& ctx, Id value); Id EmitConvertU32U16(EmitContext& ctx, Id value); +Id EmitConvertU8U32(EmitContext& ctx, Id value); +Id EmitConvertU32U8(EmitContext& ctx, Id value); Id EmitImageSampleRaw(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address1, Id address2, Id address3, Id address4); diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 4b6a58fd0..8ead93f78 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -216,34 +216,38 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid if (is_pair) { const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 
64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); - if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data0), addr0); - } else { + if (bit_size == 64) { ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1))), addr0); + } else if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + } else if (bit_size == 16) { + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); - if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data1), addr1); - } else { + if (bit_size == 64) { ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1))), addr1); + } else if (bit_size == 32) { + ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + } else if (bit_size == 16) { + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1); } - } else if (bit_size == 64) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = - ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); - } else if (bit_size == 16) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + if (bit_size == 64) { + const IR::Value data = + ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); + ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); + } else if (bit_size == 32) { + ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + } else if (bit_size == 16) { + ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); + } } } @@ -264,7 +268,7 @@ void Translator::DS_INC_U32(const GcnInst& inst, bool rtn) { const IR::U32 offset = ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIIncrement(addr_offset); + const IR::Value original_val = ir.SharedAtomicInc(addr_offset); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); } @@ -275,7 +279,7 @@ void Translator::DS_DEC_U32(const GcnInst& inst, bool rtn) { const IR::U32 offset = ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIDecrement(addr_offset); + const IR::Value original_val = ir.SharedAtomicDec(addr_offset); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); } @@ -309,36 +313,38 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 
64 : 1); const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data0}); - } else { + if (bit_size == 64) { const auto vector = ir.UnpackUint2x32(IR::U64{data0}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data0}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); - if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data1}); - } else { + if (bit_size == 64) { const auto vector = ir.UnpackUint2x32(IR::U64{data1}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data1}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); } - } else if (bit_size == 64) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - const auto vector = ir.UnpackUint2x32(IR::U64{data}); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 16) { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::U16 data = IR::U16{ir.LoadShared(bit_size, is_signed, addr0)}; - ir.SetVectorReg(dst_reg, ir.UConvert(32, data)); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)}; - ir.SetVectorReg(dst_reg, data); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data}); + ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg, IR::U32{data}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); + } } } diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 5eb2079a4..54e8b8ee8 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -354,9 +354,9 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) { case AtomicOp::Xor: return ir.BufferAtomicXor(handle, address, vdata_val, buffer_info); case AtomicOp::Inc: - return ir.BufferAtomicInc(handle, address, vdata_val, buffer_info); + return ir.BufferAtomicInc(handle, address, buffer_info); case AtomicOp::Dec: - return ir.BufferAtomicDec(handle, address, vdata_val, buffer_info); + return ir.BufferAtomicDec(handle, address, buffer_info); default: UNREACHABLE(); } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 2c37c8099..3d7cf71dc 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -353,12 +353,12 @@ U32 
IREmitter::SharedAtomicXor(const U32& address, const U32& data) { return Inst(Opcode::SharedAtomicXor32, address, data); } -U32 IREmitter::SharedAtomicIIncrement(const U32& address) { - return Inst(Opcode::SharedAtomicIIncrement32, address); +U32 IREmitter::SharedAtomicInc(const U32& address) { + return Inst(Opcode::SharedAtomicInc32, address); } -U32 IREmitter::SharedAtomicIDecrement(const U32& address) { - return Inst(Opcode::SharedAtomicIDecrement32, address); +U32 IREmitter::SharedAtomicDec(const U32& address) { + return Inst(Opcode::SharedAtomicDec32, address); } U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) { @@ -373,12 +373,12 @@ U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) { return Inst(Opcode::ReadConstBuffer, handle, index); } -U32 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) { - return Inst(Opcode::LoadBufferU8, Flags{info}, handle, address); +U8 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU8, Flags{info}, handle, address); } -U32 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) { - return Inst(Opcode::LoadBufferU16, Flags{info}, handle, address); +U16 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU16, Flags{info}, handle, address); } Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& address, @@ -397,6 +397,10 @@ Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& } } +U64 IREmitter::LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::LoadBufferU64, Flags{info}, handle, address); +} + Value IREmitter::LoadBufferF32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info) { switch (num_dwords) { @@ -417,12 +421,12 @@ Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, Buf return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address); } -void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U32& data, +void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U8& data, BufferInstInfo info) { Inst(Opcode::StoreBufferU8, Flags{info}, handle, address, data); } -void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U32& data, +void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U16& data, BufferInstInfo info) { Inst(Opcode::StoreBufferU16, Flags{info}, handle, address, data); } @@ -447,6 +451,11 @@ void IREmitter::StoreBufferU32(int num_dwords, const Value& handle, const Value& } } +void IREmitter::StoreBufferU64(const Value& handle, const Value& address, const U64& data, + BufferInstInfo info) { + Inst(Opcode::StoreBufferU64, Flags{info}, handle, address, data); +} + void IREmitter::StoreBufferF32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info) { switch (num_dwords) { @@ -474,7 +483,19 @@ void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, con Value IREmitter::BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info) { - return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value); + switch (value.Type()) { + case Type::U32: + return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value); + 
case Type::U64: + return Inst(Opcode::BufferAtomicIAdd64, Flags{info}, handle, address, value); + default: + ThrowInvalidType(value.Type()); + } +} + +Value IREmitter::BufferAtomicISub(const Value& handle, const Value& address, const Value& value, + BufferInstInfo info) { + return Inst(Opcode::BufferAtomicISub32, Flags{info}, handle, address, value); } Value IREmitter::BufferAtomicIMin(const Value& handle, const Value& address, const Value& value, @@ -489,14 +510,12 @@ Value IREmitter::BufferAtomicIMax(const Value& handle, const Value& address, con : Inst(Opcode::BufferAtomicUMax32, Flags{info}, handle, address, value); } -Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, const Value& value, - BufferInstInfo info) { - return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address, value); +Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address); } -Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, const Value& value, - BufferInstInfo info) { - return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address, value); +Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, BufferInstInfo info) { + return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address); } Value IREmitter::BufferAtomicAnd(const Value& handle, const Value& address, const Value& value, @@ -1804,8 +1823,15 @@ F32F64 IREmitter::ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_s : ConvertUToF(dest_bitsize, src_bitsize, value); } -U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { +U8U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U8U16U32U64& value) { switch (result_bitsize) { + case 8: + switch (value.Type()) { + case Type::U32: + return Inst(Opcode::ConvertU8U32, value); + default: + break; + } case 16: switch (value.Type()) { case Type::U32: @@ -1815,6 +1841,8 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) { } case 32: switch (value.Type()) { + case Type::U8: + return Inst(Opcode::ConvertU32U8, value); case Type::U16: return Inst(Opcode::ConvertU32U16, value); default: diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index eae44ed04..215a35ee9 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -100,33 +100,35 @@ public: void WriteShared(int bit_size, const Value& value, const U32& offset); [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data); + [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data); [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed); [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed); + [[nodiscard]] U32 SharedAtomicInc(const U32& address); + [[nodiscard]] U32 SharedAtomicDec(const U32& address); [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data); [[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data); [[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicIIncrement(const U32& address); - [[nodiscard]] U32 SharedAtomicIDecrement(const U32& address); - [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data); - [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] U32 ReadConstBuffer(const Value& 
handle, const U32& index); - [[nodiscard]] U32 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info); - [[nodiscard]] U32 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U8 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U16 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferU32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); + [[nodiscard]] U64 LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferF32(int num_dwords, const Value& handle, const Value& address, BufferInstInfo info); [[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address, BufferInstInfo info); - void StoreBufferU8(const Value& handle, const Value& address, const U32& data, + void StoreBufferU8(const Value& handle, const Value& address, const U8& data, BufferInstInfo info); - void StoreBufferU16(const Value& handle, const Value& address, const U32& data, + void StoreBufferU16(const Value& handle, const Value& address, const U16& data, BufferInstInfo info); void StoreBufferU32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); + void StoreBufferU64(const Value& handle, const Value& address, const U64& data, + BufferInstInfo info); void StoreBufferF32(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); void StoreBufferFormat(const Value& handle, const Value& address, const Value& data, @@ -134,14 +136,16 @@ public: [[nodiscard]] Value BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); + [[nodiscard]] Value BufferAtomicISub(const Value& handle, const Value& address, + const Value& value, BufferInstInfo info); [[nodiscard]] Value BufferAtomicIMin(const Value& handle, const Value& address, const Value& value, bool is_signed, BufferInstInfo info); [[nodiscard]] Value BufferAtomicIMax(const Value& handle, const Value& address, const Value& value, bool is_signed, BufferInstInfo info); [[nodiscard]] Value BufferAtomicInc(const Value& handle, const Value& address, - const Value& value, BufferInstInfo info); + BufferInstInfo info); [[nodiscard]] Value BufferAtomicDec(const Value& handle, const Value& address, - const Value& value, BufferInstInfo info); + BufferInstInfo info); [[nodiscard]] Value BufferAtomicAnd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); [[nodiscard]] Value BufferAtomicOr(const Value& handle, const Value& address, @@ -309,7 +313,7 @@ public: [[nodiscard]] F32F64 ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_signed, const Value& value); - [[nodiscard]] U16U32U64 UConvert(size_t result_bitsize, const U16U32U64& value); + [[nodiscard]] U8U16U32U64 UConvert(size_t result_bitsize, const U8U16U32U64& value); [[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value); [[nodiscard]] Value ImageAtomicIAdd(const Value& handle, const Value& coords, diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index a57310fb9..c2311afea 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -60,12 +60,15 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferU32x2: case Opcode::StoreBufferU32x3: case 
Opcode::StoreBufferU32x4: + case Opcode::StoreBufferU64: case Opcode::StoreBufferF32: case Opcode::StoreBufferF32x2: case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: case Opcode::StoreBufferFormatF32: case Opcode::BufferAtomicIAdd32: + case Opcode::BufferAtomicIAdd64: + case Opcode::BufferAtomicISub32: case Opcode::BufferAtomicSMin32: case Opcode::BufferAtomicUMin32: case Opcode::BufferAtomicSMax32: @@ -76,15 +79,21 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::BufferAtomicOr32: case Opcode::BufferAtomicXor32: case Opcode::BufferAtomicSwap32: + case Opcode::BufferAtomicCmpSwap32: case Opcode::DataAppend: case Opcode::DataConsume: - case Opcode::WriteSharedU64: + case Opcode::WriteSharedU16: case Opcode::WriteSharedU32: + case Opcode::WriteSharedU64: case Opcode::SharedAtomicIAdd32: + case Opcode::SharedAtomicIAdd64: + case Opcode::SharedAtomicISub32: case Opcode::SharedAtomicSMin32: case Opcode::SharedAtomicUMin32: case Opcode::SharedAtomicSMax32: case Opcode::SharedAtomicUMax32: + case Opcode::SharedAtomicInc32: + case Opcode::SharedAtomicDec32: case Opcode::SharedAtomicAnd32: case Opcode::SharedAtomicOr32: case Opcode::SharedAtomicXor32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index e96e32297..1621d2acf 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -35,21 +35,21 @@ OPCODE(LoadSharedU32, U32, U32, OPCODE(LoadSharedU64, U64, U32, ) OPCODE(WriteSharedU16, Void, U32, U16, ) OPCODE(WriteSharedU32, Void, U32, U32, ) -OPCODE(WriteSharedU64, Void, U32, U64, ) +OPCODE(WriteSharedU64, Void, U32, U64, ) // Shared atomic operations OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) +OPCODE(SharedAtomicISub32, U32, U32, U32, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) OPCODE(SharedAtomicUMax32, U32, U32, U32, ) +OPCODE(SharedAtomicInc32, U32, U32, ) +OPCODE(SharedAtomicDec32, U32, U32, ) OPCODE(SharedAtomicAnd32, U32, U32, U32, ) OPCODE(SharedAtomicOr32, U32, U32, U32, ) OPCODE(SharedAtomicXor32, U32, U32, U32, ) -OPCODE(SharedAtomicISub32, U32, U32, U32, ) -OPCODE(SharedAtomicIIncrement32, U32, U32, ) -OPCODE(SharedAtomicIDecrement32, U32, U32, ) // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) @@ -94,23 +94,25 @@ OPCODE(UndefU32, U32, OPCODE(UndefU64, U64, ) // Buffer operations -OPCODE(LoadBufferU8, U32, Opaque, Opaque, ) -OPCODE(LoadBufferU16, U32, Opaque, Opaque, ) +OPCODE(LoadBufferU8, U8, Opaque, Opaque, ) +OPCODE(LoadBufferU16, U16, Opaque, Opaque, ) OPCODE(LoadBufferU32, U32, Opaque, Opaque, ) OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, ) OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, ) OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, ) +OPCODE(LoadBufferU64, U64, Opaque, Opaque, ) OPCODE(LoadBufferF32, F32, Opaque, Opaque, ) OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, ) OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, ) OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, ) OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, ) -OPCODE(StoreBufferU8, Void, Opaque, Opaque, U32, ) -OPCODE(StoreBufferU16, Void, Opaque, Opaque, U32, ) +OPCODE(StoreBufferU8, Void, Opaque, Opaque, U8, ) +OPCODE(StoreBufferU16, Void, Opaque, Opaque, U16, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, ) OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, ) OPCODE(StoreBufferU32x4, Void, 
Opaque, Opaque, U32x4, ) +OPCODE(StoreBufferU64, Void, Opaque, Opaque, U64, ) OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, ) OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) @@ -120,12 +122,13 @@ OPCODE(StoreBufferFormatF32, Void, Opaq // Buffer atomic operations OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 ) +OPCODE(BufferAtomicISub32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMax32, U32, Opaque, Opaque, U32 ) -OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, U32, ) -OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, U32, ) +OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, ) +OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, ) OPCODE(BufferAtomicAnd32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicOr32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicXor32, U32, Opaque, Opaque, U32, ) @@ -405,6 +408,8 @@ OPCODE(ConvertF64U32, F64, U32, OPCODE(ConvertF32U16, F32, U16, ) OPCODE(ConvertU16U32, U16, U32, ) OPCODE(ConvertU32U16, U32, U16, ) +OPCODE(ConvertU8U32, U8, U32, ) +OPCODE(ConvertU32U8, U32, U8, ) // Image operations OPCODE(ImageSampleRaw, F32x4, Opaque, F32x4, F32x4, F32x4, F32, ) diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp index 5cf8a1525..156cb6628 100644 --- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp +++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp @@ -438,7 +438,9 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; const IR::U32 addr{inst.Arg(0)}; - const IR::U32 data{inst.Arg(1).Resolve()}; + const IR::Value data = num_dwords == 2 + ? 
ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) + : inst.Arg(1).Resolve(); const auto SetOutput = [&](IR::U32 addr, IR::U32 value, AttributeRegion output_kind, u32 off_dw) { @@ -466,10 +468,10 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info); if (num_dwords == 1) { - SetOutput(addr, data, region, 0); + SetOutput(addr, IR::U32{data}, region, 0); } else { for (auto i = 0; i < num_dwords; i++) { - SetOutput(addr, IR::U32{data.Inst()->Arg(i)}, region, i); + SetOutput(addr, IR::U32{ir.CompositeExtract(data, i)}, region, i); } } inst.Invalidate(); @@ -499,7 +501,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { ReadTessControlPointAttribute(addr, stride, ir, i, is_tcs_output_read); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; @@ -578,7 +580,7 @@ void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { const IR::F32 component = GetInput(addr, i); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; diff --git a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp index fcb86e3fb..bb36e2748 100644 --- a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp +++ b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp @@ -34,13 +34,13 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con interpreted = ir.Imm32(0.f); break; case AmdGpu::DataFormat::Format8: { - const auto unpacked = - ir.Unpack4x8(format_info.num_format, ir.LoadBufferU8(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU8(handle, address, info)); + const auto unpacked = ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } case AmdGpu::DataFormat::Format8_8: { - const auto raw = ir.LoadBufferU16(handle, address, info); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); const auto unpacked = ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0), ir.CompositeExtract(unpacked, 1)); @@ -51,8 +51,8 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con IR::U32{ir.LoadBufferU32(1, handle, address, info)}); break; case AmdGpu::DataFormat::Format16: { - const auto unpacked = - ir.Unpack2x16(format_info.num_format, ir.LoadBufferU16(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); + const auto unpacked = ir.Unpack2x16(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } @@ -126,7 +126,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I const auto packed = ir.Pack4x8(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU8(handle, address, packed, info); + ir.StoreBufferU8(handle, address, ir.UConvert(8, packed), info); break; } case AmdGpu::DataFormat::Format8_8: { @@ -134,7 +134,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value 
handle, const I ir.CompositeConstruct(ir.CompositeExtract(real_value, 0), ir.CompositeExtract(real_value, 1), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format8_8_8_8: { @@ -145,7 +145,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I case AmdGpu::DataFormat::Format16: { const auto packed = ir.Pack2x16(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format16_16: { diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 18c77e600..ba96d1034 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -17,6 +17,8 @@ using SharpLocation = u32; bool IsBufferAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::BufferAtomicIAdd32: + case IR::Opcode::BufferAtomicIAdd64: + case IR::Opcode::BufferAtomicISub32: case IR::Opcode::BufferAtomicSMin32: case IR::Opcode::BufferAtomicUMin32: case IR::Opcode::BufferAtomicSMax32: @@ -27,6 +29,7 @@ bool IsBufferAtomic(const IR::Inst& inst) { case IR::Opcode::BufferAtomicOr32: case IR::Opcode::BufferAtomicXor32: case IR::Opcode::BufferAtomicSwap32: + case IR::Opcode::BufferAtomicCmpSwap32: return true; default: return false; @@ -41,6 +44,7 @@ bool IsBufferStore(const IR::Inst& inst) { case IR::Opcode::StoreBufferU32x2: case IR::Opcode::StoreBufferU32x3: case IR::Opcode::StoreBufferU32x4: + case IR::Opcode::StoreBufferU64: case IR::Opcode::StoreBufferF32: case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: @@ -60,6 +64,7 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::LoadBufferU32x2: case IR::Opcode::LoadBufferU32x3: case IR::Opcode::LoadBufferU32x4: + case IR::Opcode::LoadBufferU64: case IR::Opcode::LoadBufferF32: case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: @@ -85,6 +90,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { case IR::Opcode::LoadBufferU16: case IR::Opcode::StoreBufferU16: return IR::Type::U16; + case IR::Opcode::LoadBufferU64: + case IR::Opcode::StoreBufferU64: + case IR::Opcode::BufferAtomicIAdd64: + return IR::Type::U64; case IR::Opcode::LoadBufferFormatF32: case IR::Opcode::StoreBufferFormatF32: // Formatted buffer loads can use a variety of types. 
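A minimal sketch of the calling convention the typed buffer opcodes above introduce, assuming `ir`, `handle`, `address`, `info`, and `packed` stand in for the values already in scope inside the passes shown (illustrative C++ only, not a line from the patches):

    // LoadBufferU8 now yields a typed IR::U8, so widening to 32 bits is an
    // explicit UConvert instead of an implicit U32 result.
    const IR::U32 raw = ir.UConvert(32, ir.LoadBufferU8(handle, address, info));
    // Stores mirror this: narrow the packed U32 explicitly before the store.
    ir.StoreBufferU8(handle, address, ir.UConvert(8, packed), info);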
diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp index baf6ad0d1..10d6a285c 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp @@ -9,12 +9,14 @@ namespace Shader::Optimization { static bool IsLoadShared(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::LoadSharedU32 || + return inst.GetOpcode() == IR::Opcode::LoadSharedU16 || + inst.GetOpcode() == IR::Opcode::LoadSharedU32 || inst.GetOpcode() == IR::Opcode::LoadSharedU64; } static bool IsWriteShared(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::WriteSharedU32 || + return inst.GetOpcode() == IR::Opcode::WriteSharedU16 || + inst.GetOpcode() == IR::Opcode::WriteSharedU32 || inst.GetOpcode() == IR::Opcode::WriteSharedU64; } diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index 12d4d0659..839a8ddc5 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -10,18 +10,23 @@ namespace Shader::Optimization { static bool IsSharedAccess(const IR::Inst& inst) { const auto opcode = inst.GetOpcode(); switch (opcode) { + case IR::Opcode::LoadSharedU16: case IR::Opcode::LoadSharedU32: case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: - case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicIAdd32: case IR::Opcode::SharedAtomicIAdd64: - case IR::Opcode::SharedAtomicOr32: - case IR::Opcode::SharedAtomicSMax32: - case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicISub32: case IR::Opcode::SharedAtomicSMin32: case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicInc32: + case IR::Opcode::SharedAtomicDec32: + case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicXor32: return true; default: @@ -41,14 +46,8 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ profile.supports_workgroup_explicit_memory_layout)) { return; } - // Add buffer binding for shared memory storage buffer. 
     const u32 binding = static_cast<u32>(program.info.buffers.size());
-    program.info.buffers.push_back({
-        .used_types = IR::Type::U32,
-        .inline_cbuf = AmdGpu::Buffer::Null(),
-        .buffer_type = BufferType::SharedMemory,
-        .is_written = true,
-    });
+    IR::Type used_types{};
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (!IsSharedAccess(inst)) {
@@ -56,73 +55,106 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
             }
             IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
             const IR::U32 handle = ir.Imm32(binding);
+            const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
+                                           ir.Imm32(shared_memory_size));
+            const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
             // Replace shared atomics first
             switch (inst.GetOpcode()) {
-            case IR::Opcode::SharedAtomicAnd32:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {}));
-                continue;
             case IR::Opcode::SharedAtomicIAdd32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
+                used_types |= IR::Type::U32;
+                continue;
             case IR::Opcode::SharedAtomicIAdd64:
                 inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {}));
+                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
+                used_types |= IR::Type::U64;
                 continue;
-            case IR::Opcode::SharedAtomicOr32:
+            case IR::Opcode::SharedAtomicISub32:
                 inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicOr(handle, inst.Arg(0), inst.Arg(1), {}));
+                    ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
+                used_types |= IR::Type::U32;
                 continue;
-            case IR::Opcode::SharedAtomicSMax32:
-            case IR::Opcode::SharedAtomicUMax32: {
-                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIMax(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
-                continue;
-            }
             case IR::Opcode::SharedAtomicSMin32:
             case IR::Opcode::SharedAtomicUMin32: {
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
                 inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIMin(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
+                    ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
+                used_types |= IR::Type::U32;
                 continue;
             }
-            case IR::Opcode::SharedAtomicXor32:
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
                 inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicXor(handle, inst.Arg(0), inst.Arg(1), {}));
+                    ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
+                used_types |= IR::Type::U32;
+                continue;
+            }
+            case IR::Opcode::SharedAtomicInc32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {}));
+                used_types |= IR::Type::U32;
+                continue;
+            case IR::Opcode::SharedAtomicDec32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
+                used_types |= IR::Type::U32;
+                continue;
+            case IR::Opcode::SharedAtomicAnd32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
+                used_types |= IR::Type::U32;
+                continue;
+            case IR::Opcode::SharedAtomicOr32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
+                used_types |= IR::Type::U32;
+                continue;
+            case IR::Opcode::SharedAtomicXor32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
+                used_types |= IR::Type::U32;
                 continue;
             default:
                 break;
             }
             // Replace shared operations.
-            const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
-                                           ir.Imm32(shared_memory_size));
-            const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
             switch (inst.GetOpcode()) {
             case IR::Opcode::LoadSharedU16:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
+                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::LoadSharedU32:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
+                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::LoadSharedU64:
-                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {}));
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {}));
+                used_types |= IR::Type::U64;
                 break;
             case IR::Opcode::WriteSharedU16:
-                ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {});
+                ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {});
                 inst.Invalidate();
+                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::WriteSharedU32:
                 ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
                 inst.Invalidate();
+                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::WriteSharedU64:
-                ir.StoreBufferU32(2, handle, address, inst.Arg(1), {});
+                ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {});
                 inst.Invalidate();
+                used_types |= IR::Type::U64;
                 break;
             default:
                 break;
             }
         }
     }
+    // Add buffer binding for shared memory storage buffer.
+    program.info.buffers.push_back({
+        .used_types = used_types,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
 }
 
 } // namespace Shader::Optimization
diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h
index ed1e5536a..b92c5d555 100644
--- a/src/shader_recompiler/ir/value.h
+++ b/src/shader_recompiler/ir/value.h
@@ -265,6 +265,7 @@ using U32F32 = TypedValue<Type::U32 | Type::F32>;
 using U64F64 = TypedValue<Type::U64 | Type::F64>;
 using U32U64 = TypedValue<Type::U32 | Type::U64>;
 using U16U32U64 = TypedValue<Type::U16 | Type::U32 | Type::U64>;
+using U8U16U32U64 = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>;
 using F32F64 = TypedValue<Type::F32 | Type::F64>;
 using F16F32F64 = TypedValue<Type::F16 | Type::F32 | Type::F64>;
 using UAny = TypedValue<Type::U1 | Type::U8 | Type::U16 | Type::U32 | Type::U64>;

From fc4fd0107d5bde21e2a4fbdff6b502e6b39e9b7f Mon Sep 17 00:00:00 2001
From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com>
Date: Tue, 10 Jun 2025 17:43:11 -0500
Subject: [PATCH 08/14] libSceNpTrophy: Change initial context and handle
 values (#3080)

* Change default context and handle values

libSceNpToolkit internally uses context/handle values of zero to indicate
NpTrophy calls failed. This PR returns handle/context as index + 1 instead,
avoiding this issue.

* Fix log message
---
 src/core/libraries/np_trophy/np_trophy.cpp | 30 +++++++++++++---------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/core/libraries/np_trophy/np_trophy.cpp b/src/core/libraries/np_trophy/np_trophy.cpp
index 6de84bd93..e3c5ce35e 100644
--- a/src/core/libraries/np_trophy/np_trophy.cpp
+++ b/src/core/libraries/np_trophy/np_trophy.cpp
@@ -164,10 +164,12 @@ s32 PS4_SYSV_ABI sceNpTrophyCreateContext(OrbisNpTrophyContext* context, int32_t
     }
 
     const auto ctx_id = trophy_contexts.insert(user_id, service_label);
-    contexts_internal[key].context_id = ctx_id.index;
-    LOG_INFO(Lib_NpTrophy, "New context = {}, user_id = {} service label = {}", ctx_id.index,
-             user_id, service_label);
-    *context = ctx_id.index;
+
+    *context = ctx_id.index + 1;
+    contexts_internal[key].context_id = *context;
+    LOG_INFO(Lib_NpTrophy, "New context = {}, user_id = {} service label = {}", *context, user_id,
+             service_label);
+
     return ORBIS_OK;
 }
 
@@ -179,21 +181,23 @@ s32 PS4_SYSV_ABI sceNpTrophyCreateHandle(OrbisNpTrophyHandle* handle) {
     if (trophy_handles.size() >= MaxTrophyHandles) {
         return ORBIS_NP_TROPHY_ERROR_HANDLE_EXCEEDS_MAX;
     }
-    const auto handle_id = trophy_handles.insert();
-    LOG_INFO(Lib_NpTrophy, "New handle = {}", handle_id.index);
-    *handle = handle_id.index;
+    const auto handle_id = trophy_handles.insert();
+
+    *handle = handle_id.index + 1;
+    LOG_INFO(Lib_NpTrophy, "New handle = {}", *handle);
     return ORBIS_OK;
 }
 
 int PS4_SYSV_ABI sceNpTrophyDestroyContext(OrbisNpTrophyContext context) {
     LOG_INFO(Lib_NpTrophy, "Destroyed Context {}", context);
 
-    if (context == ORBIS_NP_TROPHY_INVALID_CONTEXT)
-        return ORBIS_NP_TROPHY_ERROR_INVALID_CONTEXT;
+    if (context == ORBIS_NP_TROPHY_INVALID_CONTEXT) {
+        return ORBIS_NP_TROPHY_ERROR_INVALID_CONTEXT;
+    }
 
     Common::SlotId contextId;
-    contextId.index = context;
+    contextId.index = context - 1;
 
     ContextKey contextkey = trophy_contexts[contextId];
     trophy_contexts.erase(contextId);
@@ -206,15 +210,17 @@ s32 PS4_SYSV_ABI sceNpTrophyDestroyHandle(OrbisNpTrophyHandle handle) {
     if (handle == ORBIS_NP_TROPHY_INVALID_HANDLE)
         return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
 
-    if (handle >= trophy_handles.size()) {
+    s32 handle_index = handle - 1;
+    if (handle_index >= trophy_handles.size()) {
         LOG_ERROR(Lib_NpTrophy, "Invalid handle {}", handle);
         return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
     }
-    if (!trophy_handles.is_allocated({static_cast<u32>(handle)})) {
+
+    if (!trophy_handles.is_allocated({static_cast<u32>(handle_index)})) {
         return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
     }
 
-    trophy_handles.erase({static_cast<u32>(handle)});
+    trophy_handles.erase({static_cast<u32>(handle_index)});
     LOG_INFO(Lib_NpTrophy, "Handle {} destroyed", handle);
     return ORBIS_OK;
 }

From dedf6de2ac13b6543339ee5cdedc44ee0efd963c Mon Sep 17 00:00:00 2001
From: TheTurtle
Date: Wed, 11 Jun 2025 11:34:37 +0300
Subject: [PATCH 09/14] texture_cache: Implement color<->depth copies (#3079)

* texture_cache: Implement color to depth copies and vice versa
* ir_passes: Adjust shared memory barrier pass to cover more cases
* texture_cache: Remove unused code
* review comment
---
 .../ir/passes/shared_memory_barrier_pass.cpp  |  35 ++++--
 src/video_core/buffer_cache/buffer_cache.cpp  |  10 +-
 src/video_core/buffer_cache/buffer_cache.h    |  23 ++--
 .../renderer_vulkan/vk_rasterizer.cpp         |   4 +-
 src/video_core/texture_cache/image.cpp        | 113 +++++++++++++++---
 src/video_core/texture_cache/image.h          |   3 +-
 .../texture_cache/texture_cache.cpp           |  23 ++--
 7 files changed, 157 insertions(+), 54 deletions(-)

diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
index 10d6a285c..11713d099 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
@@ -1,6 +1,7 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <unordered_set>
 #include "shader_recompiler/ir/breadth_first_search.h"
 #include "shader_recompiler/ir/ir_emitter.h"
 #include "shader_recompiler/ir/program.h"
@@ -51,11 +52,14 @@ static void EmitBarrierInBlock(IR::Block* block) {
     }
 }
 
+using NodeSet = std::unordered_set<IR::Block*>;
+
 // Inserts a barrier after divergent conditional blocks to avoid undefined
 // behavior when some threads write and others read from shared memory.
-static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
+static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data,
+                                    NodeSet& divergence_end, u32& divergence_depth) {
     const IR::U1 cond = data.if_node.cond;
-    const auto insert_barrier =
+    const auto is_divergent_cond =
         IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
             if (inst->GetOpcode() == IR::Opcode::GetAttributeU32 &&
                 inst->Arg(0).Attribute() == IR::Attribute::LocalInvocationId) {
@@ -63,11 +67,15 @@ static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
             return std::nullopt;
         });
-    if (insert_barrier) {
-        IR::Block* const merge = data.if_node.merge;
-        auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
-        IR::IREmitter ir{*merge, insert_point};
-        ir.Barrier();
+    if (is_divergent_cond) {
+        if (divergence_depth == 0) {
+            IR::Block* const merge = data.if_node.merge;
+            auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
+            IR::IREmitter ir{*merge, insert_point};
+            ir.Barrier();
+        }
+        ++divergence_depth;
+        divergence_end.emplace(data.if_node.merge);
     }
 }
 
@@ -89,19 +97,22 @@ void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_in
         return;
     }
     using Type = IR::AbstractSyntaxNode::Type;
-    u32 branch_depth{};
+    u32 divergence_depth{};
+    NodeSet divergence_end;
     for (const IR::AbstractSyntaxNode& node : program.syntax_list) {
         if (node.type == Type::EndIf) {
-            --branch_depth;
+            if (divergence_end.contains(node.data.end_if.merge)) {
+                --divergence_depth;
+            }
             continue;
         }
         // Check if branch depth is zero, we don't want to insert barrier in potentially divergent
        // code.
-        if (node.type == Type::If && branch_depth++ == 0) {
-            EmitBarrierInMergeBlock(node.data);
+        if (node.type == Type::If) {
+            EmitBarrierInMergeBlock(node.data, divergence_end, divergence_depth);
             continue;
         }
-        if (node.type == Type::Block && branch_depth == 0) {
+        if (node.type == Type::Block && divergence_depth == 0) {
             EmitBarrierInBlock(node.data.block);
         }
     }
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index e470f8e77..ffa744b31 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -23,6 +23,7 @@ static constexpr size_t DataShareBufferSize = 64_KB;
 static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 128_MB;
 static constexpr size_t DownloadBufferSize = 128_MB;
+static constexpr size_t DeviceBufferSize = 16_MB;
 static constexpr size_t MaxPageFaults = 1024;
 
BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
       memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
-      download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize),
+      download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize},
+      device_buffer{instance, scheduler, MemoryUsage::DeviceLocal, DeviceBufferSize},
       gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
       bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags,
                            BDA_PAGETABLE_SIZE},
@@ -348,7 +350,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     return {&buffer, buffer.Offset(device_addr)};
 }
 
-std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
+std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) {
     // Check if any buffer contains the full requested range.
     const u64 page = gpu_addr >> CACHING_PAGEBITS;
     const BufferId buffer_id = page_table[page].buffer_id;
@@ -361,10 +363,10 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size,
     }
     // If no buffer contains the full requested range but some buffer within was GPU-modified,
     // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
-    // This is only done if the request prefers to use GPU memory, otherwise we can skip it.
-    if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+    if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
         return ObtainBuffer(gpu_addr, size, false, false);
     }
+    // In all other cases, just do a CPU copy to the staging buffer.
     const auto [data, offset] = staging_buffer.Map(size, 16);
     memory->CopySparseMemory(gpu_addr, data, size);
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index c2faf12c8..d7d753213 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -80,11 +80,6 @@ public:
         return &gds_buffer;
     }
 
-    /// Retrieves the host visible device local stream buffer.
-    [[nodiscard]] StreamBuffer& GetStreamBuffer() noexcept {
-        return stream_buffer;
-    }
-
     /// Retrieves the device local DBA page table buffer.
[[nodiscard]] Buffer* GetBdaPageTableBuffer() noexcept { return &bda_pagetable_buffer; @@ -100,6 +95,20 @@ public: return slot_buffers[id]; } + /// Retrieves a utility buffer optimized for specified memory usage. + StreamBuffer& GetUtilityBuffer(MemoryUsage usage) noexcept { + switch (usage) { + case MemoryUsage::Stream: + return stream_buffer; + case MemoryUsage::Download: + return download_buffer; + case MemoryUsage::Upload: + return staging_buffer; + case MemoryUsage::DeviceLocal: + return device_buffer; + } + } + /// Invalidates any buffer in the logical page range. void InvalidateMemory(VAddr device_addr, u64 size, bool unmap); @@ -121,8 +130,7 @@ public: BufferId buffer_id = {}); /// Attempts to obtain a buffer without modifying the cache contents. - [[nodiscard]] std::pair ObtainViewBuffer(VAddr gpu_addr, u32 size, - bool prefer_gpu); + [[nodiscard]] std::pair ObtainBufferForImage(VAddr gpu_addr, u32 size); /// Return true when a region is registered on the cache [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); @@ -193,6 +201,7 @@ private: StreamBuffer staging_buffer; StreamBuffer stream_buffer; StreamBuffer download_buffer; + StreamBuffer device_buffer; Buffer gds_buffer; Buffer bda_pagetable_buffer; Buffer fault_buffer; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index dff4e5a5f..9dea5ceea 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -549,7 +549,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding const auto* gds_buf = buffer_cache.GetGdsBuffer(); buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes()); } else if (desc.buffer_type == Shader::BufferType::Flatbuf) { - auto& vk_buffer = buffer_cache.GetStreamBuffer(); + auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32); const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size, instance.UniformMinAlignment()); @@ -561,7 +561,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding const auto* fault_buffer = buffer_cache.GetFaultBuffer(); buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes()); } else if (desc.buffer_type == Shader::BufferType::SharedMemory) { - auto& lds_buffer = buffer_cache.GetStreamBuffer(); + auto& lds_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream); const auto& cs_program = liverpool->GetCsRegs(); const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups(); const auto [data, offset] = diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 6241100a0..ab9111e6b 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -312,43 +312,121 @@ void Image::Upload(vk::Buffer buffer, u64 offset) { vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); } -void Image::CopyImage(const Image& image) { +void Image::CopyImage(const Image& src_image) { scheduler->EndRendering(); Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); auto cmdbuf = scheduler->CommandBuffer(); + const auto& src_info = src_image.info; boost::container::small_vector image_copy{}; - const u32 num_mips = std::min(image.info.resources.levels, info.resources.levels); + const u32 num_mips = 
std::min(src_info.resources.levels, info.resources.levels); for (u32 m = 0; m < num_mips; ++m) { - const auto mip_w = std::max(image.info.size.width >> m, 1u); - const auto mip_h = std::max(image.info.size.height >> m, 1u); - const auto mip_d = std::max(image.info.size.depth >> m, 1u); + const auto mip_w = std::max(src_info.size.width >> m, 1u); + const auto mip_h = std::max(src_info.size.height >> m, 1u); + const auto mip_d = std::max(src_info.size.depth >> m, 1u); image_copy.emplace_back(vk::ImageCopy{ .srcSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = m, .baseArrayLayer = 0, - .layerCount = image.info.resources.layers, + .layerCount = src_info.resources.layers, }, .dstSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = m, .baseArrayLayer = 0, - .layerCount = image.info.resources.layers, + .layerCount = src_info.resources.layers, }, .extent = {mip_w, mip_h, mip_d}, }); } - cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout, + cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, image_copy); Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); } -void Image::CopyMip(const Image& image, u32 mip, u32 slice) { +void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) { + const auto& src_info = src_image.info; + + vk::BufferImageCopy buffer_image_copy = { + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource = + { + .aspectMask = src_info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = + { + .x = 0, + .y = 0, + .z = 0, + }, + .imageExtent = + { + .width = src_info.size.width, + .height = src_info.size.height, + .depth = src_info.size.depth, + }, + }; + + const vk::BufferMemoryBarrier2 pre_copy_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferRead, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer, + .offset = offset, + .size = VK_WHOLE_SIZE, + }; + + const vk::BufferMemoryBarrier2 post_copy_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferRead, + .buffer = buffer, + .offset = offset, + .size = VK_WHOLE_SIZE, + }; + + scheduler->EndRendering(); + src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); + + auto cmdbuf = scheduler->CommandBuffer(); + + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_copy_barrier, + }); + + cmdbuf.copyImageToBuffer(src_image.image, vk::ImageLayout::eTransferSrcOptimal, buffer, + buffer_image_copy); + + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_copy_barrier, + }); + + buffer_image_copy.imageSubresource.aspectMask = + info.IsDepthStencil() ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor; + + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + buffer_image_copy); +} + +void Image::CopyMip(const Image& src_image, u32 mip, u32 slice) { scheduler->EndRendering(); Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); @@ -358,26 +436,27 @@ void Image::CopyMip(const Image& image, u32 mip, u32 slice) { const auto mip_h = std::max(info.size.height >> mip, 1u); const auto mip_d = std::max(info.size.depth >> mip, 1u); - ASSERT(mip_w == image.info.size.width); - ASSERT(mip_h == image.info.size.height); + const auto& src_info = src_image.info; + ASSERT(mip_w == src_info.size.width); + ASSERT(mip_h == src_info.size.height); - const u32 num_layers = std::min(image.info.resources.layers, info.resources.layers); + const u32 num_layers = std::min(src_info.resources.layers, info.resources.layers); const vk::ImageCopy image_copy{ .srcSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = num_layers, }, .dstSubresource{ - .aspectMask = image.aspect_mask, + .aspectMask = src_image.aspect_mask, .mipLevel = mip, .baseArrayLayer = slice, .layerCount = num_layers, }, .extent = {mip_w, mip_h, mip_d}, }; - cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout, + cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, image_copy); Transit(vk::ImageLayout::eGeneral, diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index 404e25e88..31b67e021 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -104,7 +104,8 @@ struct Image { std::optional range, vk::CommandBuffer cmdbuf = {}); void Upload(vk::Buffer buffer, u64 offset); - void CopyImage(const Image& image); + void CopyImage(const Image& src_image); + void CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset); void CopyMip(const Image& src_image, u32 mip, u32 slice); bool IsTracked() { diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index cc244eb6b..a47e858ab 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -8,7 +8,6 @@ #include "common/debug.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" -#include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/texture_cache/host_compatibility.h" @@ -126,7 +125,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) { ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, BindingType binding, ImageId cache_image_id) { - const auto& cache_image = slot_images[cache_image_id]; + auto& cache_image = slot_images[cache_image_id]; if (!cache_image.info.IsDepthStencil() && !requested_info.IsDepthStencil()) { return {}; @@ -169,18 +168,21 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi } if (recreate) { - auto new_info{requested_info}; - new_info.resources = std::max(requested_info.resources, cache_image.info.resources); - new_info.UpdateSize(); + auto new_info = requested_info; + new_info.resources = std::min(requested_info.resources, cache_image.info.resources); const auto new_image_id = 
slot_images.insert(instance, scheduler, new_info); RegisterImage(new_image_id); // Inherit image usage - auto& new_image = GetImage(new_image_id); + auto& new_image = slot_images[new_image_id]; new_image.usage = cache_image.usage; + new_image.flags &= ~ImageFlagBits::Dirty; - // TODO: perform a depth copy here + // Perform depth<->color copy using the intermediate copy buffer. + const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal); + new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0); + // Free the cache image. FreeImage(cache_image_id); return new_image_id; } @@ -584,12 +586,11 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const VAddr image_addr = image.info.guest_address; const size_t image_size = image.info.guest_size; - const auto [vk_buffer, buf_offset] = - buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty); + const auto [vk_buffer, buf_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size); const auto cmdbuf = sched_ptr->CommandBuffer(); - // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW - // hazard + + // The obtained buffer may be GPU modified so we need to emit a barrier to prevent RAW hazard if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead, vk::PipelineStageFlagBits2::eTransfer)) { cmdbuf.pipelineBarrier2(vk::DependencyInfo{ From 274182954551d429c77e8b88ec395ae8726a0127 Mon Sep 17 00:00:00 2001 From: georgemoralis Date: Wed, 11 Jun 2025 12:02:59 +0300 Subject: [PATCH 10/14] New translations en_us.ts (Arabic) (#3081) --- src/qt_gui/translations/ar_SA.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qt_gui/translations/ar_SA.ts b/src/qt_gui/translations/ar_SA.ts index 26e768720..7d0c15e6b 100644 --- a/src/qt_gui/translations/ar_SA.ts +++ b/src/qt_gui/translations/ar_SA.ts @@ -2049,7 +2049,7 @@ Nightly: نُسخ تحتوي على أحدث الميزات، لكنها أقل * Unsupported Vulkan Version - * Unsupported Vulkan Version + نسخ Vulkan غير مدعومة From 3e0ec9ebef8c6b7d752d4538e42b36b571c983a6 Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Wed, 11 Jun 2025 09:34:00 -0500 Subject: [PATCH 11/14] Core: Merge Direct Memory Areas (#3084) * Merge dmem areas * Fix DirectMemoryArea::CanMergeWith Don't merge dmem areas if the memory types are different. * Reduce some warnings to info Both functions should behave properly now, there's no reason to warn about their use. 
* Clang --- src/core/libraries/kernel/memory.cpp | 6 +++--- src/core/memory.cpp | 1 + src/core/memory.h | 3 +++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp index f02ddafdc..ea3998ddd 100644 --- a/src/core/libraries/kernel/memory.cpp +++ b/src/core/libraries/kernel/memory.cpp @@ -99,8 +99,8 @@ s32 PS4_SYSV_ABI sceKernelReleaseDirectMemory(u64 start, size_t len) { s32 PS4_SYSV_ABI sceKernelAvailableDirectMemorySize(u64 searchStart, u64 searchEnd, size_t alignment, u64* physAddrOut, size_t* sizeOut) { - LOG_WARNING(Kernel_Vmm, "called searchStart = {:#x}, searchEnd = {:#x}, alignment = {:#x}", - searchStart, searchEnd, alignment); + LOG_INFO(Kernel_Vmm, "called searchStart = {:#x}, searchEnd = {:#x}, alignment = {:#x}", + searchStart, searchEnd, alignment); if (physAddrOut == nullptr || sizeOut == nullptr) { return ORBIS_KERNEL_ERROR_EINVAL; @@ -287,7 +287,7 @@ s32 PS4_SYSV_ABI sceKernelMtypeprotect(const void* addr, u64 size, s32 mtype, s3 int PS4_SYSV_ABI sceKernelDirectMemoryQuery(u64 offset, int flags, OrbisQueryInfo* query_info, size_t infoSize) { - LOG_WARNING(Kernel_Vmm, "called offset = {:#x}, flags = {:#x}", offset, flags); + LOG_INFO(Kernel_Vmm, "called offset = {:#x}, flags = {:#x}", offset, flags); auto* memory = Core::Memory::Instance(); return memory->DirectMemoryQuery(offset, flags == 1, query_info); } diff --git a/src/core/memory.cpp b/src/core/memory.cpp index e738f85a1..dad42347a 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -222,6 +222,7 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, auto& area = CarveDmemArea(mapping_start, size)->second; area.memory_type = memory_type; area.is_free = false; + MergeAdjacent(dmem_map, dmem_area); return mapping_start; } diff --git a/src/core/memory.h b/src/core/memory.h index 68f9c26c4..6a9b29382 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -75,6 +75,9 @@ struct DirectMemoryArea { if (base + size != next.base) { return false; } + if (memory_type != next.memory_type) { + return false; + } if (is_free != next.is_free) { return false; } From 34a1ffbcda67aaa136535744da2ea29cb6d00848 Mon Sep 17 00:00:00 2001 From: Missake212 Date: Wed, 11 Jun 2025 20:21:55 +0100 Subject: [PATCH 12/14] Few changes to the README.md (#3086) * Update README.md * backslash --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 985bba586..9079ead73 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ SPDX-License-Identifier: GPL-2.0-or-later **shadPS4** is an early **PlayStation 4** emulator for **Windows**, **Linux** and **macOS** written in C++. 
-If you encounter problems or have doubts, do not hesitate to look at the [**Quickstart**](https://github.com/shadps4-emu/shadPS4/blob/main/documents/Quickstart/Quickstart.md).\ +If you encounter problems or have doubts, do not hesitate to look at the [**Quickstart**](https://github.com/shadps4-emu/shadPS4/wiki/I.-Quick-start-%5BUsers%5D).\ To verify that a game works, you can look at [**shadPS4 Game Compatibility**](https://github.com/shadps4-emu/shadps4-game-compatibility).\ To discuss shadPS4 development, suggest ideas or to ask for help, join our [**Discord server**](https://discord.gg/bFJxfftGW6).\ To get the latest news, go to our [**X (Twitter)**](https://x.com/shadps4) or our [**website**](https://shadps4.net/).\ @@ -124,8 +124,8 @@ Keyboard and mouse inputs can be customized in the settings menu by clicking the # Firmware files -shadPS4 can load some PlayStation 4 firmware files, these must be dumped from your legally owned PlayStation 4 console.\ -The following firmware modules are supported and must be placed in shadPS4's `user/sys_modules` folder. +shadPS4 can load some PlayStation 4 firmware files, these must be dumped from your legally owned PlayStation 4 console. +The following firmware modules are supported and must be placed in shadPS4's `sys_modules` folder.
@@ -139,7 +139,6 @@ The following firmware modules are supported and must be placed in shadPS4's `us
 > [!Caution]
 > The above modules are required to run the games properly and must be extracted from your PlayStation 4.\
-> **We do not provide any information or support on how to do this**.
@@ -148,7 +147,7 @@ The following firmware modules are supported and must be placed in shadPS4's `us
 - [**georgemoralis**](https://github.com/georgemoralis)
 - [**psucien**](https://github.com/psucien)
 - [**viniciuslrangel**](https://github.com/viniciuslrangel)
-- [**roamic**](https://github.com/vladmikhalin)
+- [**roamic**](https://github.com/roamic)
 - [**squidbus**](https://github.com/squidbus)
 - [**frodo**](https://github.com/baggins183)
 - [**Stephen Miller**](https://github.com/StevenMiller123)
@@ -158,7 +157,7 @@ Logo is done by [**Xphalnos**](https://github.com/Xphalnos)
 # Contributing
 
-If you want to contribute, please look the [**CONTRIBUTING.md**](https://github.com/shadps4-emu/shadPS4/blob/main/CONTRIBUTING.md) file.\
+If you want to contribute, please read the [**CONTRIBUTING.md**](https://github.com/shadps4-emu/shadPS4/blob/main/CONTRIBUTING.md) file.\
 Open a PR and we'll check it :)
 
 # Translations

From 69a50fa7132f27f73754aebe15be953546f5ace2 Mon Sep 17 00:00:00 2001
From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com>
Date: Wed, 11 Jun 2025 14:22:34 -0500
Subject: [PATCH 13/14] Struct update fixes (#3087)

Neither sceVideodec2Decode nor sceVideodec2Flush should modify the
output's `thisSize`; doing so breaks older games now that we have the
updated structs.

We should also only set frameFormat and framePitchInBytes if the game
passed in the newer struct, since otherwise we're modifying memory the
game never gave us.

These changes might fix the regression in Hatsune Miku Project Diva X,
though it's hard to tell due to a weird caching issue on Windows and
the ancient regression this game had on Linux.
---
 .../libraries/videodec/videodec2_impl.cpp | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/core/libraries/videodec/videodec2_impl.cpp b/src/core/libraries/videodec/videodec2_impl.cpp
index a643239a3..373809c14 100644
--- a/src/core/libraries/videodec/videodec2_impl.cpp
+++ b/src/core/libraries/videodec/videodec2_impl.cpp
@@ -44,11 +44,14 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData,
                         OrbisVideodec2FrameBuffer& frameBuffer,
                         OrbisVideodec2OutputInfo& outputInfo) {
     frameBuffer.isAccepted = false;
-    outputInfo.thisSize = sizeof(OrbisVideodec2OutputInfo);
     outputInfo.isValid = false;
     outputInfo.isErrorFrame = true;
     outputInfo.pictureCount = 0;
-    outputInfo.frameFormat = 0;
+
+    // Only set frameFormat if the game uses the newer struct version.
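+    // Older titles pass the original, smaller OrbisVideodec2OutputInfo, so thisSize
+    // doubles as a version tag: a match with the current sizeof means the newer
+    // fields (frameFormat, framePitchInBytes) exist and are safe to write.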
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.frameFormat = 0;
+    }
 
     if (!inputData.auData) {
         return ORBIS_VIDEODEC2_ERROR_ACCESS_UNIT_POINTER;
@@ -107,7 +110,6 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData,
     outputInfo.frameWidth = frame->width;
     outputInfo.frameHeight = frame->height;
     outputInfo.framePitch = frame->linesize[0];
-    outputInfo.framePitchInBytes = frame->linesize[0];
     outputInfo.frameBufferSize = frameBuffer.frameBufferSize;
     outputInfo.frameBuffer = frameBuffer.frameBuffer;
 
@@ -115,6 +117,11 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData,
     outputInfo.isErrorFrame = false;
     outputInfo.pictureCount = 1; // TODO: 2 pictures for interlaced video
 
+    // Only set framePitchInBytes if the game uses the newer struct version.
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.framePitchInBytes = frame->linesize[0];
+    }
+
     if (outputInfo.isValid) {
         OrbisVideodec2AvcPictureInfo pictureInfo = {};
 
@@ -142,11 +149,14 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData,
 
 s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer,
                        OrbisVideodec2OutputInfo& outputInfo) {
     frameBuffer.isAccepted = false;
-    outputInfo.thisSize = sizeof(OrbisVideodec2OutputInfo);
     outputInfo.isValid = false;
     outputInfo.isErrorFrame = true;
     outputInfo.pictureCount = 0;
-    outputInfo.frameFormat = 0;
+
+    // Only set frameFormat if the game uses the newer struct version.
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.frameFormat = 0;
+    }
 
     AVFrame* frame = av_frame_alloc();
     if (!frame) {
@@ -178,7 +188,6 @@ s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer,
     outputInfo.frameWidth = frame->width;
     outputInfo.frameHeight = frame->height;
     outputInfo.framePitch = frame->linesize[0];
-    outputInfo.framePitchInBytes = frame->linesize[0];
     outputInfo.frameBufferSize = frameBuffer.frameBufferSize;
     outputInfo.frameBuffer = frameBuffer.frameBuffer;
 
@@ -186,6 +195,11 @@ s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer,
     outputInfo.isErrorFrame = false;
     outputInfo.pictureCount = 1; // TODO: 2 pictures for interlaced video
 
+    // Only set framePitchInBytes if the game uses the newer struct version.
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.framePitchInBytes = frame->linesize[0];
+    }
+
     // FIXME: Should we add picture info here too?
 }

From c71dc740e20ec917ac06092cf938f66b62153e48 Mon Sep 17 00:00:00 2001
From: squidbus <175574877+squidbus@users.noreply.github.com>
Date: Wed, 11 Jun 2025 13:24:41 -0700
Subject: [PATCH 14/14] shader_recompiler: Reduce cases where shared memory to
 buffer pass is needed.
 (#3082)
---
 CMakeLists.txt                                |   1 +
 .../backend/spirv/emit_spirv.cpp              |   3 +-
 .../backend/spirv/spirv_emit_context.cpp      |  28 +++-
 src/shader_recompiler/info.h                  |   2 +-
 src/shader_recompiler/ir/passes/ir_passes.h   |   1 +
 .../ir/passes/shader_info_collection_pass.cpp |  22 ++-
 .../ir/passes/shared_memory_simplify_pass.cpp | 127 ++++++++++++++++++
 .../passes/shared_memory_to_storage_pass.cpp  |  96 ++++++++-----
 src/shader_recompiler/recompiler.cpp          |   1 +
 9 files changed, 232 insertions(+), 49 deletions(-)
 create mode 100644 src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c2739d22..0d89524cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -870,6 +870,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/ir/passes/ring_access_elimination.cpp
     src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+    src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
     src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
     src/shader_recompiler/ir/abstract_syntax_list.cpp

diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 37d7eea35..93fb81df4 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -303,7 +303,8 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
         ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
         ctx.AddExtension("SPV_KHR_physical_storage_buffer");
     }
-    if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) {
+    const auto shared_type_count = std::popcount(static_cast<u32>(info.shared_types));
+    if (shared_type_count > 1 && profile.supports_workgroup_explicit_memory_layout) {
         ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout");
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR);
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR);

diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index c47a75739..0a8f78f72 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -979,32 +979,46 @@ void EmitContext::DefineImagesAndSamplers() {
 }
 
 void EmitContext::DefineSharedMemory() {
-    if (!info.uses_shared) {
+    const auto num_types = std::popcount(static_cast<u32>(info.shared_types));
+    if (num_types == 0) {
         return;
     }
     ASSERT(info.stage == Stage::Compute);
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    const auto make_type = [&](Id element_type, u32 element_size) {
+    const auto make_type = [&](IR::Type type, Id element_type, u32 element_size,
+                               std::string_view name) {
+        if (False(info.shared_types & type)) {
+            // Skip unused shared memory types.
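+            // A width's bit is only set when the collection pass saw instructions
+            // of that width, so when it is clear the null Ids returned here are
+            // never read by the rest of the emitter.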
+            return std::make_tuple(Id{}, Id{}, Id{});
+        }
+
         const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)};
         const Id array_type{TypeArray(element_type, ConstU32(num_elements))};
         Decorate(array_type, spv::Decoration::ArrayStride, element_size);
         const Id struct_type{TypeStruct(array_type)};
         MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u);
-        Decorate(struct_type, spv::Decoration::Block);
         const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type);
         const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type);
         const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup);
-        Decorate(variable, spv::Decoration::Aliased);
+        Name(variable, name);
         interfaces.push_back(variable);
+
+        if (num_types > 1) {
+            Decorate(struct_type, spv::Decoration::Block);
+            Decorate(variable, spv::Decoration::Aliased);
+        }
+
         return std::make_tuple(variable, element_pointer, pointer);
     };
-    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u);
-    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u);
-    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u);
+    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) =
+        make_type(IR::Type::U16, U16, 2u, "shared_mem_u16");
+    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) =
+        make_type(IR::Type::U32, U32[1], 4u, "shared_mem_u32");
+    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) =
+        make_type(IR::Type::U64, U64, 8u, "shared_mem_u64");
 }
 
 Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {

diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index e14c7988d..f25111350 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -214,7 +214,7 @@ struct Info {
     bool uses_lane_id{};
     bool uses_group_quad{};
     bool uses_group_ballot{};
-    bool uses_shared{};
+    IR::Type shared_types{};
     bool uses_fp16{};
     bool uses_fp64{};
     bool uses_pack_10_11_11{};

diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h
index 06e4ac850..57d36f6df 100644
--- a/src/shader_recompiler/ir/passes/ir_passes.h
+++ b/src/shader_recompiler/ir/passes/ir_passes.h
@@ -28,6 +28,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info);
 void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info,
                              const Profile& profile);
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile);
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile);

diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index ba8d1cca6..4cd16d18f 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -35,12 +35,28 @@ void Visit(Info& info, const IR::Inst& inst) {
         break;
     }
     case IR::Opcode::LoadSharedU16:
-    case IR::Opcode::LoadSharedU32:
-    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU16:
+        info.shared_types |= IR::Type::U16;
+        break;
+    case IR::Opcode::LoadSharedU32:
     case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicISub32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicInc32:
+    case IR::Opcode::SharedAtomicDec32:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicXor32:
+        info.shared_types |= IR::Type::U32;
+        break;
+    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU64:
-        info.uses_shared = true;
+    case IR::Opcode::SharedAtomicIAdd64:
+        info.shared_types |= IR::Type::U64;
         break;
     case IR::Opcode::ConvertF16F32:
     case IR::Opcode::ConvertF32F16:

diff --git a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
new file mode 100644
index 000000000..0f80a3b28
--- /dev/null
+++ b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
@@ -0,0 +1,127 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/profile.h"
+
+namespace Shader::Optimization {
+
+static bool Requires16BitSharedAtomic(const IR::Inst& inst) {
+    // Nothing yet
+    return false;
+}
+
+static bool Requires64BitSharedAtomic(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::SharedAtomicIAdd64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static bool IsNon32BitSharedLoadStore(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadSharedU16:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU16:
+    case IR::Opcode::WriteSharedU64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+IR::Type CalculateSpecialSharedAtomicTypes(IR::Program& program) {
+    IR::Type extra_atomic_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (Requires16BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U16;
+            }
+            if (Requires64BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U64;
+            }
+        }
+    }
+    return extra_atomic_types;
+}
+
+// Simplifies down U16 and U64 shared memory operations to U32 when aliasing is not supported and
+// atomics of the same type are not used.
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile) {
+    if (program.info.stage != Stage::Compute ||
+        profile.supports_workgroup_explicit_memory_layout) {
+        return;
+    }
+
+    const auto atomic_types = CalculateSpecialSharedAtomicTypes(program);
+    if (True(atomic_types & IR::Type::U16) && True(atomic_types & IR::Type::U64)) {
+        // If both other atomic types are used, there is nothing to do.
+        return;
+    }
+
+    // Iterate through shared load/store U16/U64 instructions, replacing with
+    // equivalent U32 ops when the types are not needed for atomics.
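+    // For example, a 16-bit load at byte offset `a` becomes
+    //     dword  = LoadShared32(a & ~3)
+    //     result = UConvert16(BitFieldExtract(dword, (a & 2) * 8, 16))
+    // while a 64-bit load or store is split into two dword accesses at `a` and `a + 4`.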
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsNon32BitSharedLoadStore(inst)) {
+                continue;
+            }
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            const IR::U32 offset{inst.Arg(0)};
+            if (False(atomic_types & IR::Type::U16)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU16: {
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{ir.LoadShared(32, false, dword_offset)};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 value{ir.BitFieldExtract(dword_value, bit_offset, ir.Imm32(16U))};
+                    inst.ReplaceUsesWithAndRemove(ir.UConvert(16, value));
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU16: {
+                    const IR::U32 value{ir.UConvert(32, IR::U16{inst.Arg(1)})};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{
+                        ir.LoadShared(32, false, ir.BitwiseAnd(offset, dword_offset))};
+                    const IR::U32 new_dword_value{
+                        ir.BitFieldInsert(dword_value, value, bit_offset, ir.Imm32(16U))};
+                    ir.WriteShared(32, new_dword_value, dword_offset);
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+            if (False(atomic_types & IR::Type::U64)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU64: {
+                    const IR::U32 value0{ir.LoadShared(32, false, offset)};
+                    const IR::U32 value1{ir.LoadShared(32, false, ir.IAdd(offset, ir.Imm32(4U)))};
+                    const IR::Value value{ir.PackUint2x32(ir.CompositeConstruct(value0, value1))};
+                    inst.ReplaceUsesWithAndRemove(value);
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU64: {
+                    const IR::Value value{ir.UnpackUint2x32(IR::U64{inst.Arg(1)})};
+                    const IR::U32 value0{ir.CompositeExtract(value, 0)};
+                    const IR::U32 value1{ir.CompositeExtract(value, 1)};
+                    ir.WriteShared(32, value0, offset);
+                    ir.WriteShared(32, value1, ir.IAdd(offset, ir.Imm32(4U)));
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+        }
+    }
+}
+
+} // namespace Shader::Optimization

diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
index 839a8ddc5..a6900e180 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
@@ -34,20 +34,74 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     }
 }
 
+IR::Type CalculateSharedMemoryTypes(IR::Program& program) {
+    IR::Type used_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU16:
+            case IR::Opcode::WriteSharedU16:
+                used_types |= IR::Type::U16;
+                break;
+            case IR::Opcode::LoadSharedU32:
+            case IR::Opcode::WriteSharedU32:
+            case IR::Opcode::SharedAtomicIAdd32:
+            case IR::Opcode::SharedAtomicISub32:
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32:
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32:
+            case IR::Opcode::SharedAtomicInc32:
+            case IR::Opcode::SharedAtomicDec32:
+            case IR::Opcode::SharedAtomicAnd32:
+            case IR::Opcode::SharedAtomicOr32:
+            case IR::Opcode::SharedAtomicXor32:
+                used_types |= IR::Type::U32;
+                break;
+            case IR::Opcode::LoadSharedU64:
+            case IR::Opcode::WriteSharedU64:
+            case IR::Opcode::SharedAtomicIAdd64:
+                used_types |= IR::Type::U64;
+                break;
+            default:
+                break;
+            }
+        }
+    }
+    return used_types;
+}
+
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile) {
     if (program.info.stage != Stage::Compute) {
         return;
     }
-    // Only perform the transform if there is shared memory and either host shared memory is
-    // insufficient or the device does not support VK_KHR_workgroup_memory_explicit_layout
+
+    // Run this pass if:
+    // * There are shared memory instructions.
+    // * One of the following is true:
+    //   * Requested shared memory size is too large for the host shared memory.
+    //   * Workgroup explicit memory is not supported and multiple shared memory types are used.
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    if (shared_memory_size == 0 || (shared_memory_size <= profile.max_shared_memory_size &&
-                                    profile.supports_workgroup_explicit_memory_layout)) {
+    const auto used_types = CalculateSharedMemoryTypes(program);
+    if (used_types == IR::Type::Void || (shared_memory_size <= profile.max_shared_memory_size &&
+                                         (profile.supports_workgroup_explicit_memory_layout ||
+                                          std::popcount(static_cast<u32>(used_types)) == 1))) {
        return;
     }
+
+    // Add a buffer binding for shared memory storage buffer.
     const u32 binding = static_cast<u32>(program.info.buffers.size());
-    IR::Type used_types{};
+    program.info.buffers.push_back({
+        .used_types = used_types,
+        .inline_cbuf = AmdGpu::Buffer::Null(),
+        .buffer_type = BufferType::SharedMemory,
+        .is_written = true,
+    });
+
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (!IsSharedAccess(inst)) {
@@ -58,29 +112,21 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
             const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
                                            ir.Imm32(shared_memory_size));
             const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
-            // Replace shared atomics first
             switch (inst.GetOpcode()) {
             case IR::Opcode::SharedAtomicIAdd32:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
-                continue;
             case IR::Opcode::SharedAtomicIAdd64:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U64;
                 continue;
             case IR::Opcode::SharedAtomicISub32:
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicSMin32:
             case IR::Opcode::SharedAtomicUMin32: {
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicSMax32:
@@ -88,73 +134,49 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
            case IR::Opcode::SharedAtomicUMax32: {
                 const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
                 inst.ReplaceUsesWithAndRemove(
                     ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
-                used_types |= IR::Type::U32;
                 continue;
             }
             case IR::Opcode::SharedAtomicInc32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicDec32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicAnd32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicOr32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
             case IR::Opcode::SharedAtomicXor32:
                 inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
-                used_types |= IR::Type::U32;
                 continue;
-            default:
-                break;
-            }
-            // Replace shared operations.
-            switch (inst.GetOpcode()) {
             case IR::Opcode::LoadSharedU16:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::LoadSharedU32:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::LoadSharedU64:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {}));
-                used_types |= IR::Type::U64;
                 break;
             case IR::Opcode::WriteSharedU16:
                 ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U16;
                 break;
             case IR::Opcode::WriteSharedU32:
                 ir.StoreBufferU32(1, handle, address, inst.Arg(1), {});
                 inst.Invalidate();
-                used_types |= IR::Type::U32;
                 break;
             case IR::Opcode::WriteSharedU64:
                 ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {});
                 inst.Invalidate();
-                used_types |= IR::Type::U64;
                 break;
             default:
                 break;
             }
         }
     }
-    // Add buffer binding for shared memory storage buffer.
-    program.info.buffers.push_back({
-        .used_types = used_types,
-        .inline_cbuf = AmdGpu::Buffer::Null(),
-        .buffer_type = BufferType::SharedMemory,
-        .is_written = true,
-    });
 }
 
 } // namespace Shader::Optimization

diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index 9f92857d6..e17fb1c9e 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -78,6 +78,7 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);
     Shader::Optimization::LowerBufferFormatToRaw(program);
+    Shader::Optimization::SharedMemorySimplifyPass(program, profile);
     Shader::Optimization::SharedMemoryToStoragePass(program, runtime_info, profile);
     Shader::Optimization::SharedMemoryBarrierPass(program, runtime_info, profile);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
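Taken together, patch 14 leaves three tiers for compute shared memory: native workgroup memory when a single access width remains or the driver supports VK_KHR_workgroup_memory_explicit_layout, width-narrowed workgroup memory via the new simplify pass, and the storage-buffer fallback only when the size or type mix demands it. A condensed sketch of that decision; the function and enum here are illustrative and not part of the tree:

    #include <bit>
    #include <cstdint>

    enum class SharedPath { None, NativeWorkgroup, StorageBufferFallback };

    // Condenses the early-out conditions of SharedMemorySimplifyPass and
    // SharedMemoryToStoragePass into one predicate.
    SharedPath ChooseSharedPath(uint32_t used_type_mask, uint32_t shared_memory_size,
                                uint32_t max_shared_memory_size,
                                bool supports_workgroup_explicit_memory_layout) {
        if (used_type_mask == 0) {
            return SharedPath::None; // No shared memory instructions at all.
        }
        // Native workgroup memory works when the requested size fits on the host and
        // either only one access width is left (no aliasing required) or the driver
        // can alias the per-width arrays via explicit workgroup memory layout.
        if (shared_memory_size <= max_shared_memory_size &&
            (supports_workgroup_explicit_memory_layout ||
             std::popcount(used_type_mask) == 1)) {
            return SharedPath::NativeWorkgroup;
        }
        return SharedPath::StorageBufferFallback;
    }

The simplify pass running earlier in the recompiler pipeline is what pushes shaders toward the single-width case, by rewriting U16 and U64 loads and stores as U32 operations whenever no atomic of those widths forces the wider array to exist.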