diff --git a/CMakeLists.txt b/CMakeLists.txt
index 378b8f78d..84146bb01 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,7 +210,10 @@ set(GNM_LIB src/core/libraries/gnmdriver/gnmdriver.cpp
             src/core/libraries/gnmdriver/gnm_error.h
 )
 
-set(KERNEL_LIB src/core/libraries/kernel/threads/condvar.cpp
+set(KERNEL_LIB src/core/libraries/kernel/sync/mutex.cpp
+               src/core/libraries/kernel/sync/mutex.h
+               src/core/libraries/kernel/sync/semaphore.h
+               src/core/libraries/kernel/threads/condvar.cpp
                src/core/libraries/kernel/threads/event_flag.cpp
                src/core/libraries/kernel/threads/exception.cpp
                src/core/libraries/kernel/threads/exception.h
@@ -875,6 +878,10 @@ target_link_libraries(shadps4 PRIVATE Boost::headers GPUOpen::VulkanMemoryAlloca
 target_compile_definitions(shadps4 PRIVATE IMGUI_USER_CONFIG="imgui/imgui_config.h")
 target_compile_definitions(Dear_ImGui PRIVATE IMGUI_USER_CONFIG="${PROJECT_SOURCE_DIR}/src/imgui/imgui_config.h")
 
+if (CMAKE_SYSTEM_NAME STREQUAL "Linux") # userfaultfd is a Linux-only kernel interface
+    target_compile_definitions(shadps4 PRIVATE ENABLE_USERFAULTFD)
+endif()
+
 if (APPLE)
   option(USE_SYSTEM_VULKAN_LOADER "Enables using the system Vulkan loader instead of directly linking with MoltenVK. Useful for loading validation layers." OFF)
   if (USE_SYSTEM_VULKAN_LOADER)
diff --git a/src/common/ntapi.cpp b/src/common/ntapi.cpp
index ffdedb17f..e0ff1cef0 100644
--- a/src/common/ntapi.cpp
+++ b/src/common/ntapi.cpp
@@ -6,7 +6,6 @@
 #include "ntapi.h"
 
 NtClose_t NtClose = nullptr;
-NtDelayExecution_t NtDelayExecution = nullptr;
 NtSetInformationFile_t NtSetInformationFile = nullptr;
 NtCreateThread_t NtCreateThread = nullptr;
 NtTerminateThread_t NtTerminateThread = nullptr;
@@ -18,7 +17,6 @@ void Initialize() {
 
     // http://stackoverflow.com/a/31411628/4725495
     NtClose = (NtClose_t)GetProcAddress(nt_handle, "NtClose");
-    NtDelayExecution = (NtDelayExecution_t)GetProcAddress(nt_handle, "NtDelayExecution");
     NtSetInformationFile =
         (NtSetInformationFile_t)GetProcAddress(nt_handle, "NtSetInformationFile");
     NtCreateThread = (NtCreateThread_t)GetProcAddress(nt_handle, "NtCreateThread");
diff --git a/src/common/ntapi.h b/src/common/ntapi.h
index 743174061..cb1ba7f1c 100644
--- a/src/common/ntapi.h
+++ b/src/common/ntapi.h
@@ -408,7 +408,7 @@ typedef struct _TEB {                             /* win32/win64 */
 #ifdef _WIN64
     PVOID SystemReserved1[30]; /*    /0190 */
 #else
-    PVOID SystemReserved1[26]; /* 10c/     used for krnl386 private data in Wine */
+    PVOID SystemReserved1[26]; /* 10c/     */
 #endif
     char PlaceholderCompatibilityMode;                       /* 174/0280 */
     BOOLEAN PlaceholderHydrationAlwaysExplicit;              /* 175/0281 */
@@ -430,13 +430,13 @@ typedef struct _TEB {                             /* win32/win64 */
     BYTE SpareBytes1[23];                    /* 1b9/     */
     ULONG TxFsContext;                       /* 1d0/     */
 #endif
-    GDI_TEB_BATCH GdiTebBatch;          /* 1d4/02f0 used for ntdll private data in Wine */
+    GDI_TEB_BATCH GdiTebBatch;          /* 1d4/02f0 */
     CLIENT_ID RealClientId;             /* 6b4/07d8 */
     HANDLE GdiCachedProcessHandle;      /* 6bc/07e8 */
     ULONG GdiClientPID;                 /* 6c0/07f0 */
     ULONG GdiClientTID;                 /* 6c4/07f4 */
     PVOID GdiThreadLocaleInfo;          /* 6c8/07f8 */
-    ULONG_PTR Win32ClientInfo[62];      /* 6cc/0800 used for user32 private data in Wine */
+    ULONG_PTR Win32ClientInfo[62];      /* 6cc/0800 */
     PVOID glDispatchTable[233];         /* 7c4/09f0 */
     PVOID glReserved1[29];              /* b68/1138 */
     PVOID glReserved2;                  /* bdc/1220 */
@@ -511,8 +511,6 @@ static_assert(offsetof(TEB, DeallocationStack) ==
 
 typedef u64(__stdcall* NtClose_t)(HANDLE Handle);
 
-typedef u64(__stdcall* NtDelayExecution_t)(BOOL Alertable, PLARGE_INTEGER DelayInterval);
-
 typedef u64(__stdcall* NtSetInformationFile_t)(HANDLE FileHandle, PIO_STATUS_BLOCK IoStatusBlock,
                                                PVOID FileInformation, ULONG Length,
                                                FILE_INFORMATION_CLASS FileInformationClass);
@@ -525,7 +523,6 @@ typedef u64(__stdcall* NtCreateThread_t)(PHANDLE ThreadHandle, ACCESS_MASK Desir
 typedef u64(__stdcall* NtTerminateThread_t)(HANDLE ThreadHandle, u64 ExitStatus);
 
 extern NtClose_t NtClose;
-extern NtDelayExecution_t NtDelayExecution;
 extern NtSetInformationFile_t NtSetInformationFile;
 extern NtCreateThread_t NtCreateThread;
 extern NtTerminateThread_t NtTerminateThread;
diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index 46df68c38..c87aea6ef 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -147,6 +147,10 @@ void SetCurrentThreadName(const char* name) {
     SetThreadDescription(GetCurrentThread(), UTF8ToUTF16W(name).data());
 }
 
+void SetThreadName(void* thread, const char* name) {
+    SetThreadDescription(thread, UTF8ToUTF16W(name).data()); // `thread` is used as a HANDLE
+}
+
 #else // !MSVC_VER, so must be POSIX threads
 
 // MinGW with the POSIX threading model does not support pthread_setname_np
@@ -170,11 +174,19 @@ void SetCurrentThreadName(const char* name) {
     pthread_setname_np(pthread_self(), name);
 #endif
 }
+
+void SetThreadName(void* thread, const char* name) {
+    // TODO: Not implemented on this path yet; naming another thread is currently a no-op.
+}
 #endif
 
 #if defined(_WIN32)
 void SetCurrentThreadName(const char*) {
-    // Do Nothing on MingW
+    // Do Nothing on MinGW
+}
+
+void SetThreadName(void* thread, const char* name) {
+    // Intentionally does nothing on MinGW; both arguments are ignored.
+}
 #endif
 
diff --git a/src/common/thread.h b/src/common/thread.h
index fd962f8e5..175ba9445 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -23,6 +23,8 @@ void SetCurrentThreadPriority(ThreadPriority new_priority);
 
 void SetCurrentThreadName(const char* name);
 
+void SetThreadName(void* thread, const char* name);
+
 class AccurateTimer {
     std::chrono::nanoseconds target_interval{};
     std::chrono::nanoseconds total_wait{};
diff --git a/src/core/devices/logger.cpp b/src/core/devices/logger.cpp
index bf5a28382..6f104509c 100644
--- a/src/core/devices/logger.cpp
+++ b/src/core/devices/logger.cpp
@@ -15,6 +15,7 @@ s64 Logger::write(const void* buf, size_t nbytes) {
     log(static_cast<const char*>(buf), nbytes);
     return nbytes;
 }
+
 size_t Logger::writev(const Libraries::Kernel::SceKernelIovec* iov, int iovcnt) {
     for (int i = 0; i < iovcnt; i++) {
         log(static_cast<const char*>(iov[i].iov_base), iov[i].iov_len);
diff --git a/src/core/libraries/kernel/sync/mutex.cpp b/src/core/libraries/kernel/sync/mutex.cpp
new file mode 100644
index 000000000..c5e3eba1d
--- /dev/null
+++ b/src/core/libraries/kernel/sync/mutex.cpp
@@ -0,0 +1,65 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "mutex.h"
+
+#include "common/assert.h"
+
+namespace Libraries::Kernel {
+
+// Windows: backs the mutex with an unnamed, initially unowned Win32 mutex object.
+// Other platforms: nothing to do, the std::timed_mutex member constructs itself.
+TimedMutex::TimedMutex() {
+#ifdef _WIN64
+    mtx = CreateMutex(nullptr, false, nullptr);
+    ASSERT(mtx);
+#endif
+}
+
+// Windows: closes the handle obtained in the constructor.
+TimedMutex::~TimedMutex() {
+#ifdef _WIN64
+    CloseHandle(mtx);
+#endif
+}
+
+// Blocks until the mutex is owned by the calling thread.
+void TimedMutex::lock() {
+#ifdef _WIN64
+    for (;;) {
+        // The wait is alertable, so it can return WAIT_IO_COMPLETION after running a queued
+        // APC; in that case simply wait again.
+        const DWORD res = WaitForSingleObjectEx(mtx, INFINITE, true);
+        // WAIT_ABANDONED also transfers ownership to the caller (the previous owner exited
+        // without releasing). Waiting again would acquire the mutex a second time.
+        if (res == WAIT_OBJECT_0 || res == WAIT_ABANDONED) {
+            return;
+        }
+        // WAIT_FAILED will never succeed on retry; fail loudly instead of spinning forever.
+        ASSERT(res != WAIT_FAILED);
+    }
+#else
+    mtx.lock();
+#endif
+}
+
+// Attempts to take the mutex without blocking; returns true when ownership was obtained.
+bool TimedMutex::try_lock() {
+#ifdef _WIN64
+    const DWORD res = WaitForSingleObjectEx(mtx, 0, true);
+    return res == WAIT_OBJECT_0 || res == WAIT_ABANDONED;
+#else
+    return mtx.try_lock();
+#endif
+}
+
+// Releases ownership of the mutex.
+void TimedMutex::unlock() {
+#ifdef _WIN64
+    ReleaseMutex(mtx);
+#else
+    mtx.unlock();
+#endif
+}
+
+} // namespace Libraries::Kernel
\ No newline at end of file
diff --git a/src/core/libraries/kernel/sync/mutex.h b/src/core/libraries/kernel/sync/mutex.h
new file mode 100644
index 000000000..f14a920b4
--- /dev/null
+++ b/src/core/libraries/kernel/sync/mutex.h
@@ -0,0 +1,109 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <chrono>
+
+#include "common/types.h"
+
+#ifdef _WIN64
+#include <windows.h>
+#else
+#include <mutex>
+#endif
+
+namespace Libraries::Kernel {
+
+/// Mutex with timed locking that exposes the std::timed_mutex member functions.
+///
+/// On Windows it wraps a Win32 mutex object and waits on it alertably; on every other
+/// platform it forwards to std::timed_mutex.
+class TimedMutex {
+public:
+    TimedMutex();
+    ~TimedMutex();
+
+    // The Windows build owns a raw HANDLE that the destructor closes, so a copy would close
+    // it twice. std::timed_mutex is not copyable either.
+    TimedMutex(const TimedMutex&) = delete;
+    TimedMutex& operator=(const TimedMutex&) = delete;
+
+    /// Blocks until the mutex is acquired.
+    void lock();
+
+    /// Tries to acquire the mutex without blocking; returns true on success.
+    bool try_lock();
+
+    /// Releases the mutex.
+    void unlock();
+
+    /// Tries to acquire the mutex, giving up once rel_time has elapsed.
+    /// Returns true when the mutex was acquired.
+    template <class Rep, class Period>
+    bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time) {
+#ifdef _WIN64
+        constexpr auto zero = std::chrono::duration<Rep, Period>::zero();
+        const auto now = std::chrono::steady_clock::now();
+
+        // Turn the timeout into a deadline, saturating rather than overflowing the time point.
+        std::chrono::steady_clock::time_point abs_time = now;
+        if (rel_time > zero) {
+            constexpr auto max = (std::chrono::steady_clock::time_point::max)();
+            if (abs_time < max - rel_time) {
+                abs_time += rel_time;
+            } else {
+                abs_time = max;
+            }
+        }
+
+        return try_lock_until(abs_time);
+#else
+        return mtx.try_lock_for(rel_time);
+#endif
+    }
+
+    /// Tries to acquire the mutex, giving up once abs_time has been reached.
+    /// Returns true when the mutex was acquired.
+    template <class Clock, class Duration>
+    bool try_lock_until(const std::chrono::time_point<Clock, Duration>& abs_time) {
+#ifdef _WIN64
+        // Longest single wait; kept below INFINITE so a far deadline is neither truncated by
+        // the DWORD parameter nor mistaken for an endless wait.
+        constexpr DWORD max_wait_ms = INFINITE - 1;
+        for (;;) {
+            const auto now = Clock::now();
+            if (abs_time <= now) {
+                // Deadline reached: one last non-blocking attempt, as std::timed_mutex does.
+                return try_lock();
+            }
+
+            const auto rel_ms = std::chrono::ceil<std::chrono::milliseconds>(abs_time - now);
+            const DWORD wait_ms = rel_ms < std::chrono::milliseconds{max_wait_ms}
+                                      ? static_cast<DWORD>(rel_ms.count())
+                                      : max_wait_ms;
+            const DWORD res = WaitForSingleObjectEx(mtx, wait_ms, true);
+            if (res == WAIT_OBJECT_0 || res == WAIT_ABANDONED) {
+                // WAIT_ABANDONED also hands ownership of the mutex to the caller.
+                return true;
+            }
+            if (res == WAIT_FAILED) {
+                return false;
+            }
+            // WAIT_TIMEOUT (possibly of a clamped wait) or WAIT_IO_COMPLETION (an APC ran):
+            // loop to re-check the deadline.
+        }
+#else
+        return mtx.try_lock_until(abs_time);
+#endif
+    }
+
+private:
+#ifdef _WIN64
+    HANDLE mtx;
+#else
+    std::timed_mutex mtx;
+#endif
+};
+
+} // namespace Libraries::Kernel
\ No newline at end of file
diff --git a/src/core/libraries/kernel/sync/semaphore.h b/src/core/libraries/kernel/sync/semaphore.h
new file mode 100644
index 000000000..884b08968
--- /dev/null
+++ b/src/core/libraries/kernel/sync/semaphore.h
@@ -0,0 +1,180 @@
+// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <atomic>
+#include <chrono>
+
+#include "common/assert.h"
+#include "common/types.h"
+
+#ifdef _WIN64
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <dispatch/dispatch.h>
+#else
+#include <semaphore>
+#endif
+
+namespace Libraries::Kernel {
+
+/// Counting semaphore offering the std::counting_semaphore member functions.
+///
+/// Backed by a Win32 semaphore (waited on alertably) on Windows, a libdispatch semaphore on
+/// macOS and std::counting_semaphore<max> elsewhere. `max` is only consumed by the Windows
+/// and standard-library backends.
+template <s64 max>
+class Semaphore {
+public:
+    /// Creates the semaphore with initialCount tokens available.
+    Semaphore(s32 initialCount)
+#if !defined(_WIN64) && !defined(__APPLE__)
+        : sem{initialCount}
+#endif
+    {
+#ifdef _WIN64
+        sem = CreateSemaphore(nullptr, initialCount, max, nullptr);
+        ASSERT(sem);
+#elif defined(__APPLE__)
+        sem = dispatch_semaphore_create(initialCount);
+        ASSERT(sem);
+#endif
+    }
+
+    ~Semaphore() {
+#ifdef _WIN64
+        CloseHandle(sem);
+#elif defined(__APPLE__)
+        dispatch_release(sem);
+#endif
+    }
+
+    // The destructor releases the native handle, so a copy would release it twice.
+    // std::counting_semaphore is not copyable either.
+    Semaphore(const Semaphore&) = delete;
+    Semaphore& operator=(const Semaphore&) = delete;
+
+    /// Adds one token to the semaphore.
+    void release() {
+#ifdef _WIN64
+        ReleaseSemaphore(sem, 1, nullptr);
+#elif defined(__APPLE__)
+        dispatch_semaphore_signal(sem);
+#else
+        sem.release();
+#endif
+    }
+
+    /// Blocks until a token can be taken.
+    void acquire() {
+#ifdef _WIN64
+        for (;;) {
+            // Alertable wait: WAIT_IO_COMPLETION means an APC ran, so wait again.
+            const DWORD res = WaitForSingleObjectEx(sem, INFINITE, true);
+            if (res == WAIT_OBJECT_0) {
+                return;
+            }
+            // A failed wait cannot succeed on retry; do not spin forever.
+            ASSERT(res != WAIT_FAILED);
+        }
+#elif defined(__APPLE__)
+        for (;;) {
+            const auto res = dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
+            if (res == 0) {
+                return;
+            }
+        }
+#else
+        sem.acquire();
+#endif
+    }
+
+    /// Takes a token if one is available right now; returns true on success.
+    bool try_acquire() {
+#ifdef _WIN64
+        return WaitForSingleObjectEx(sem, 0, true) == WAIT_OBJECT_0;
+#elif defined(__APPLE__)
+        return dispatch_semaphore_wait(sem, DISPATCH_TIME_NOW) == 0;
+#else
+        return sem.try_acquire();
+#endif
+    }
+
+    /// Waits up to rel_time for a token; returns true when one was taken.
+    /// On Windows a zero or negative rel_time makes a single non-blocking attempt.
+    template <class Rep, class Period>
+    bool try_acquire_for(const std::chrono::duration<Rep, Period>& rel_time) {
+#ifdef _WIN64
+        const auto start = std::chrono::steady_clock::now();
+        return WaitWithDeadline(
+            [&] { return rel_time - (std::chrono::steady_clock::now() - start); });
+#elif defined(__APPLE__)
+        const auto rel_time_ns = std::chrono::ceil<std::chrono::nanoseconds>(rel_time).count();
+        const auto timeout = dispatch_time(DISPATCH_TIME_NOW, rel_time_ns);
+        return dispatch_semaphore_wait(sem, timeout) == 0;
+#else
+        return sem.try_acquire_for(rel_time);
+#endif
+    }
+
+    /// Waits until abs_time for a token; returns true when one was taken.
+    template <class Clock, class Duration>
+    bool try_acquire_until(const std::chrono::time_point<Clock, Duration>& abs_time) {
+#ifdef _WIN64
+        return WaitWithDeadline([&] { return abs_time - Clock::now(); });
+#elif defined(__APPLE__)
+        // Wait on the time remaining according to Clock itself. Handing abs_time to
+        // dispatch_walltime is only meaningful for clocks whose epoch is the wall-clock
+        // epoch, which does not hold for e.g. std::chrono::steady_clock.
+        return try_acquire_for(abs_time - Clock::now());
+#else
+        return sem.try_acquire_until(abs_time);
+#endif
+    }
+
+private:
+#ifdef _WIN64
+    // Converts a positive duration into a Win32 wait interval: rounded up to whole
+    // milliseconds and clamped below INFINITE so that a long timeout is neither truncated
+    // by the DWORD parameter nor mistaken for an endless wait.
+    template <class Rep, class Period>
+    static DWORD ToWaitMs(const std::chrono::duration<Rep, Period>& remaining) {
+        constexpr DWORD max_wait_ms = INFINITE - 1;
+        const auto ms = std::chrono::ceil<std::chrono::milliseconds>(remaining);
+        return ms < std::chrono::milliseconds{max_wait_ms} ? static_cast<DWORD>(ms.count())
+                                                           : max_wait_ms;
+    }
+
+    // Waits alertably until a token is taken or `remaining()` reports that no time is left.
+    // APC wake-ups (WAIT_IO_COMPLETION) and early or clamped timeouts re-enter the wait with
+    // the time still remaining instead of being reported as a timeout.
+    template <class RemainingFn>
+    bool WaitWithDeadline(RemainingFn&& remaining) {
+        for (;;) {
+            const auto left = remaining();
+            if (left <= left.zero()) {
+                return try_acquire();
+            }
+            const DWORD res = WaitForSingleObjectEx(sem, ToWaitMs(left), true);
+            if (res == WAIT_OBJECT_0) {
+                return true;
+            }
+            if (res == WAIT_FAILED) {
+                return false;
+            }
+        }
+    }
+
+    HANDLE sem;
+#elif defined(__APPLE__)
+    dispatch_semaphore_t sem;
+#else
+    std::counting_semaphore<max> sem;
+#endif
+};
+
+using BinarySemaphore = Semaphore<1>;
+using CountingSemaphore = Semaphore<0x7FFFFFFF /*ORBIS_KERNEL_SEM_VALUE_MAX*/>;
+
+} // namespace Libraries::Kernel
\ No newline at end of file
diff --git a/src/core/libraries/kernel/threads/condvar.cpp b/src/core/libraries/kernel/threads/condvar.cpp
index cbe8f6ca7..2927899d9 100644
--- a/src/core/libraries/kernel/threads/condvar.cpp
+++ b/src/core/libraries/kernel/threads/condvar.cpp
@@ -191,7 +191,7 @@ int PthreadCond::Signal() {
     PthreadMutex* mp = td->mutex_obj;
     has_user_waiters = SleepqRemove(sq, td);
 
-    std::binary_semaphore* waddr = nullptr;
+    BinarySemaphore* waddr = nullptr;
     if (mp->m_owner == curthread) {
         if (curthread->nwaiter_defer >= Pthread::MaxDeferWaiters) {
             curthread->WakeAll();
@@ -211,7 +211,7 @@ int PthreadCond::Signal() {
 
 struct BroadcastArg {
     Pthread* curthread;
-    std::binary_semaphore* waddrs[Pthread::MaxDeferWaiters];
+    BinarySemaphore* waddrs[Pthread::MaxDeferWaiters];
     int count;
 };
 
diff --git a/src/core/libraries/kernel/threads/event_flag.cpp b/src/core/libraries/kernel/threads/event_flag.cpp
index 39925153c..ce75bed9e 100644
--- a/src/core/libraries/kernel/threads/event_flag.cpp
+++ b/src/core/libraries/kernel/threads/event_flag.cpp
@@ -118,7 +118,6 @@ public:
         }
 
         m_bits |= bits;
-
         m_cond_var.notify_all();
     }
 
diff --git a/src/core/libraries/kernel/threads/pthread.cpp b/src/core/libraries/kernel/threads/pthread.cpp
index a562c51b2..b2fe09934 100644
--- a/src/core/libraries/kernel/threads/pthread.cpp
+++ b/src/core/libraries/kernel/threads/pthread.cpp
@@ -380,6 +380,7 @@ int PS4_SYSV_ABI posix_sched_get_priority_min() {
 
 int PS4_SYSV_ABI posix_pthread_rename_np(PthreadT thread, const char* name) {
     LOG_INFO(Kernel_Pthread, "name = {}", name);
+    Common::SetThreadName(reinterpret_cast<void*>(thread->native_thr.GetHandle()), name);
     thread->name = name;
     return ORBIS_OK;
 }
diff --git a/src/core/libraries/kernel/threads/pthread.h b/src/core/libraries/kernel/threads/pthread.h
index 9d71c75e8..456c2ef37 100644
--- a/src/core/libraries/kernel/threads/pthread.h
+++ b/src/core/libraries/kernel/threads/pthread.h
@@ -11,6 +11,8 @@
 #include <shared_mutex>
 
 #include "common/enum.h"
+#include "core/libraries/kernel/sync/mutex.h"
+#include "core/libraries/kernel/sync/semaphore.h"
 #include "core/libraries/kernel/time.h"
 #include "core/thread.h"
 #include "core/tls.h"
@@ -44,7 +46,7 @@ enum class PthreadMutexProt : u32 {
 };
 
 struct PthreadMutex {
-    std::timed_mutex m_lock;
+    TimedMutex m_lock;
     PthreadMutexFlags m_flags;
     Pthread* m_owner;
     int m_count;
@@ -288,14 +290,14 @@ struct Pthread {
     int report_events;
     int event_mask;
     std::string name;
-    std::binary_semaphore wake_sema{0};
+    BinarySemaphore wake_sema{0};
     SleepQueue* sleepqueue;
     void* wchan;
     PthreadMutex* mutex_obj;
     bool will_sleep;
     bool has_user_waiters;
     int nwaiter_defer;
-    std::binary_semaphore* defer_waiters[MaxDeferWaiters];
+    BinarySemaphore* defer_waiters[MaxDeferWaiters];
 
     bool InCritical() const noexcept {
         return locklevel > 0 || critical_count > 0;
diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp
index e3c7e9092..5aa04f251 100644
--- a/src/core/libraries/kernel/threads/semaphore.cpp
+++ b/src/core/libraries/kernel/threads/semaphore.cpp
@@ -6,6 +6,8 @@
 #include <mutex>
 #include <semaphore>
 
+#include "core/libraries/kernel/sync/semaphore.h"
+
 #include "common/logging/log.h"
 #include "core/libraries/kernel/kernel.h"
 #include "core/libraries/kernel/orbis_error.h"
@@ -21,7 +23,7 @@ constexpr int ORBIS_KERNEL_SEM_VALUE_MAX = 0x7FFFFFFF;
 struct PthreadSem {
     explicit PthreadSem(s32 value_) : semaphore{value_}, value{value_} {}
 
-    std::counting_semaphore<ORBIS_KERNEL_SEM_VALUE_MAX> semaphore;
+    CountingSemaphore semaphore;
     std::atomic<s32> value;
 };
 
@@ -75,7 +77,7 @@ public:
             it = wait_list.erase(it);
             token_count -= waiter->need_count;
             waiter->was_signaled = true;
-            waiter->cv.notify_one();
+            waiter->sem.release();
         }
 
         return true;
@@ -88,7 +90,7 @@ public:
         }
         for (auto* waiter : wait_list) {
             waiter->was_cancled = true;
-            waiter->cv.notify_one();
+            waiter->sem.release();
         }
         wait_list.clear();
         token_count = set_count < 0 ? init_count : set_count;
@@ -99,21 +101,21 @@ public:
         std::scoped_lock lk{mutex};
         for (auto* waiter : wait_list) {
             waiter->was_deleted = true;
-            waiter->cv.notify_one();
+            waiter->sem.release();
         }
         wait_list.clear();
     }
 
 public:
     struct WaitingThread {
-        std::condition_variable cv;
+        BinarySemaphore sem;
         u32 priority;
         s32 need_count;
         bool was_signaled{};
         bool was_deleted{};
         bool was_cancled{};
 
-        explicit WaitingThread(s32 need_count, bool is_fifo) : need_count{need_count} {
+        explicit WaitingThread(s32 need_count, bool is_fifo) : sem{0}, need_count{need_count} {
             // Retrieve calling thread priority for sorting into waiting threads list.
             if (!is_fifo) {
                 priority = g_curthread->attr.prio;
@@ -134,24 +136,26 @@ public:
         }
 
         int Wait(std::unique_lock<std::mutex>& lk, u32* timeout) {
+            lk.unlock(); // drop the caller's lock while blocked on the semaphore
             if (!timeout) {
                 // Wait indefinitely until we are woken up.
-                cv.wait(lk);
+                sem.acquire();
+                lk.lock();
                 return GetResult(false);
             }
             // Wait until timeout runs out, recording how much remaining time there was.
             const auto start = std::chrono::high_resolution_clock::now();
-            const auto signaled = cv.wait_for(lk, std::chrono::microseconds(*timeout),
-                                              [this] { return was_signaled; });
+            sem.try_acquire_for(std::chrono::microseconds(*timeout));
             const auto end = std::chrono::high_resolution_clock::now();
             const auto time =
                 std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
-            if (signaled) {
+            lk.lock(); // re-take the lock before reading was_signaled
+            if (was_signaled) {
                 *timeout -= time;
             } else {
                 *timeout = 0;
             }
-            return GetResult(!signaled);
+            return GetResult(!was_signaled);
         }
     };
 
diff --git a/src/core/libraries/kernel/time.cpp b/src/core/libraries/kernel/time.cpp
index b586431ab..2565b8078 100644
--- a/src/core/libraries/kernel/time.cpp
+++ b/src/core/libraries/kernel/time.cpp
@@ -52,7 +52,22 @@ u64 PS4_SYSV_ABI sceKernelReadTsc() {
 
 int PS4_SYSV_ABI sceKernelUsleep(u32 microseconds) {
 #ifdef _WIN64
-    std::this_thread::sleep_for(std::chrono::microseconds(microseconds));
+    const auto start_time = std::chrono::high_resolution_clock::now();
+    auto total_wait_time = std::chrono::microseconds(microseconds);
+
+    while (total_wait_time.count() > 0) {
+        auto wait_time = std::chrono::ceil<std::chrono::milliseconds>(total_wait_time).count();
+        u64 res = SleepEx(static_cast<u64>(wait_time), true);
+        if (res == WAIT_IO_COMPLETION) {
+            auto elapsedTime = std::chrono::high_resolution_clock::now() - start_time;
+            auto elapsedMicroseconds =
+                std::chrono::duration_cast<std::chrono::microseconds>(elapsedTime).count();
+            total_wait_time = std::chrono::microseconds(microseconds - elapsedMicroseconds);
+        } else {
+            break;
+        }
+    }
+
     return 0;
 #else
     timespec start;
diff --git a/src/core/module.cpp b/src/core/module.cpp
index ef34f25c1..70afb932c 100644
--- a/src/core/module.cpp
+++ b/src/core/module.cpp
@@ -470,8 +470,8 @@ OrbisKernelModuleInfoEx Module::GetModuleInfoEx() const {
         .tls_align = tls.align,
         .init_proc_addr = base_virtual_addr + dynamic_info.init_virtual_addr,
         .fini_proc_addr = base_virtual_addr + dynamic_info.fini_virtual_addr,
-        .eh_frame_hdr_addr = eh_frame_hdr_addr,
-        .eh_frame_addr = eh_frame_addr,
+        .eh_frame_hdr_addr = base_virtual_addr + eh_frame_hdr_addr,
+        .eh_frame_addr = base_virtual_addr + eh_frame_addr,
         .eh_frame_hdr_size = eh_frame_hdr_size,
         .eh_frame_size = eh_frame_size,
         .segments = info.segments,
diff --git a/src/emulator.cpp b/src/emulator.cpp
index 60d6e18d7..8a7c04cf4 100644
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@@ -266,7 +266,7 @@ void Emulator::Run(const std::filesystem::path& file) {
 }
 
 void Emulator::LoadSystemModules(const std::filesystem::path& file, std::string game_serial) {
-    constexpr std::array<SysModules, 10> ModulesToLoad{
+    constexpr std::array<SysModules, 14> ModulesToLoad{
         {{"libSceNgs2.sprx", &Libraries::Ngs2::RegisterlibSceNgs2},
          {"libSceFiber.sprx", &Libraries::Fiber::RegisterlibSceFiber},
          {"libSceUlt.sprx", nullptr},
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index d8c0a17bd..b578f0c52 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -326,7 +326,9 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
 Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     const auto& buffer = ctx.texture_buffers[handle];
     const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
-    const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset);
+    const Id coord =
+        ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift),
+                   buffer.coord_offset);
     Id texel = buffer.is_storage ? ctx.OpImageRead(buffer.result_type, tex_buffer, coord)
                                  : ctx.OpImageFetch(buffer.result_type, tex_buffer, coord);
     if (buffer.is_integer) {
@@ -372,7 +374,9 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
 void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
     const auto& buffer = ctx.texture_buffers[handle];
     const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
-    const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset);
+    const Id coord =
+        ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift),
+                   buffer.coord_offset);
     if (buffer.is_integer) {
         value = ctx.OpBitcast(buffer.result_type, value);
     }
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 4ce9f4221..5c7278c6b 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -207,6 +207,8 @@ void EmitContext::DefineBufferOffsets() {
                                    push_data_block, ConstU32(half), ConstU32(comp))};
         const Id value{OpLoad(U32[1], ptr)};
         tex_buffer.coord_offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(6U));
+        tex_buffer.coord_shift =
+            OpBitFieldUExtract(U32[1], value, ConstU32(offset + 6U), ConstU32(2U));
         Name(tex_buffer.coord_offset, fmt::format("texbuf{}_off", binding));
     }
 }
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 1c5da946d..4e5e7dd3b 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -223,6 +223,7 @@ public:
     struct TextureBufferDefinition {
         Id id;
         Id coord_offset;
+        Id coord_shift;
         u32 binding;
         Id image_type;
         Id result_type;
diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h
index ee9f5c805..080b0eb22 100644
--- a/src/shader_recompiler/frontend/fetch_shader.h
+++ b/src/shader_recompiler/frontend/fetch_shader.h
@@ -58,19 +58,6 @@ struct FetchShaderData {
                }) != attributes.end();
     }
 
-    [[nodiscard]] std::pair<u32, u32> GetDrawOffsets(const AmdGpu::Liverpool::Regs& regs,
-                                                     const Info& info) const {
-        u32 vertex_offset = regs.index_offset;
-        u32 instance_offset = 0;
-        if (vertex_offset == 0 && vertex_offset_sgpr != -1) {
-            vertex_offset = info.user_data[vertex_offset_sgpr];
-        }
-        if (instance_offset_sgpr != -1) {
-            instance_offset = info.user_data[instance_offset_sgpr];
-        }
-        return {vertex_offset, instance_offset};
-    }
-
     bool operator==(const FetchShaderData& other) const {
         return attributes == other.attributes && vertex_offset_sgpr == other.vertex_offset_sgpr &&
                instance_offset_sgpr == other.instance_offset_sgpr;
diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
index de8b9da87..75ad957b3 100644
--- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp
@@ -50,6 +50,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
             return S_OR_B64(NegateMode::None, false, inst);
         case Opcode::S_XOR_B32:
             return S_XOR_B32(inst);
+        case Opcode::S_NOT_B32:
+            return S_NOT_B32(inst);
         case Opcode::S_XOR_B64:
             return S_OR_B64(NegateMode::None, true, inst);
         case Opcode::S_ANDN2_B32:
@@ -94,6 +96,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
             return S_BREV_B32(inst);
         case Opcode::S_BCNT1_I32_B64:
             return S_BCNT1_I32_B64(inst);
+        case Opcode::S_FF1_I32_B64:
+            return S_FF1_I32_B64(inst);
         case Opcode::S_AND_SAVEEXEC_B64:
             return S_SAVEEXEC_B64(NegateMode::None, false, inst);
         case Opcode::S_ORN2_SAVEEXEC_B64:
@@ -301,6 +305,10 @@ void Translator::S_AND_B64(NegateMode negate, const GcnInst& inst) {
             ASSERT_MSG(-s32(operand.code) + SignedConstIntNegMin - 1 == -1,
                        "SignedConstIntNeg must be -1");
             return ir.Imm1(true);
+        case OperandField::LiteralConst:
+            ASSERT_MSG(operand.code == 0 || operand.code == std::numeric_limits<u32>::max(),
+                       "Unsupported literal {:#x}", operand.code);
+            return ir.Imm1(operand.code & 1);
         default:
             UNREACHABLE();
         }
@@ -382,6 +390,13 @@ void Translator::S_XOR_B32(const GcnInst& inst) {
     ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
 }
 
+// S_NOT_B32: writes the bitwise complement of src[0] to dst[0]; SCC = (result != 0).
+void Translator::S_NOT_B32(const GcnInst& inst) {
+    const IR::U32 inverted{ir.BitwiseNot(GetSrc(inst.src[0]))};
+    SetDst(inst.dst[0], inverted);
+    ir.SetScc(ir.INotEqual(inverted, ir.Imm32(0)));
+}
+
 void Translator::S_LSHL_B32(const GcnInst& inst) {
     const IR::U32 src0{GetSrc(inst.src[0])};
     const IR::U32 src1{GetSrc(inst.src[1])};
@@ -560,6 +575,12 @@ void Translator::S_BCNT1_I32_B64(const GcnInst& inst) {
     ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
 }
 
+void Translator::S_FF1_I32_B64(const GcnInst& inst) {
+    const IR::U32 src0{GetSrc(inst.src[0])}; // NOTE(review): 32-bit read of a B64 source — confirm
+    const IR::U32 result{ir.Select(ir.IEqual(src0, ir.Imm32(0U)), ir.Imm32(-1), ir.FindILsb(src0))};
+    SetDst(inst.dst[0], result);
+}
+
 void Translator::S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst) {
     // This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs)
     // However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index 3b89372bd..dd379d8ea 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -96,6 +96,7 @@ public:
     void S_MUL_I32(const GcnInst& inst);
     void S_BFE_U32(const GcnInst& inst);
     void S_ABSDIFF_I32(const GcnInst& inst);
+    void S_NOT_B32(const GcnInst& inst);
 
     // SOPK
     void S_MOVK(const GcnInst& inst);
@@ -109,6 +110,7 @@ public:
     void S_NOT_B64(const GcnInst& inst);
     void S_BREV_B32(const GcnInst& inst);
     void S_BCNT1_I32_B64(const GcnInst& inst);
+    void S_FF1_I32_B64(const GcnInst& inst);
     void S_GETPC_B64(u32 pc, const GcnInst& inst);
     void S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst);
 
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index d382d0e7c..494bbb4bb 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -105,6 +105,11 @@ struct PushData {
         ASSERT(offset < 256 && binding < buf_offsets.size());
         buf_offsets[binding] = offset;
     }
+
+    void AddTexelOffset(u32 binding, u32 multiplier, u32 texel_offset) {
+        ASSERT(texel_offset < 64 && multiplier < 16 && binding < buf_offsets.size());
+        buf_offsets[binding] = texel_offset | ((std::bit_width(multiplier) - 1) << 6);
+    }
 };
 static_assert(sizeof(PushData) <= 128,
               "PushData size is greater than minimum size guaranteed by Vulkan spec");
diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h
index 740b89dda..2a3bd62f4 100644
--- a/src/shader_recompiler/specialization.h
+++ b/src/shader_recompiler/specialization.h
@@ -9,7 +9,6 @@
 #include "frontend/fetch_shader.h"
 #include "shader_recompiler/backend/bindings.h"
 #include "shader_recompiler/info.h"
-#include "shader_recompiler/ir/passes/srt.h"
 
 namespace Shader {
 
@@ -22,8 +21,12 @@ struct VsAttribSpecialization {
 struct BufferSpecialization {
     u16 stride : 14;
     u16 is_storage : 1;
+    u32 size = 0;
 
-    auto operator<=>(const BufferSpecialization&) const = default;
+    bool operator==(const BufferSpecialization& other) const {
+        const bool size_ok = is_storage || size >= other.size; // storage buffers ignore size
+        return stride == other.stride && is_storage == other.is_storage && size_ok;
+    }
 };
 
 struct TextureBufferSpecialization {
@@ -57,7 +60,7 @@ struct StageSpecialization {
 
     const Shader::Info* info;
     RuntimeInfo runtime_info;
-    Gcn::FetchShaderData fetch_shader_data{};
+    std::optional<Gcn::FetchShaderData> fetch_shader_data{};
     boost::container::small_vector<VsAttribSpecialization, 32> vs_attribs;
     std::bitset<MaxStageResources> bitset{};
     boost::container::small_vector<BufferSpecialization, 16> buffers;
@@ -69,15 +72,14 @@ struct StageSpecialization {
     explicit StageSpecialization(const Info& info_, RuntimeInfo runtime_info_,
                                  const Profile& profile_, Backend::Bindings start_)
         : info{&info_}, runtime_info{runtime_info_}, start{start_} {
-        if (const auto fetch_shader = Gcn::ParseFetchShader(info_)) {
-            fetch_shader_data = *fetch_shader;
-            if (info_.stage == Stage::Vertex && !profile_.support_legacy_vertex_attributes) {
-                // Specialize shader on VS input number types to follow spec.
-                ForEachSharp(vs_attribs, fetch_shader_data.attributes,
-                             [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
-                                 spec.num_class = AmdGpu::GetNumberClass(sharp.GetNumberFmt());
-                             });
-            }
+        fetch_shader_data = Gcn::ParseFetchShader(info_);
+        if (info_.stage == Stage::Vertex && fetch_shader_data &&
+            !profile_.support_legacy_vertex_attributes) {
+            // Specialize shader on VS input number types to follow spec.
+            ForEachSharp(vs_attribs, fetch_shader_data->attributes,
+                         [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
+                             spec.num_class = AmdGpu::GetNumberClass(sharp.GetNumberFmt());
+                         });
         }
         u32 binding{};
         if (info->has_readconst) {
@@ -87,6 +89,9 @@ struct StageSpecialization {
                      [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
                          spec.stride = sharp.GetStride();
                          spec.is_storage = desc.IsStorage(sharp);
+                         if (!spec.is_storage) {
+                             spec.size = sharp.GetSize();
+                         }
                      });
         ForEachSharp(binding, tex_buffers, info->texture_buffers,
                      [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp
index 1bbd77f82..a4eae8e7a 100644
--- a/src/video_core/amdgpu/liverpool.cpp
+++ b/src/video_core/amdgpu/liverpool.cpp
@@ -163,8 +163,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
         }
         case PM4ItOpcode::IndirectBufferConst: {
             const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
-            auto task = ProcessCeUpdate(
-                {indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
+            auto task =
+                ProcessCeUpdate({indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
             while (!task.handle.done()) {
                 task.handle.resume();
 
@@ -565,7 +565,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
             }
             case PM4ItOpcode::DmaData: {
                 const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
-                if (dma_data->dst_addr_lo == 0x3022C) {
+                if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) {
                     break;
                 }
                 if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) {
@@ -700,7 +700,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
         }
         case PM4ItOpcode::DmaData: {
             const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
-            if (dma_data->dst_addr_lo == 0x3022C) {
+            if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) {
                 break;
             }
             if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) {
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index b1bf77f8a..3dab95db7 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -42,7 +42,7 @@ public:
 
     struct Traits {
         using Entry = BufferId;
-        static constexpr size_t AddressSpaceBits = 39;
+        static constexpr size_t AddressSpaceBits = 40;
         static constexpr size_t FirstLevelBits = 14;
         static constexpr size_t PageBits = CACHING_PAGEBITS;
     };
diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h
index 375701c4c..a59bcfff5 100644
--- a/src/video_core/buffer_cache/memory_tracker_base.h
+++ b/src/video_core/buffer_cache/memory_tracker_base.h
@@ -14,7 +14,7 @@ namespace VideoCore {
 
 class MemoryTracker {
 public:
-    static constexpr size_t MAX_CPU_PAGE_BITS = 39;
+    static constexpr size_t MAX_CPU_PAGE_BITS = 40;
     static constexpr size_t HIGHER_PAGE_BITS = 22;
     static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
     static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;
diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp
index d26a7067a..80b91b825 100644
--- a/src/video_core/page_manager.cpp
+++ b/src/video_core/page_manager.cpp
@@ -29,7 +29,7 @@ namespace VideoCore {
 constexpr size_t PAGESIZE = 4_KB;
 constexpr size_t PAGEBITS = 12;
 
-#if ENABLE_USERFAULTFD
+#ifdef ENABLE_USERFAULTFD
 struct PageManager::Impl {
     Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} {
         uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 47713f0ff..82a029b95 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -279,6 +279,8 @@ bool PipelineCache::RefreshGraphicsKey() {
         ++remapped_cb;
     }
 
+    fetch_shader = std::nullopt;
+
     Shader::Backend::Bindings binding{};
     const auto& TryBindStageRemap = [&](Shader::Stage stage_in, Shader::Stage stage_out) -> bool {
         const auto stage_in_idx = static_cast<u32>(stage_in);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 620e5f103..4e858c0d3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -171,6 +171,22 @@ RenderState Rasterizer::PrepareRenderState(u32 mrt_mask) {
     return state;
 }
 
+[[nodiscard]] std::pair<u32, u32> GetDrawOffsets(
+    const AmdGpu::Liverpool::Regs& regs, const Shader::Info& info,
+    const std::optional<Shader::Gcn::FetchShaderData>& fetch_shader) {
+    // Start from the register-provided index offset; no fetch shader means nothing to add.
+    const u32 reg_offset = regs.index_offset;
+    if (!fetch_shader) {
+        return {reg_offset, 0U};
+    }
+    // An SGPR index of -1 is skipped, leaving that offset at its default value.
+    const auto vtx_sgpr = fetch_shader->vertex_offset_sgpr;
+    const auto inst_sgpr = fetch_shader->instance_offset_sgpr;
+    const bool from_sgpr = reg_offset == 0 && vtx_sgpr != -1;
+    const u32 vertex_offset = from_sgpr ? info.user_data[vtx_sgpr] : reg_offset;
+    return {vertex_offset, inst_sgpr != -1 ? info.user_data[inst_sgpr] : 0U};
+}
+
 void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
     RENDERER_TRACE;
 
@@ -198,7 +214,7 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
     BeginRendering(*pipeline, state);
     UpdateDynamicState(*pipeline);
 
-    const auto [vertex_offset, instance_offset] = fetch_shader->GetDrawOffsets(regs, vs_info);
+    const auto [vertex_offset, instance_offset] = GetDrawOffsets(regs, vs_info, fetch_shader);
 
     const auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());
@@ -532,12 +548,13 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
             const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(
                 vsharp.base_address, vsharp.GetSize(), desc.is_written, true, buffer_id);
             const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3;
-            ASSERT_MSG(fmt_stride == vsharp.GetStride(),
+            const u32 buf_stride = vsharp.GetStride();
+            ASSERT_MSG(buf_stride % fmt_stride == 0,
                        "Texel buffer stride must match format stride");
             const u32 offset_aligned = Common::AlignDown(offset, alignment);
             const u32 adjust = offset - offset_aligned;
             ASSERT(adjust % fmt_stride == 0);
-            push_data.AddOffset(binding.buffer, adjust / fmt_stride);
+            push_data.AddTexelOffset(binding.buffer, buf_stride / fmt_stride, adjust / fmt_stride);
             buffer_view =
                 vk_buffer->View(offset_aligned, vsharp.GetSize() + adjust, desc.is_written,
                                 vsharp.GetDataFmt(), vsharp.GetNumberFmt());
diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp
index 7430168d0..9823cb4dc 100644
--- a/src/video_core/texture_cache/tile_manager.cpp
+++ b/src/video_core/texture_cache/tile_manager.cpp
@@ -392,7 +392,8 @@ std::pair<vk::Buffer, u32> TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o
     const auto* detiler = GetDetiler(image);
     if (!detiler) {
         if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled &&
-            image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled) {
+            image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled &&
+            image.info.tiling_mode != AmdGpu::TilingMode::Depth_MacroTiled) {
             LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
                       vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
         }