diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index ccbe54d0a..1aba14bc5 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -166,14 +166,21 @@ void EmitGetGotoVariable(EmitContext&) {
 using PointerType = EmitContext::PointerType;
 
 Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) {
-    const u32 flatbuf_off_dw = inst->Flags<u32>();
+    const u32 flatbuf_offset = inst->Flags<u32>();
+    const auto& flatbuf_buffer{ctx.buffers.back()};
+    ASSERT(flatbuf_buffer.binding >= 0 && flatbuf_buffer.buffer_type == BufferType::Flatbuf);
+    const auto [flatbuf_buffer_id, flatbuf_pointer_type] = flatbuf_buffer[PointerType::U32];
+    const auto ptr{ctx.OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, ctx.u32_zero_value,
+                                     ctx.ConstU32(flatbuf_offset))};
+    return ctx.OpLoad(ctx.U32[1], ptr);
+
     // We can only provide a fallback for immediate offsets.
-    if (flatbuf_off_dw == 0) {
-        return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
-    } else {
-        return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
-                                  ctx.ConstU32(flatbuf_off_dw));
-    }
+    // if (flatbuf_off_dw == 0) {
+    //     return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset);
+    //} else {
+    //     return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const, addr, offset,
+    //                               ctx.ConstU32(flatbuf_off_dw));
+    //}
 }
 
 template <PointerType type>
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index b3b4ac36a..fa51119b2 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -136,6 +136,10 @@ void CollectShaderInfoPass(IR::Program& program) {
         }
     }
 
+    program.info.readconst_types = Info::ReadConstType::None;
+    program.info.dma_types = IR::Type::Void;
+    return;
+
     if (program.info.dma_types != IR::Type::Void) {
         program.info.buffers.push_back({
             .used_types = IR::Type::U64,
diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h
index 586980113..560689271 100644
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@@ -421,8 +421,8 @@ struct PM4CmdEventWriteEop {
     PM4Type3Header header;
     union {
         u32 event_control;
-        BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
-        BitField<8, 4, u32> event_index; ///< Event index
+        BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR
+        BitField<8, 4, u32> event_index;      ///< Event index
     };
     u32 address_lo;
     union {
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 484fae386..530925606 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -7,6 +7,7 @@
 #include "common/alignment.h"
 #include "common/debug.h"
 #include "common/div_ceil.h"
+#include "common/thread.h"
 #include "common/types.h"
 #include "core/memory.h"
 #include "video_core/amdgpu/liverpool.h"
@@ -27,7 +28,7 @@ static constexpr size_t UboStreamBufferSize = 128_MB;
 static constexpr size_t DownloadBufferSize = 128_MB;
 static constexpr size_t DeviceBufferSize = 128_MB;
 static constexpr size_t MaxPageFaults = 1024;
-static constexpr size_t DownloadSizeThreshold = 1_MB;
+static constexpr size_t DownloadSizeThreshold = 512_KB;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
                          AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
@@ -128,16 +129,24 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
                          "Fault Buffer Parser Pipeline");
     instance.GetDevice().destroyShaderModule(module);
+
+    async_download_thread = std::jthread{std::bind_front(&BufferCache::DownloadThread, this)};
 }
 
 BufferCache::~BufferCache() = default;
 
 void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
     const bool is_tracked = IsRegionRegistered(device_addr, size);
-    if (is_tracked) {
-        // Mark the page as CPU modified to stop tracking writes.
-        memory_tracker.MarkRegionAsCpuModified(device_addr, size);
+    if (!is_tracked) {
+        return;
     }
+
+    // Wait for any pending downloads to this page.
+    const u64 target_tick = page_table[device_addr >> CACHING_PAGEBITS].target_tick;
+    WaitForTargetTick(target_tick);
+
+    // Mark the page as CPU modified to stop tracking writes.
+    memory_tracker.MarkRegionAsCpuModified(device_addr, size);
 }
 
 void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
@@ -215,6 +224,11 @@ bool BufferCache::CommitPendingDownloads(bool wait_done) {
     pending_download_ranges.ForEach([&](VAddr interval_lower, VAddr interval_upper) {
         const std::size_t size = interval_upper - interval_lower;
         const VAddr device_addr = interval_lower;
+        const u64 page_begin = device_addr >> CACHING_PAGEBITS;
+        const u64 page_end = Common::DivCeil(device_addr + size, CACHING_PAGESIZE);
+        for (u64 page = page_begin; page != page_end; ++page) {
+            page_table[page].target_tick = current_download_tick;
+        }
         ForEachBufferInRange(device_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
             const VAddr buffer_start = buffer.CpuAddr();
             const VAddr buffer_end = buffer_start + buffer.SizeBytes();
@@ -257,14 +271,14 @@
         Buffer& buffer = slot_buffers[buffer_id];
         cmdbuf.copyBuffer(buffer.Handle(), download_buffer.Handle(), buffer_copies);
     }
-    scheduler.DeferOperation([this, download, offset, copies]() {
+    const auto writeback_host = [this, download, offset, copies = std::move(copies)]() {
         auto* memory = Core::Memory::Instance();
         for (auto it = copies.begin(); it != copies.end(); ++it) {
            auto& buffer_copies = it.value();
            const BufferId buffer_id = it.key();
-            Buffer& buffer = slot_buffers[buffer_id];
+            const VAddr buffer_base = slot_buffers[buffer_id].CpuAddr();
             for (auto& copy : buffer_copies) {
-                const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
+                const VAddr copy_device_addr = buffer_base + copy.srcOffset;
                 const u64 dst_offset = copy.dstOffset - offset;
                 if (!memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr),
                                              download + dst_offset, copy.size)) {
@@ -273,12 +287,18 @@
                 }
             }
         }
-    });
-    if (wait_done) {
-        scheduler.Finish();
-    } else {
-        scheduler.Flush();
+    };
+    {
+        std::scoped_lock lk{queue_mutex};
+        async_downloads.emplace(std::move(writeback_host), scheduler.CurrentTick(),
+                                current_download_tick);
     }
+    queue_cv.notify_one();
+    scheduler.Flush();
+    if (wait_done) {
+        WaitForTargetTick(current_download_tick);
+    }
+    ++current_download_tick;
 
     return true;
 }
@@ -1233,4 +1253,29 @@ void BufferCache::DeleteBuffer(BufferId buffer_id) {
     buffer.is_deleted = true;
 }
 
+void BufferCache::DownloadThread(std::stop_token stoken) {
+    Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor");
+
+    while (!stoken.stop_requested()) {
+        PendingDownload download;
+        {
+            std::unique_lock lk{queue_mutex};
+            Common::CondvarWait(queue_cv, lk, stoken, [this] { return !async_downloads.empty(); });
+            if (stoken.stop_requested()) {
+                break;
+            }
+            download = std::move(async_downloads.front());
+            async_downloads.pop();
+        }
+
+        // Wait for the GPU to complete its work and write the data back to the host
+        scheduler.Wait(download.gpu_tick);
+        download.callback();
+
+        // Signal completion of the download
+        download_tick.store(download.signal_tick);
+        download_tick.notify_all();
+    }
+}
+
 } // namespace VideoCore
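Note on the buffer_cache.cpp changes above: the old synchronous DeferOperation/Finish path is replaced by a producer/consumer handshake. CommitPendingDownloads stamps each touched page with current_download_tick and enqueues a writeback callback tagged with both the GPU tick and that download tick; DownloadThread drains the queue in order, waits for the GPU, runs the callback, and publishes the tick through C++20 atomic wait/notify so WaitForTargetTick can block without taking the queue mutex. The standalone sketch below distills that pattern. It is an illustration only, not code from this patch: the names TickQueue, Push, and WaitForTick are invented here, std::function stands in for Common::UniqueFunction, and the GPU-fence wait is omitted.

// Sketch of the tick-based worker-thread handshake used by the patch.
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>

class TickQueue {
public:
    TickQueue() : worker{std::bind_front(&TickQueue::Run, this)} {}

    // Producer side: enqueue a job, return the tick that marks its completion.
    std::uint64_t Push(std::function<void()> job) {
        std::uint64_t tick;
        {
            std::scoped_lock lk{mutex};
            tick = ++last_pushed_tick;
            jobs.push(Job{std::move(job), tick});
        }
        cv.notify_one();
        return tick;
    }

    // Block until the job tagged with `tick` has run; an atomic wait, no mutex.
    void WaitForTick(std::uint64_t tick) {
        std::uint64_t done = done_tick.load();
        while (done < tick) {
            done_tick.wait(done); // C++20: sleep until done_tick changes
            done = done_tick.load();
        }
    }

private:
    struct Job {
        std::function<void()> work;
        std::uint64_t signal_tick{};
    };

    void Run(std::stop_token stoken) {
        while (!stoken.stop_requested()) {
            Job job{};
            {
                std::unique_lock lk{mutex};
                // wait() returns false if a stop was requested while the queue
                // is empty, so shutdown cannot hang here.
                if (!cv.wait(lk, stoken, [this] { return !jobs.empty(); })) {
                    break;
                }
                job = std::move(jobs.front());
                jobs.pop();
            }
            job.work();                       // run outside the lock
            done_tick.store(job.signal_tick); // publish completion...
            done_tick.notify_all();           // ...and wake all WaitForTick callers
        }
    }

    std::mutex mutex;
    std::condition_variable_any cv;
    std::queue<Job> jobs;
    std::uint64_t last_pushed_tick{0};
    std::atomic<std::uint64_t> done_tick{0};
    std::jthread worker; // declared last: stops and joins before the members above die
};

Push and WaitForTick correspond loosely to CommitPendingDownloads stamping page ticks and InvalidateMemory calling WaitForTargetTick; the real patch additionally waits on the GPU fence (scheduler.Wait(download.gpu_tick)) before running the callback.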
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 48e27ba81..674f288b5 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -3,10 +3,15 @@
 
 #pragma once
 
+#include <atomic>
+#include <condition_variable>
 #include <mutex>
+#include <queue>
 #include <shared_mutex>
+#include <thread>
 #include "common/slot_vector.h"
 #include "common/types.h"
+#include "common/unique_function.h"
 #include "video_core/buffer_cache/buffer.h"
 #include "video_core/buffer_cache/memory_tracker.h"
 #include "video_core/buffer_cache/range_set.h"
@@ -51,7 +56,7 @@ public:
 
     struct PageData {
         BufferId buffer_id{};
-        u64 fence_tick;
+        u64 target_tick{};
     };
 
     struct Traits {
@@ -176,6 +181,14 @@
         return !buffer_id || slot_buffers[buffer_id].is_deleted;
     }
 
+    inline void WaitForTargetTick(u64 target_tick) {
+        u64 tick = download_tick.load();
+        while (tick < target_tick) {
+            download_tick.wait(tick);
+            tick = download_tick.load();
+        }
+    }
+
     void DownloadBufferMemory(const Buffer& buffer, VAddr device_addr, u64 size);
 
     [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size);
@@ -201,6 +214,8 @@
 
     void DeleteBuffer(BufferId buffer_id);
 
+    void DownloadThread(std::stop_token token);
+
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
     AmdGpu::Liverpool* liverpool;
@@ -224,6 +239,17 @@
     vk::UniqueDescriptorSetLayout fault_process_desc_layout;
     vk::UniquePipeline fault_process_pipeline;
     vk::UniquePipelineLayout fault_process_pipeline_layout;
+    std::jthread async_download_thread;
+    struct PendingDownload {
+        Common::UniqueFunction<void> callback;
+        u64 gpu_tick;
+        u64 signal_tick;
+    };
+    std::mutex queue_mutex;
+    std::condition_variable_any queue_cv;
+    std::queue<PendingDownload> async_downloads;
+    u64 current_download_tick{0};
+    std::atomic<u64> download_tick{1};
 };
 
 } // namespace VideoCore
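One more note on a detail the header diff depends on: writeback_host captures the copies map by move, and std::function requires a copy-constructible target, which is why the pending-download queue stores Common::UniqueFunction<void>. Below is a minimal sketch of the type-erasure idea behind such a move-only wrapper. The MoveOnlyFn name and shape are invented here; shadPS4's Common::UniqueFunction is its own implementation of this concept, which C++23 later standardized as std::move_only_function.

// Hypothetical move-only callable wrapper, for illustration only.
#include <memory>
#include <utility>

template <typename R, typename... Args>
class MoveOnlyFn {
public:
    MoveOnlyFn() = default;

    // Accept the callable by value so move-only lambdas can be moved in.
    template <typename F>
    MoveOnlyFn(F f) : impl{std::make_unique<Model<F>>(std::move(f))} {}

    R operator()(Args... args) {
        return impl->Call(std::forward<Args>(args)...);
    }

    explicit operator bool() const noexcept {
        return impl != nullptr;
    }

private:
    // Classic type erasure: a virtual interface plus templated storage.
    struct Concept {
        virtual ~Concept() = default;
        virtual R Call(Args... args) = 0;
    };
    template <typename F>
    struct Model final : Concept {
        explicit Model(F f) : fn{std::move(f)} {}
        R Call(Args... args) override {
            return fn(std::forward<Args>(args)...);
        }
        F fn;
    };

    // unique_ptr makes the wrapper movable but not copyable, unlike
    // std::function, which must be able to copy its target.
    std::unique_ptr<Concept> impl;
};

// Usage: a lambda capturing a move-only value cannot go into
// std::function<void()>, but works here:
//   auto data = std::make_unique<int>(42);
//   MoveOnlyFn<void> cb{[d = std::move(data)] { /* use *d */ }};
//   cb();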