diff --git a/CMakeLists.txt b/CMakeLists.txt index 466933608..38532760d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -689,6 +689,7 @@ set(COMMON src/common/logging/backend.cpp src/common/recursive_lock.cpp src/common/recursive_lock.h src/common/sha1.h + src/common/shared_first_mutex.h src/common/signal_context.h src/common/signal_context.cpp src/common/singleton.h diff --git a/src/common/config.cpp b/src/common/config.cpp index 9c316949a..4a764a4c6 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -51,6 +51,8 @@ static bool isShowSplash = false; static std::string isSideTrophy = "right"; static bool isNullGpu = false; static bool shouldCopyGPUBuffers = false; +static bool readbacksEnabled = false; +static bool directMemoryAccessEnabled = false; static bool shouldDumpShaders = false; static bool shouldPatchShaders = true; static u32 vblankDivider = 1; @@ -240,6 +242,14 @@ bool copyGPUCmdBuffers() { return shouldCopyGPUBuffers; } +bool readbacks() { + return readbacksEnabled; +} + +bool directMemoryAccess() { + return directMemoryAccessEnabled; +} + bool dumpShaders() { return shouldDumpShaders; } @@ -344,6 +354,14 @@ void setCopyGPUCmdBuffers(bool enable) { shouldCopyGPUBuffers = enable; } +void setReadbacks(bool enable) { + readbacksEnabled = enable; +} + +void setDirectMemoryAccess(bool enable) { + directMemoryAccessEnabled = enable; +} + void setDumpShaders(bool enable) { shouldDumpShaders = enable; } @@ -586,6 +604,8 @@ void load(const std::filesystem::path& path) { screenHeight = toml::find_or(gpu, "screenHeight", screenHeight); isNullGpu = toml::find_or(gpu, "nullGpu", false); shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", false); + readbacksEnabled = toml::find_or(gpu, "readbacks", false); + directMemoryAccessEnabled = toml::find_or(gpu, "directMemoryAccess", false); shouldDumpShaders = toml::find_or(gpu, "dumpShaders", false); shouldPatchShaders = toml::find_or(gpu, "patchShaders", true); vblankDivider = toml::find_or(gpu, "vblankDivider", 1); @@ -735,6 +755,8 @@ void save(const std::filesystem::path& path) { data["GPU"]["screenHeight"] = screenHeight; data["GPU"]["nullGpu"] = isNullGpu; data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers; + data["GPU"]["readbacks"] = readbacksEnabled; + data["GPU"]["directMemoryAccess"] = directMemoryAccessEnabled; data["GPU"]["dumpShaders"] = shouldDumpShaders; data["GPU"]["patchShaders"] = shouldPatchShaders; data["GPU"]["vblankDivider"] = vblankDivider; diff --git a/src/common/config.h b/src/common/config.h index 38114983f..931fa68e2 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -45,6 +45,10 @@ bool nullGpu(); void setNullGpu(bool enable); bool copyGPUCmdBuffers(); void setCopyGPUCmdBuffers(bool enable); +bool readbacks(); +void setReadbacks(bool enable); +bool directMemoryAccess(); +void setDirectMemoryAccess(bool enable); bool dumpShaders(); void setDumpShaders(bool enable); u32 vblankDiv(); diff --git a/src/common/shared_first_mutex.h b/src/common/shared_first_mutex.h new file mode 100644 index 000000000..b150c956b --- /dev/null +++ b/src/common/shared_first_mutex.h @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +namespace Common { + +// Like std::shared_mutex, but reader has priority over writer. 
+class SharedFirstMutex { +public: + void lock() { + std::unique_lock lock(mtx); + cv.wait(lock, [this]() { return !writer_active && readers == 0; }); + writer_active = true; + } + + void unlock() { + std::lock_guard lock(mtx); + writer_active = false; + cv.notify_all(); + } + + void lock_shared() { + std::unique_lock lock(mtx); + cv.wait(lock, [this]() { return !writer_active; }); + ++readers; + } + + void unlock_shared() { + std::lock_guard lock(mtx); + if (--readers == 0) { + cv.notify_all(); + } + } + +private: + std::mutex mtx; + std::condition_variable cv; + int readers = 0; + bool writer_active = false; +}; + +} // namespace Common diff --git a/src/core/address_space.cpp b/src/core/address_space.cpp index 2e66bdf83..2e29f70ee 100644 --- a/src/core/address_space.cpp +++ b/src/core/address_space.cpp @@ -302,14 +302,15 @@ struct AddressSpace::Impl { new_flags = PAGE_READWRITE; } else if (read && !write) { new_flags = PAGE_READONLY; - } else if (execute && !read && not write) { + } else if (execute && !read && !write) { new_flags = PAGE_EXECUTE; } else if (!read && !write && !execute) { new_flags = PAGE_NOACCESS; } else { LOG_CRITICAL(Common_Memory, - "Unsupported protection flag combination for address {:#x}, size {}", - virtual_addr, size); + "Unsupported protection flag combination for address {:#x}, size {}, " + "read={}, write={}, execute={}", + virtual_addr, size, read, write, execute); return; } diff --git a/src/core/address_space.h b/src/core/address_space.h index d7f3efc75..85b4c36ac 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -11,6 +11,7 @@ namespace Core { enum class MemoryPermission : u32 { + None = 0, Read = 1 << 0, Write = 1 << 1, ReadWrite = Read | Write, diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 9cf340050..8c3ab1612 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -2834,7 +2834,7 @@ void RegisterlibSceGnmDriver(Core::Loader::SymbolsResolver* sym) { } if (Config::copyGPUCmdBuffers()) { - liverpool->reserveCopyBufferSpace(); + liverpool->ReserveCopyBufferSpace(); } Platform::IrqC::Instance()->Register(Platform::InterruptId::GpuIdle, ResetSubmissionLock, diff --git a/src/emulator.cpp b/src/emulator.cpp index 99fd50af5..fbab5929b 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -132,6 +132,8 @@ void Emulator::Run(std::filesystem::path file, const std::vector ar LOG_INFO(Config, "General LogType: {}", Config::getLogType()); LOG_INFO(Config, "General isNeo: {}", Config::isNeoModeConsole()); LOG_INFO(Config, "GPU isNullGpu: {}", Config::nullGpu()); + LOG_INFO(Config, "GPU readbacks: {}", Config::readbacks()); + LOG_INFO(Config, "GPU directMemoryAccess: {}", Config::directMemoryAccess()); LOG_INFO(Config, "GPU shouldDumpShaders: {}", Config::dumpShaders()); LOG_INFO(Config, "GPU vblankDivider: {}", Config::vblankDiv()); LOG_INFO(Config, "Vulkan gpuId: {}", Config::getGpuId()); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index 85e93f3fb..e37acb2e4 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -200,10 +200,18 @@ Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin); } +Id EmitBufferAtomicSMin64(EmitContext& ctx, IR::Inst* inst, u32 
handle, Id address, Id value) { + return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin); +} + Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicUMin); } +Id EmitBufferAtomicUMin64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicUMin); +} + Id EmitBufferAtomicFMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { if (ctx.profile.supports_buffer_fp32_atomic_min_max) { return BufferAtomicU32(ctx, inst, handle, address, value, diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 564fb3f80..f3a8c518c 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" +#include "common/config.h" #include "common/logging/log.h" #include "shader_recompiler/backend/spirv/emit_spirv_bounds.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" @@ -167,6 +168,9 @@ using PointerSize = EmitContext::PointerSize; Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { const u32 flatbuf_off_dw = inst->Flags(); + if (!Config::directMemoryAccess()) { + return ctx.EmitFlatbufferLoad(ctx.ConstU32(flatbuf_off_dw)); + } // We can only provide a fallback for immediate offsets. if (flatbuf_off_dw == 0) { return ctx.OpFunctionCall(ctx.U32[1], ctx.read_const_dynamic, addr, offset); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 15a8fd99b..1ac2266bd 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -91,7 +91,9 @@ Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicSMin64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicUMin64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicFMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMax64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -406,14 +408,20 @@ Id EmitULessThan32(EmitContext& ctx, Id lhs, Id rhs); Id EmitULessThan64(EmitContext& ctx, Id lhs, Id rhs); Id EmitIEqual32(EmitContext& ctx, Id lhs, Id rhs); Id EmitIEqual64(EmitContext& ctx, Id lhs, Id rhs); -Id EmitSLessThanEqual(EmitContext& ctx, Id lhs, Id rhs); -Id EmitULessThanEqual(EmitContext& ctx, Id lhs, Id rhs); -Id EmitSGreaterThan(EmitContext& ctx, Id lhs, Id rhs); -Id EmitUGreaterThan(EmitContext& ctx, Id lhs, Id rhs); +Id EmitSLessThanEqual32(EmitContext& ctx, Id lhs, Id 
rhs); +Id EmitSLessThanEqual64(EmitContext& ctx, Id lhs, Id rhs); +Id EmitULessThanEqual32(EmitContext& ctx, Id lhs, Id rhs); +Id EmitULessThanEqual64(EmitContext& ctx, Id lhs, Id rhs); +Id EmitSGreaterThan32(EmitContext& ctx, Id lhs, Id rhs); +Id EmitSGreaterThan64(EmitContext& ctx, Id lhs, Id rhs); +Id EmitUGreaterThan32(EmitContext& ctx, Id lhs, Id rhs); +Id EmitUGreaterThan64(EmitContext& ctx, Id lhs, Id rhs); Id EmitINotEqual32(EmitContext& ctx, Id lhs, Id rhs); Id EmitINotEqual64(EmitContext& ctx, Id lhs, Id rhs); -Id EmitSGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs); -Id EmitUGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs); +Id EmitSGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs); +Id EmitSGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs); +Id EmitUGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs); +Id EmitUGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs); Id EmitLogicalOr(EmitContext& ctx, Id a, Id b); Id EmitLogicalAnd(EmitContext& ctx, Id a, Id b); Id EmitLogicalXor(EmitContext& ctx, Id a, Id b); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index 1a995354d..ddc1e7574 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -371,19 +371,35 @@ Id EmitIEqual64(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpIEqual(ctx.U1[1], lhs, rhs); } -Id EmitSLessThanEqual(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitSLessThanEqual32(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpSLessThanEqual(ctx.U1[1], lhs, rhs); } -Id EmitULessThanEqual(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitSLessThanEqual64(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpSLessThanEqual(ctx.U1[1], lhs, rhs); +} + +Id EmitULessThanEqual32(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpULessThanEqual(ctx.U1[1], lhs, rhs); } -Id EmitSGreaterThan(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitULessThanEqual64(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpULessThanEqual(ctx.U1[1], lhs, rhs); +} + +Id EmitSGreaterThan32(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpSGreaterThan(ctx.U1[1], lhs, rhs); } -Id EmitUGreaterThan(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitSGreaterThan64(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpSGreaterThan(ctx.U1[1], lhs, rhs); +} + +Id EmitUGreaterThan32(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpUGreaterThan(ctx.U1[1], lhs, rhs); +} + +Id EmitUGreaterThan64(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpUGreaterThan(ctx.U1[1], lhs, rhs); } @@ -395,11 +411,19 @@ Id EmitINotEqual64(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpINotEqual(ctx.U1[1], lhs, rhs); } -Id EmitSGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitSGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpSGreaterThanEqual(ctx.U1[1], lhs, rhs); } -Id EmitUGreaterThanEqual(EmitContext& ctx, Id lhs, Id rhs) { +Id EmitSGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpSGreaterThanEqual(ctx.U1[1], lhs, rhs); +} + +Id EmitUGreaterThanEqual32(EmitContext& ctx, Id lhs, Id rhs) { + return ctx.OpUGreaterThanEqual(ctx.U1[1], lhs, rhs); +} + +Id EmitUGreaterThanEqual64(EmitContext& ctx, Id lhs, Id rhs) { return ctx.OpUGreaterThanEqual(ctx.U1[1], lhs, rhs); } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 524914ad4..77336c9ec 100644 --- 
a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -784,19 +784,6 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte }; void EmitContext::DefineBuffers() { - if (!profile.supports_robust_buffer_access && !info.uses_dma) { - // In case Flatbuf has not already been bound by IR and is needed - // to query buffer sizes, bind it now. - info.buffers.push_back({ - .used_types = IR::Type::U32, - // We can't guarantee that flatbuf will not grow past UBO - // limit if there are a lot of ReadConsts. (We could specialize) - .inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits::max()), - .buffer_type = BufferType::Flatbuf, - }); - // In the future we may want to read buffer sizes from GPU memory if available. - // info.readconst_types |= Info::ReadConstType::Immediate; - } for (const auto& desc : info.buffers) { const auto buf_sharp = desc.GetSharp(info); const bool is_storage = desc.IsStorage(buf_sharp, profile); @@ -1219,14 +1206,7 @@ Id EmitContext::DefineReadConst(bool dynamic) { if (dynamic) { return u32_zero_value; } else { - const auto& flatbuf_buffer{buffers[flatbuf_index]}; - ASSERT(flatbuf_buffer.binding >= 0 && - flatbuf_buffer.buffer_type == BufferType::Flatbuf); - const auto [flatbuf_buffer_id, flatbuf_pointer_type] = - flatbuf_buffer.Alias(PointerType::U32); - const auto ptr{OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value, - flatbuf_offset)}; - return OpLoad(U32[1], ptr); + return EmitFlatbufferLoad(flatbuf_offset); } }); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index f8c6416e8..28e9099d8 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -180,6 +180,16 @@ public: return OpAccessChain(result_type, shared_mem, index); } + Id EmitFlatbufferLoad(Id flatbuf_offset) { + const auto& flatbuf_buffer{buffers[flatbuf_index]}; + ASSERT(flatbuf_buffer.binding >= 0 && flatbuf_buffer.buffer_type == BufferType::Flatbuf); + const auto [flatbuf_buffer_id, flatbuf_pointer_type] = + flatbuf_buffer.aliases[u32(PointerType::U32)]; + const auto ptr{ + OpAccessChain(flatbuf_pointer_type, flatbuf_buffer_id, u32_zero_value, flatbuf_offset)}; + return OpLoad(U32[1], ptr); + } + Info& info; const RuntimeInfo& runtime_info; const Profile& profile; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index ece334bcd..b5bfec344 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -20,7 +20,7 @@ namespace Shader::Gcn { enum class ConditionOp : u32 { F, EQ, - LG, + LG, // NE GT, GE, LT, @@ -230,7 +230,7 @@ public: // VOPC void V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst); void V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst); - void V_CMP_NE_U64(const GcnInst& inst); + void V_CMP_U64(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst); void V_CMP_CLASS_F32(const GcnInst& inst); // VOP3a diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 3b88e4dec..54f1088f2 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -327,8 +327,10 @@ void 
Translator::EmitVectorAlu(const GcnInst& inst) { return V_CMP_U32(ConditionOp::TRU, false, true, inst); // V_CMP_{OP8}_U64 + case Opcode::V_CMP_EQ_U64: + return V_CMP_U64(ConditionOp::EQ, false, false, inst); case Opcode::V_CMP_NE_U64: - return V_CMP_NE_U64(inst); + return V_CMP_U64(ConditionOp::LG, false, false, inst); case Opcode::V_CMP_CLASS_F32: return V_CMP_CLASS_F32(inst); @@ -556,27 +558,31 @@ void Translator::V_BCNT_U32_B32(const GcnInst& inst) { void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { if (!is_low) { - // v_mbcnt_hi_u32_b32 v2, -1, 0 + // v_mbcnt_hi_u32_b32 vX, -1, 0 if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193 && inst.src[1].field == OperandField::ConstZero) { return; } - // v_mbcnt_hi_u32_b32 vX, exec_hi, 0 - if (inst.src[0].field == OperandField::ExecHi && - inst.src[1].field == OperandField::ConstZero) { - return; + // v_mbcnt_hi_u32_b32 vX, exec_hi, 0/vZ + if ((inst.src[0].field == OperandField::ExecHi || + inst.src[0].field == OperandField::VccHi) && + (inst.src[1].field == OperandField::ConstZero || + inst.src[1].field == OperandField::VectorGPR)) { + return SetDst(inst.dst[0], GetSrc(inst.src[1])); } + UNREACHABLE(); } else { - // v_mbcnt_lo_u32_b32 v2, -1, vX + // v_mbcnt_lo_u32_b32 vY, -1, vX // used combined with above to fetch lane id in non-compute stages if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193) { - SetDst(inst.dst[0], ir.LaneId()); + return SetDst(inst.dst[0], ir.LaneId()); } - // v_mbcnt_lo_u32_b32 v20, exec_lo, vX - // used combined in above for append buffer indexing. - if (inst.src[0].field == OperandField::ExecLo) { - SetDst(inst.dst[0], ir.Imm32(0)); + // v_mbcnt_lo_u32_b32 vY, exec_lo, vX + // used combined with above for append buffer indexing. 
+ if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo) { + return SetDst(inst.dst[0], GetSrc(inst.src[1])); } + UNREACHABLE(); } } @@ -996,39 +1002,32 @@ void Translator::V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const } } -void Translator::V_CMP_NE_U64(const GcnInst& inst) { - const auto get_src = [&](const InstOperand& operand) { - switch (operand.field) { - case OperandField::VccLo: - return ir.GetVcc(); - case OperandField::ExecLo: - return ir.GetExec(); - case OperandField::ScalarGPR: - return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); - case OperandField::ConstZero: - return ir.Imm1(false); +void Translator::V_CMP_U64(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst) { + const IR::U64 src0{GetSrc64(inst.src[0])}; + const IR::U64 src1{GetSrc64(inst.src[1])}; + const IR::U1 result = [&] { + switch (op) { + case ConditionOp::EQ: + return ir.IEqual(src0, src1); + case ConditionOp::LG: // NE + return ir.INotEqual(src0, src1); default: - UNREACHABLE(); + UNREACHABLE_MSG("Unsupported V_CMP_U64 condition operation: {}", u32(op)); } - }; - const IR::U1 src0{get_src(inst.src[0])}; - auto op = [&inst, this](auto x) { - switch (inst.src[1].field) { - case OperandField::ConstZero: - return x; - case OperandField::SignedConstIntNeg: - return ir.LogicalNot(x); - default: - UNREACHABLE_MSG("unhandled V_CMP_NE_U64 source argument {}", u32(inst.src[1].field)); - } - }; + }(); + + if (is_signed) { + UNREACHABLE_MSG("V_CMP_U64 with signed integers is not supported"); + } + if (set_exec) { + UNREACHABLE_MSG("Exec setting for V_CMP_U64 is not supported"); + } + switch (inst.dst[1].field) { case OperandField::VccLo: - ir.SetVcc(op(src0)); - break; + return ir.SetVcc(result); case OperandField::ScalarGPR: - ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), op(src0)); - break; + return ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), result); default: UNREACHABLE(); } diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 8dcf70a07..91f545cfd 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -74,8 +74,12 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_ATOMIC(AtomicOp::CmpSwap, inst); case Opcode::BUFFER_ATOMIC_SMIN: return BUFFER_ATOMIC(AtomicOp::Smin, inst); + case Opcode::BUFFER_ATOMIC_SMIN_X2: + return BUFFER_ATOMIC(AtomicOp::Smin, inst); case Opcode::BUFFER_ATOMIC_UMIN: return BUFFER_ATOMIC(AtomicOp::Umin, inst); + case Opcode::BUFFER_ATOMIC_UMIN_X2: + return BUFFER_ATOMIC(AtomicOp::Umin, inst); case Opcode::BUFFER_ATOMIC_SMAX: return BUFFER_ATOMIC(AtomicOp::Smax, inst); case Opcode::BUFFER_ATOMIC_SMAX_X2: diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 2497864c0..3d64cc5da 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -500,8 +500,16 @@ Value IREmitter::BufferAtomicISub(const Value& handle, const Value& address, con Value IREmitter::BufferAtomicIMin(const Value& handle, const Value& address, const Value& value, bool is_signed, BufferInstInfo info) { - return is_signed ? Inst(Opcode::BufferAtomicSMin32, Flags{info}, handle, address, value) - : Inst(Opcode::BufferAtomicUMin32, Flags{info}, handle, address, value); + switch (value.Type()) { + case Type::U32: + return is_signed ? 
Inst(Opcode::BufferAtomicSMin32, Flags{info}, handle, address, value) + : Inst(Opcode::BufferAtomicUMin32, Flags{info}, handle, address, value); + case Type::U64: + return is_signed ? Inst(Opcode::BufferAtomicSMin64, Flags{info}, handle, address, value) + : Inst(Opcode::BufferAtomicUMin64, Flags{info}, handle, address, value); + default: + ThrowInvalidType(value.Type()); + } } Value IREmitter::BufferAtomicFMin(const Value& handle, const Value& address, const Value& value, @@ -1712,12 +1720,32 @@ U1 IREmitter::IEqual(const U32U64& lhs, const U32U64& rhs) { } } -U1 IREmitter::ILessThanEqual(const U32& lhs, const U32& rhs, bool is_signed) { - return Inst(is_signed ? Opcode::SLessThanEqual : Opcode::ULessThanEqual, lhs, rhs); +U1 IREmitter::ILessThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed) { + if (lhs.Type() != rhs.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type()); + } + switch (lhs.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SLessThanEqual32 : Opcode::ULessThanEqual32, lhs, rhs); + case Type::U64: + return Inst(is_signed ? Opcode::SLessThanEqual64 : Opcode::ULessThanEqual64, lhs, rhs); + default: + ThrowInvalidType(lhs.Type()); + } } -U1 IREmitter::IGreaterThan(const U32& lhs, const U32& rhs, bool is_signed) { - return Inst(is_signed ? Opcode::SGreaterThan : Opcode::UGreaterThan, lhs, rhs); +U1 IREmitter::IGreaterThan(const U32U64& lhs, const U32U64& rhs, bool is_signed) { + if (lhs.Type() != rhs.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type()); + } + switch (lhs.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SGreaterThan32 : Opcode::UGreaterThan32, lhs, rhs); + case Type::U64: + return Inst(is_signed ? Opcode::SGreaterThan64 : Opcode::UGreaterThan64, lhs, rhs); + default: + ThrowInvalidType(lhs.Type()); + } } U1 IREmitter::INotEqual(const U32U64& lhs, const U32U64& rhs) { @@ -1734,8 +1762,20 @@ U1 IREmitter::INotEqual(const U32U64& lhs, const U32U64& rhs) { } } -U1 IREmitter::IGreaterThanEqual(const U32& lhs, const U32& rhs, bool is_signed) { - return Inst(is_signed ? Opcode::SGreaterThanEqual : Opcode::UGreaterThanEqual, lhs, rhs); +U1 IREmitter::IGreaterThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed) { + if (lhs.Type() != rhs.Type()) { + UNREACHABLE_MSG("Mismatching types {} and {}", lhs.Type(), rhs.Type()); + } + switch (lhs.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SGreaterThanEqual32 : Opcode::UGreaterThanEqual32, lhs, + rhs); + case Type::U64: + return Inst(is_signed ? 
Opcode::SGreaterThanEqual64 : Opcode::UGreaterThanEqual64, lhs, + rhs); + default: + ThrowInvalidType(lhs.Type()); + } } U1 IREmitter::LogicalOr(const U1& a, const U1& b) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 9e2f79978..119e3752e 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -299,10 +299,10 @@ public: [[nodiscard]] U1 ILessThan(const U32U64& lhs, const U32U64& rhs, bool is_signed); [[nodiscard]] U1 IEqual(const U32U64& lhs, const U32U64& rhs); - [[nodiscard]] U1 ILessThanEqual(const U32& lhs, const U32& rhs, bool is_signed); - [[nodiscard]] U1 IGreaterThan(const U32& lhs, const U32& rhs, bool is_signed); + [[nodiscard]] U1 ILessThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed); + [[nodiscard]] U1 IGreaterThan(const U32U64& lhs, const U32U64& rhs, bool is_signed); [[nodiscard]] U1 INotEqual(const U32U64& lhs, const U32U64& rhs); - [[nodiscard]] U1 IGreaterThanEqual(const U32& lhs, const U32& rhs, bool is_signed); + [[nodiscard]] U1 IGreaterThanEqual(const U32U64& lhs, const U32U64& rhs, bool is_signed); [[nodiscard]] U1 LogicalOr(const U1& a, const U1& b); [[nodiscard]] U1 LogicalAnd(const U1& a, const U1& b); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 8d46a0071..84bdb5739 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -70,7 +70,9 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::BufferAtomicIAdd64: case Opcode::BufferAtomicISub32: case Opcode::BufferAtomicSMin32: + case Opcode::BufferAtomicSMin64: case Opcode::BufferAtomicUMin32: + case Opcode::BufferAtomicUMin64: case Opcode::BufferAtomicFMin32: case Opcode::BufferAtomicSMax32: case Opcode::BufferAtomicSMax64: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 7fc514de9..008f44659 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -124,7 +124,9 @@ OPCODE(BufferAtomicIAdd32, U32, Opaq OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 ) OPCODE(BufferAtomicISub32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 ) +OPCODE(BufferAtomicSMin64, U64, Opaque, Opaque, U64 ) OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 ) +OPCODE(BufferAtomicUMin64, U64, Opaque, Opaque, U64 ) OPCODE(BufferAtomicFMin32, U32, Opaque, Opaque, F32 ) OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMax64, U64, Opaque, Opaque, U64 ) @@ -382,14 +384,20 @@ OPCODE(ULessThan32, U1, U32, OPCODE(ULessThan64, U1, U64, U64, ) OPCODE(IEqual32, U1, U32, U32, ) OPCODE(IEqual64, U1, U64, U64, ) -OPCODE(SLessThanEqual, U1, U32, U32, ) -OPCODE(ULessThanEqual, U1, U32, U32, ) -OPCODE(SGreaterThan, U1, U32, U32, ) -OPCODE(UGreaterThan, U1, U32, U32, ) +OPCODE(SLessThanEqual32, U1, U32, U32, ) +OPCODE(SLessThanEqual64, U1, U64, U64, ) +OPCODE(ULessThanEqual32, U1, U32, U32, ) +OPCODE(ULessThanEqual64, U1, U64, U64, ) +OPCODE(SGreaterThan32, U1, U32, U32, ) +OPCODE(SGreaterThan64, U1, U64, U64, ) +OPCODE(UGreaterThan32, U1, U32, U32, ) +OPCODE(UGreaterThan64, U1, U64, U64, ) OPCODE(INotEqual32, U1, U32, U32, ) OPCODE(INotEqual64, U1, U64, U64, ) -OPCODE(SGreaterThanEqual, U1, U32, U32, ) -OPCODE(UGreaterThanEqual, U1, U32, U32, ) +OPCODE(SGreaterThanEqual32, U1, U32, U32, ) +OPCODE(SGreaterThanEqual64, U1, U64, U64, ) +OPCODE(UGreaterThanEqual32, U1, U32, U32, ) 
+OPCODE(UGreaterThanEqual64, U1, U64, U64, ) // Logical operations OPCODE(LogicalOr, U1, U1, U1, ) diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp index 5c66b1115..2a39d3a2e 100644 --- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp @@ -381,24 +381,42 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { case IR::Opcode::ULessThan64: FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a < b; }); return; - case IR::Opcode::SLessThanEqual: + case IR::Opcode::SLessThanEqual32: FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a <= b; }); return; - case IR::Opcode::ULessThanEqual: + case IR::Opcode::SLessThanEqual64: + FoldWhenAllImmediates(inst, [](s64 a, s64 b) { return a <= b; }); + return; + case IR::Opcode::ULessThanEqual32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a <= b; }); return; - case IR::Opcode::SGreaterThan: + case IR::Opcode::ULessThanEqual64: + FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a <= b; }); + return; + case IR::Opcode::SGreaterThan32: FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a > b; }); return; - case IR::Opcode::UGreaterThan: + case IR::Opcode::SGreaterThan64: + FoldWhenAllImmediates(inst, [](s64 a, s64 b) { return a > b; }); + return; + case IR::Opcode::UGreaterThan32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a > b; }); return; - case IR::Opcode::SGreaterThanEqual: + case IR::Opcode::UGreaterThan64: + FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a > b; }); + return; + case IR::Opcode::SGreaterThanEqual32: FoldWhenAllImmediates(inst, [](s32 a, s32 b) { return a >= b; }); return; - case IR::Opcode::UGreaterThanEqual: + case IR::Opcode::SGreaterThanEqual64: + FoldWhenAllImmediates(inst, [](s64 a, s64 b) { return a >= b; }); + return; + case IR::Opcode::UGreaterThanEqual32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a >= b; }); return; + case IR::Opcode::UGreaterThanEqual64: + FoldWhenAllImmediates(inst, [](u64 a, u64 b) { return a >= b; }); + return; case IR::Opcode::IEqual32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a == b; }); return; diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 57d36f6df..fdae9d3cf 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -19,7 +19,7 @@ void ConstantPropagationPass(IR::BlockList& program); void FlattenExtendedUserdataPass(IR::Program& program); void ReadLaneEliminationPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); -void CollectShaderInfoPass(IR::Program& program); +void CollectShaderInfoPass(IR::Program& program, const Profile& profile); void LowerBufferFormatToRaw(IR::Program& program); void LowerFp64ToFp32(IR::Program& program); void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index ffb785584..d5d140c93 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -20,7 +20,9 @@ bool IsBufferAtomic(const IR::Inst& inst) { case IR::Opcode::BufferAtomicIAdd64: case IR::Opcode::BufferAtomicISub32: case IR::Opcode::BufferAtomicSMin32: + case IR::Opcode::BufferAtomicSMin64: case 
IR::Opcode::BufferAtomicUMin32: + case IR::Opcode::BufferAtomicUMin64: case IR::Opcode::BufferAtomicFMin32: case IR::Opcode::BufferAtomicSMax32: case IR::Opcode::BufferAtomicSMax64: @@ -97,6 +99,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { case IR::Opcode::LoadBufferU64: case IR::Opcode::StoreBufferU64: case IR::Opcode::BufferAtomicIAdd64: + case IR::Opcode::BufferAtomicSMax64: + case IR::Opcode::BufferAtomicSMin64: + case IR::Opcode::BufferAtomicUMax64: + case IR::Opcode::BufferAtomicUMin64: return IR::Type::U64; case IR::Opcode::LoadBufferFormatF32: case IR::Opcode::StoreBufferFormatF32: @@ -118,6 +124,10 @@ u32 BufferAddressShift(const IR::Inst& inst, AmdGpu::DataFormat data_format) { case IR::Opcode::LoadBufferU64: case IR::Opcode::StoreBufferU64: case IR::Opcode::BufferAtomicIAdd64: + case IR::Opcode::BufferAtomicSMax64: + case IR::Opcode::BufferAtomicSMin64: + case IR::Opcode::BufferAtomicUMax64: + case IR::Opcode::BufferAtomicUMin64: return 3; case IR::Opcode::LoadBufferFormatF32: case IR::Opcode::StoreBufferFormatF32: { diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index 59668870b..a87dceb0a 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/config.h" #include "shader_recompiler/ir/program.h" #include "video_core/buffer_cache/buffer_cache.h" @@ -102,7 +103,9 @@ void Visit(Info& info, const IR::Inst& inst) { break; case IR::Opcode::BufferAtomicIAdd64: case IR::Opcode::BufferAtomicSMax64: + case IR::Opcode::BufferAtomicSMin64: case IR::Opcode::BufferAtomicUMax64: + case IR::Opcode::BufferAtomicUMin64: info.uses_buffer_int64_atomics = true; break; case IR::Opcode::LaneId: @@ -136,7 +139,7 @@ void Visit(Info& info, const IR::Inst& inst) { } } -void CollectShaderInfoPass(IR::Program& program) { +void CollectShaderInfoPass(IR::Program& program, const Profile& profile) { auto& info = program.info; for (IR::Block* const block : program.post_order_blocks) { for (IR::Inst& inst : block->Instructions()) { @@ -144,6 +147,25 @@ void CollectShaderInfoPass(IR::Program& program) { } } + // In case Flatbuf has not already been bound by IR and is needed + // to query buffer sizes, bind it now. + if (!profile.supports_robust_buffer_access && !info.uses_dma) { + info.buffers.push_back({ + .used_types = IR::Type::U32, + // We can't guarantee that flatbuf will not grow past UBO + // limit if there are a lot of ReadConsts. (We could specialize) + .inline_cbuf = AmdGpu::Buffer::Placeholder(std::numeric_limits::max()), + .buffer_type = BufferType::Flatbuf, + }); + // In the future we may want to read buffer sizes from GPU memory if available. 
+ // info.readconst_types |= Info::ReadConstType::Immediate; + } + + if (!Config::directMemoryAccess()) { + info.uses_dma = false; + info.readconst_types = Info::ReadConstType::None; + } + if (info.uses_dma) { info.buffers.push_back({ .used_types = IR::Type::U64, diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index e17fb1c9e..2da9e7b01 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -84,7 +84,7 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info Shader::Optimization::IdentityRemovalPass(program.blocks); Shader::Optimization::DeadCodeEliminationPass(program); Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); - Shader::Optimization::CollectShaderInfoPass(program); + Shader::Optimization::CollectShaderInfoPass(program, profile); Shader::IR::DumpProgram(program, info); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 464f02e3a..9b8c28b66 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -72,8 +72,23 @@ Liverpool::~Liverpool() { process_thread.join(); } +void Liverpool::ProcessCommands() { + // Process incoming commands with high priority + while (num_commands) { + Common::UniqueFunction callback{}; + { + std::scoped_lock lk{submit_mutex}; + callback = std::move(command_queue.front()); + command_queue.pop(); + --num_commands; + } + callback(); + } +} + void Liverpool::Process(std::stop_token stoken) { Common::SetCurrentThreadName("shadPS4:GpuCommandProcessor"); + gpu_id = std::this_thread::get_id(); while (!stoken.stop_requested()) { { @@ -90,18 +105,7 @@ void Liverpool::Process(std::stop_token stoken) { curr_qid = -1; while (num_submits || num_commands) { - - // Process incoming commands with high priority - while (num_commands) { - Common::UniqueFunction callback{}; - { - std::unique_lock lk{submit_mutex}; - callback = std::move(command_queue.front()); - command_queue.pop(); - --num_commands; - } - callback(); - } + ProcessCommands(); curr_qid = (curr_qid + 1) % num_mapped_queues; @@ -147,6 +151,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { FIBER_ENTER(ccb_task_name); while (!ccb.empty()) { + ProcessCommands(); + const auto* header = reinterpret_cast(ccb.data()); const u32 type = header->type; if (type != 3) { @@ -224,6 +230,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(dcb.data()); while (!dcb.empty()) { + ProcessCommands(); + const auto* header = reinterpret_cast(dcb.data()); const u32 type = header->type; @@ -638,9 +646,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spansrc_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineData(dma_data->dst_addr_lo, - dma_data->SrcAddress(), - dma_data->NumBytes(), true); + rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress(), + dma_data->NumBytes(), true, false); } else if (dma_data->src_sel == DmaDataSrc::Data && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { @@ -649,14 +656,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spansrc_sel == DmaDataSrc::Gds && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - // LOG_WARNING(Render_Vulkan, "GDS memory read"); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->src_addr_lo, + dma_data->NumBytes(), 
false, true); } else if ((dma_data->src_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - rasterizer->InlineData(dma_data->DstAddress(), - dma_data->SrcAddress(), - dma_data->NumBytes(), false); + rasterizer->CopyBuffer(dma_data->DstAddress(), + dma_data->SrcAddress(), dma_data->NumBytes(), + false, false); } else { UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); @@ -702,6 +710,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); while (!rewind->Valid()) { YIELD_GFX(); @@ -801,29 +812,32 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span -Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid) { +Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { FIBER_ENTER(acb_task_name[vqid]); auto& queue = asc_queues[{vqid}]; - auto base_addr = reinterpret_cast(acb); - while (acb_dwords > 0) { - auto* header = reinterpret_cast(acb); + auto base_addr = reinterpret_cast(acb.data()); + while (!acb.empty()) { + ProcessCommands(); + + auto* header = reinterpret_cast(acb.data()); u32 next_dw_off = header->type3.NumWords() + 1; // If we have a buffered packet, use it. if (queue.tmp_dwords > 0) [[unlikely]] { header = reinterpret_cast(queue.tmp_packet.data()); next_dw_off = header->type3.NumWords() + 1 - queue.tmp_dwords; - std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb, next_dw_off * sizeof(u32)); + std::memcpy(queue.tmp_packet.data() + queue.tmp_dwords, acb.data(), + next_dw_off * sizeof(u32)); queue.tmp_dwords = 0; } // If the packet is split across ring boundary, buffer until next submission - if (next_dw_off > acb_dwords) [[unlikely]] { - std::memcpy(queue.tmp_packet.data(), acb, acb_dwords * sizeof(u32)); - queue.tmp_dwords = acb_dwords; + if (next_dw_off > acb.size()) [[unlikely]] { + std::memcpy(queue.tmp_packet.data(), acb.data(), acb.size_bytes()); + queue.tmp_dwords = acb.size(); if constexpr (!is_indirect) { - *queue.read_addr += acb_dwords; + *queue.read_addr += acb.size(); *queue.read_addr %= queue.ring_size_dw; } break; @@ -832,9 +846,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq if (header->type == 2) { // Type-2 packet are used for padding purposes next_dw_off = 1; - acb += next_dw_off; - acb_dwords -= next_dw_off; - + acb = NextPacket(acb, next_dw_off); if constexpr (!is_indirect) { *queue.read_addr += next_dw_off; *queue.read_addr %= queue.ring_size_dw; @@ -856,8 +868,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq } case PM4ItOpcode::IndirectBuffer: { const auto* indirect_buffer = reinterpret_cast(header); - auto task = ProcessCompute(indirect_buffer->Address(), - indirect_buffer->ib_size, vqid); + auto task = ProcessCompute( + {indirect_buffer->Address(), indirect_buffer->ib_size}, vqid); RESUME_ASC(task, vqid); while (!task.handle.done()) { @@ -876,8 +888,8 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq } else if ((dma_data->src_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress(), - dma_data->NumBytes(), true); + rasterizer->CopyBuffer(dma_data->dst_addr_lo, dma_data->SrcAddress(), + dma_data->NumBytes(), true, false); } else if 
(dma_data->src_sel == DmaDataSrc::Data && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { @@ -886,14 +898,14 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq } else if (dma_data->src_sel == DmaDataSrc::Gds && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - // LOG_WARNING(Render_Vulkan, "GDS memory read"); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->src_addr_lo, + dma_data->NumBytes(), false, true); } else if ((dma_data->src_sel == DmaDataSrc::Memory || dma_data->src_sel == DmaDataSrc::MemoryUsingL2) && (dma_data->dst_sel == DmaDataDst::Memory || dma_data->dst_sel == DmaDataDst::MemoryUsingL2)) { - rasterizer->InlineData(dma_data->DstAddress(), - dma_data->SrcAddress(), dma_data->NumBytes(), - false); + rasterizer->CopyBuffer(dma_data->DstAddress(), dma_data->SrcAddress(), + dma_data->NumBytes(), false, false); } else { UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); @@ -904,6 +916,9 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq break; } case PM4ItOpcode::Rewind: { + if (!rasterizer) { + break; + } const PM4CmdRewind* rewind = reinterpret_cast(header); while (!rewind->Valid()) { YIELD_ASC(vqid); @@ -1016,8 +1031,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq static_cast(opcode), header->type3.NumWords()); } - acb += next_dw_off; - acb_dwords -= next_dw_off; + acb = NextPacket(acb, next_dw_off); if constexpr (!is_indirect) { *queue.read_addr += next_dw_off; @@ -1087,7 +1101,7 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span acb) { auto& queue = mapped_queues[gnm_vqid]; const auto vqid = gnm_vqid - 1; - const auto& task = ProcessCompute(acb.data(), acb.size(), vqid); + const auto& task = ProcessCompute(acb, vqid); { std::scoped_lock lock{queue.m_access}; queue.submits.emplace(task.handle); diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index d88a44375..0613823ab 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1512,14 +1513,32 @@ public: rasterizer = rasterizer_; } - void SendCommand(Common::UniqueFunction&& func) { - std::scoped_lock lk{submit_mutex}; - command_queue.emplace(std::move(func)); - ++num_commands; - submit_cv.notify_one(); + template + void SendCommand(auto&& func) { + if (std::this_thread::get_id() == gpu_id) { + return func(); + } + if constexpr (wait_done) { + std::binary_semaphore sem{0}; + { + std::scoped_lock lk{submit_mutex}; + command_queue.emplace([&sem, &func] { + func(); + sem.release(); + }); + ++num_commands; + submit_cv.notify_one(); + } + sem.acquire(); + } else { + std::scoped_lock lk{submit_mutex}; + command_queue.emplace(std::move(func)); + ++num_commands; + submit_cv.notify_one(); + } } - void reserveCopyBufferSpace() { + void ReserveCopyBufferSpace() { GpuQueue& gfx_queue = mapped_queues[GfxQueueId]; std::scoped_lock lk(gfx_queue.m_access); @@ -1581,8 +1600,9 @@ private: Task ProcessGraphics(std::span dcb, std::span ccb); Task ProcessCeUpdate(std::span ccb); template - Task ProcessCompute(const u32* acb, u32 acb_dwords, u32 vqid); + Task ProcessCompute(std::span acb, u32 vqid); + void ProcessCommands(); void Process(std::stop_token stoken); struct GpuQueue { @@ -1626,6 +1646,7 @@ private: std::mutex 
submit_mutex; std::condition_variable_any submit_cv; std::queue> command_queue{}; + std::thread::id gpu_id; int curr_qid{-1}; }; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 23f9dc0bc..d55e05d1e 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include #include "common/alignment.h" #include "common/debug.h" #include "common/scope_exit.h" @@ -9,6 +10,7 @@ #include "core/memory.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/memory_tracker.h" #include "video_core/host_shaders/fault_buffer_process_comp.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_instance.h" @@ -27,10 +29,10 @@ static constexpr size_t DeviceBufferSize = 128_MB; static constexpr size_t MaxPageFaults = 1024; BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, - Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_, - TextureCache& texture_cache_, PageManager& tracker_) - : instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_}, - memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_}, + AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_, + PageManager& tracker) + : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, + memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize}, stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize}, @@ -38,13 +40,14 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize}, bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, BDA_PAGETABLE_SIZE}, - fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE), - memory_tracker{tracker} { + fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE) { Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer"); Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(), "BDA Page Table Buffer"); Vulkan::SetObjectName(instance.GetDevice(), fault_buffer.Handle(), "Fault Buffer"); + memory_tracker = std::make_unique(tracker); + // Ensure the first slot is used for the null buffer const auto null_id = slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16); @@ -129,22 +132,26 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s BufferCache::~BufferCache() = default; -void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) { - const bool is_tracked = IsRegionRegistered(device_addr, size); - if (is_tracked) { - // Mark the page as CPU modified to stop tracking writes. 
- memory_tracker.MarkRegionAsCpuModified(device_addr, size); - - if (unmap) { - return; - } +void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { + if (!IsRegionRegistered(device_addr, size)) { + return; } + memory_tracker->InvalidateRegion( + device_addr, size, Config::readbacks(), + [this, device_addr, size] { ReadMemory(device_addr, size, true); }); } -void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) { +void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) { + liverpool->SendCommand([this, device_addr, size, is_write] { + Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)]; + DownloadBufferMemory(buffer, device_addr, size, is_write); + }); +} + +void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write) { boost::container::small_vector copies; u64 total_size_bytes = 0; - memory_tracker.ForEachDownloadRange( + memory_tracker->ForEachDownloadRange( device_addr, size, [&](u64 device_addr_out, u64 range_size) { const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { @@ -155,7 +162,10 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si .dstOffset = total_size_bytes, .size = new_size, }); - total_size_bytes += new_size; + // Align up to avoid cache conflicts + constexpr u64 align = 64ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; }; gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download); gpu_modified_ranges.Subtract(device_addr_out, range_size); @@ -173,10 +183,16 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies); scheduler.Finish(); + auto* memory = Core::Memory::Instance(); for (const auto& copy : copies) { const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset; const u64 dst_offset = copy.dstOffset - offset; - std::memcpy(std::bit_cast(copy_device_addr), download + dst_offset, copy.size); + memory->TryWriteBacking(std::bit_cast(copy_device_addr), download + dst_offset, + copy.size); + } + memory_tracker->UnmarkRegionAsGpuModified(device_addr, size); + if (is_write) { + memory_tracker->MarkRegionAsCpuModified(device_addr, size); } } @@ -296,9 +312,11 @@ void BufferCache::BindIndexBuffer(u32 index_offset) { void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); - if (!is_gds && !IsRegionGpuModified(address, num_bytes)) { - memcpy(std::bit_cast(address), value, num_bytes); - return; + if (!is_gds) { + ASSERT(memory->TryWriteBacking(std::bit_cast(address), value, num_bytes)); + if (!IsRegionRegistered(address, num_bytes)) { + return; + } } Buffer* buffer = [&] { if (is_gds) { @@ -326,25 +344,107 @@ void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, boo WriteDataBuffer(*buffer, address, value, num_bytes); } +void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) { + if (!dst_gds && !IsRegionGpuModified(dst, num_bytes)) { + if (!src_gds && !IsRegionGpuModified(src, num_bytes)) { + // Both buffers were not transferred to GPU yet. Can safely copy in host memory. 
+ memcpy(std::bit_cast(dst), std::bit_cast(src), num_bytes); + return; + } + // Without a readback there's nothing we can do with this + // Fallback to creating dst buffer on GPU to at least have this data there + } + auto& src_buffer = [&] -> const Buffer& { + if (src_gds) { + return gds_buffer; + } + // Avoid using ObtainBuffer here as that might give us the stream buffer. + const BufferId buffer_id = FindBuffer(src, num_bytes); + auto& buffer = slot_buffers[buffer_id]; + SynchronizeBuffer(buffer, src, num_bytes, false, false); + return buffer; + }(); + auto& dst_buffer = [&] -> const Buffer& { + if (dst_gds) { + return gds_buffer; + } + // Prefer using ObtainBuffer here as that will auto-mark the region as GPU modified. + const auto [buffer, offset] = ObtainBuffer(dst, num_bytes, true); + return *buffer; + }(); + vk::BufferCopy region{ + .srcOffset = src_buffer.Offset(src), + .dstOffset = dst_buffer.Offset(dst), + .size = num_bytes, + }; + const vk::BufferMemoryBarrier2 buf_barriers_before[2] = { + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = dst_buffer.Handle(), + .offset = dst_buffer.Offset(dst), + .size = num_bytes, + }, + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eTransferRead, + .buffer = src_buffer.Handle(), + .offset = src_buffer.Offset(src), + .size = num_bytes, + }, + }; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 2, + .pBufferMemoryBarriers = buf_barriers_before, + }); + cmdbuf.copyBuffer(src_buffer.Handle(), dst_buffer.Handle(), region); + const vk::BufferMemoryBarrier2 buf_barriers_after[2] = { + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, + .buffer = dst_buffer.Handle(), + .offset = dst_buffer.Offset(dst), + .size = num_bytes, + }, + { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eTransferRead, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryWrite, + .buffer = src_buffer.Handle(), + .offset = src_buffer.Offset(src), + .size = num_bytes, + }, + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 2, + .pBufferMemoryBarriers = buf_barriers_after, + }); +} + std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer, BufferId buffer_id) { - // For small uniform buffers that have not been modified by gpu - // use device local stream buffer to reduce renderpass breaks. - // Maybe we want to modify the threshold now that the page size is 16KB? - static constexpr u64 StreamThreshold = CACHING_PAGESIZE; - const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size); - if (!is_written && size <= StreamThreshold && !is_gpu_dirty) { + // For read-only buffers use device local stream buffer to reduce renderpass breaks. 
+ if (!is_written && size <= CACHING_PAGESIZE && !IsRegionGpuModified(device_addr, size)) { const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment()); return {&stream_buffer, offset}; } - - if (!buffer_id || slot_buffers[buffer_id].is_deleted) { + if (IsBufferInvalid(buffer_id)) { buffer_id = FindBuffer(device_addr, size); } Buffer& buffer = slot_buffers[buffer_id]; - SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer); + SynchronizeBuffer(buffer, device_addr, size, is_written, is_texel_buffer); if (is_written) { - memory_tracker.MarkRegionAsGpuModified(device_addr, size); gpu_modified_ranges.Add(device_addr, size); } return {&buffer, buffer.Offset(device_addr)}; @@ -352,21 +452,17 @@ std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b std::pair BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) { // Check if any buffer contains the full requested range. - const u64 page = gpu_addr >> CACHING_PAGEBITS; - const BufferId buffer_id = page_table[page].buffer_id; + const BufferId buffer_id = page_table[gpu_addr >> CACHING_PAGEBITS].buffer_id; if (buffer_id) { - Buffer& buffer = slot_buffers[buffer_id]; - if (buffer.IsInBounds(gpu_addr, size)) { - SynchronizeBuffer(buffer, gpu_addr, size, false); + if (Buffer& buffer = slot_buffers[buffer_id]; buffer.IsInBounds(gpu_addr, size)) { + SynchronizeBuffer(buffer, gpu_addr, size, false, false); return {&buffer, buffer.Offset(gpu_addr)}; } } - // If no buffer contains the full requested range but some buffer within was GPU-modified, - // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications. - if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) { + // If some buffer within was GPU modified create a full buffer to avoid losing GPU data. + if (IsRegionGpuModified(gpu_addr, size)) { return ObtainBuffer(gpu_addr, size, false, false); } - // In all other cases, just do a CPU copy to the staging buffer. 
const auto [data, offset] = staging_buffer.Map(size, 16); memory->CopySparseMemory(gpu_addr, data, size); @@ -380,11 +476,11 @@ bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) { } bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) { - return memory_tracker.IsRegionCpuModified(addr, size); + return memory_tracker->IsRegionCpuModified(addr, size); } bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) { - return memory_tracker.IsRegionGpuModified(addr, size); + return memory_tracker->IsRegionGpuModified(addr, size); } BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) { @@ -718,56 +814,27 @@ void BufferCache::ChangeRegister(BufferId buffer_id) { } } -void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, +void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer) { boost::container::small_vector copies; - u64 total_size_bytes = 0; VAddr buffer_start = buffer.CpuAddr(); - memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) { - copies.push_back(vk::BufferCopy{ - .srcOffset = total_size_bytes, - .dstOffset = device_addr_out - buffer_start, - .size = range_size, + memory_tracker->ForEachUploadRange( + device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) { + const u64 offset = staging_buffer.Copy(device_addr_out, range_size); + copies.push_back(vk::BufferCopy{ + .srcOffset = offset, + .dstOffset = device_addr_out - buffer_start, + .size = range_size, + }); }); - total_size_bytes += range_size; - }); SCOPE_EXIT { if (is_texel_buffer) { SynchronizeBufferFromImage(buffer, device_addr, size); } }; - if (total_size_bytes == 0) { + if (copies.empty()) { return; } - vk::Buffer src_buffer = staging_buffer.Handle(); - if (total_size_bytes < StagingBufferSize) { - const auto [staging, offset] = staging_buffer.Map(total_size_bytes); - for (auto& copy : copies) { - u8* const src_pointer = staging + copy.srcOffset; - const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; - std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); - // Apply the staging offset - copy.srcOffset += offset; - } - staging_buffer.Commit(); - } else { - // For large one time transfers use a temporary host buffer. - // RenderDoc can lag quite a bit if the stream buffer is too large. 
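The removed block just below (a temporary host buffer for large one-time transfers) is what the new SynchronizeBuffer path makes unnecessary: each CPU-dirty range is copied straight into the staging stream buffer and becomes one vk::BufferCopy. The condensed sketch here mirrors the shape of that collection step with a stand-in Staging type for StreamBuffer; it is illustrative and not from the patch.

#include <cstdint>
#include <cstring>
#include <span>
#include <utility>
#include <vector>
#include <vulkan/vulkan.hpp>

// Stand-in for StreamBuffer: Copy() appends `size` bytes read from the guest
// address and returns the offset of that data inside the staging allocation.
struct Staging {
    std::vector<std::uint8_t> storage;
    std::uint64_t Copy(std::uint64_t guest_addr, std::uint64_t size) {
        const std::uint64_t offset = storage.size();
        storage.resize(offset + size);
        std::memcpy(storage.data() + offset, reinterpret_cast<const void*>(guest_addr), size);
        return offset;
    }
};

// Builds the copy list the same way the ForEachUploadRange callback above does:
// srcOffset is where the range landed in staging, dstOffset is relative to the
// start of the destination buffer.
std::vector<vk::BufferCopy> CollectUploads(
    std::uint64_t buffer_start, Staging& staging,
    std::span<const std::pair<std::uint64_t, std::uint64_t>> dirty_ranges) {
    std::vector<vk::BufferCopy> copies;
    for (const auto& [range_addr, range_size] : dirty_ranges) {
        const std::uint64_t src_offset = staging.Copy(range_addr, range_size);
        copies.push_back(vk::BufferCopy{src_offset, range_addr - buffer_start, range_size});
    }
    // If `copies` is empty there is nothing to upload; otherwise the caller records a
    // single cmdbuf.copyBuffer(staging, buffer, copies) between the barriers shown above.
    return copies;
}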
- Buffer temp_buffer{instance, - scheduler, - MemoryUsage::Upload, - 0, - vk::BufferUsageFlagBits::eTransferSrc, - total_size_bytes}; - src_buffer = temp_buffer.Handle(); - u8* const staging = temp_buffer.mapped_data.data(); - for (auto& copy : copies) { - u8* const src_pointer = staging + copy.srcOffset; - const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; - std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); - } - scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable {}); - } scheduler.EndRendering(); const auto cmdbuf = scheduler.CommandBuffer(); const vk::BufferMemoryBarrier2 pre_barrier = { @@ -794,7 +861,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, .bufferMemoryBarrierCount = 1, .pBufferMemoryBarriers = &pre_barrier, }); - cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); + cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -925,7 +992,7 @@ void BufferCache::SynchronizeBuffersInRange(VAddr device_addr, u64 size) { VAddr start = std::max(buffer.CpuAddr(), device_addr); VAddr end = std::min(buffer.CpuAddr() + buffer.SizeBytes(), device_addr_end); u32 size = static_cast(end - start); - SynchronizeBuffer(buffer, start, size, false); + SynchronizeBuffer(buffer, start, size, false, false); }); } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 651ba84dc..900a27aee 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -9,7 +9,6 @@ #include "common/slot_vector.h" #include "common/types.h" #include "video_core/buffer_cache/buffer.h" -#include "video_core/buffer_cache/memory_tracker.h" #include "video_core/buffer_cache/range_set.h" #include "video_core/multi_level_page_table.h" @@ -21,13 +20,6 @@ namespace Core { class MemoryManager; } -namespace Shader { -namespace Gcn { -struct FetchShaderData; -} -struct Info; -} // namespace Shader - namespace Vulkan { class GraphicsPipeline; } @@ -39,6 +31,8 @@ using BufferId = Common::SlotId; static constexpr BufferId NULL_BUFFER_ID{0}; class TextureCache; +class MemoryTracker; +class PageManager; class BufferCache { public: @@ -69,10 +63,16 @@ public: bool has_stream_leap = false; }; + using IntervalSet = + boost::icl::interval_set; + using IntervalType = typename IntervalSet::interval_type; + public: explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool, - TextureCache& texture_cache, PageManager& tracker); + AmdGpu::Liverpool* liverpool, TextureCache& texture_cache, + PageManager& tracker); ~BufferCache(); /// Returns a pointer to GDS device local buffer. @@ -110,7 +110,10 @@ public: } /// Invalidates any buffer in the logical page range. - void InvalidateMemory(VAddr device_addr, u64 size, bool unmap); + void InvalidateMemory(VAddr device_addr, u64 size); + + /// Waits on pending downloads in the logical page range. + void ReadMemory(VAddr device_addr, u64 size, bool is_write = false); /// Binds host vertex buffers for the current draw. void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline); @@ -124,6 +127,9 @@ public: /// Writes a value to GPU buffer. 
(uses staging buffer to temporarily store the data) void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds); + /// Performs buffer to buffer data copy on the GPU. + void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); + /// Obtains a buffer for the specified region. [[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written, bool is_texel_buffer = false, @@ -166,7 +172,11 @@ private: }); } - void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size); + inline bool IsBufferInvalid(BufferId buffer_id) const { + return !buffer_id || slot_buffers[buffer_id].is_deleted; + } + + void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size, bool is_write); [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size); @@ -181,7 +191,8 @@ private: template void ChangeRegister(BufferId buffer_id); - void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_texel_buffer); + void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, + bool is_texel_buffer); bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size); @@ -193,11 +204,10 @@ private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; - Vulkan::Rasterizer& rasterizer; AmdGpu::Liverpool* liverpool; Core::MemoryManager* memory; TextureCache& texture_cache; - PageManager& tracker; + std::unique_ptr memory_tracker; StreamBuffer staging_buffer; StreamBuffer stream_buffer; StreamBuffer download_buffer; @@ -209,7 +219,6 @@ private: Common::SlotVector slot_buffers; RangeSet gpu_modified_ranges; SplitRangeMap buffer_ranges; - MemoryTracker memory_tracker; PageTable page_table; vk::UniqueDescriptorSetLayout fault_process_desc_layout; vk::UniquePipeline fault_process_pipeline; diff --git a/src/video_core/buffer_cache/memory_tracker.h b/src/video_core/buffer_cache/memory_tracker.h index 3dbffdabd..ca87c7df0 100644 --- a/src/video_core/buffer_cache/memory_tracker.h +++ b/src/video_core/buffer_cache/memory_tracker.h @@ -27,6 +27,7 @@ public: bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { return IteratePages( query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; return manager->template IsRegionModified(offset, size); }); } @@ -35,6 +36,7 @@ public: bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { return IteratePages( query_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; return manager->template IsRegionModified(offset, size); }); } @@ -43,26 +45,57 @@ public: void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { IteratePages(dirty_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; manager->template ChangeRegionState( manager->GetCpuAddr() + offset, size); }); } - /// Mark region as modified from the host GPU - void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + /// Unmark region as modified from the host GPU + void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { IteratePages(dirty_cpu_addr, query_size, [](RegionManager* manager, u64 offset, size_t size) { - manager->template ChangeRegionState( + std::scoped_lock lk{manager->lock}; + manager->template ChangeRegionState( manager->GetCpuAddr() + offset, size); }); } + /// Removes all protection from a page and 
ensures GPU data has been flushed if requested + void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept { + IteratePages( + cpu_addr, size, + [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) { + const bool should_flush = [&] { + // Perform both the GPU modification check and CPU state change with the lock + // in case we are racing with GPU thread trying to mark the page as GPU + // modified. If we need to flush the flush function is going to perform CPU + // state change. + std::scoped_lock lk{manager->lock}; + if (try_flush && manager->template IsRegionModified(offset, size)) { + return true; + } + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + return false; + }(); + if (should_flush) { + on_flush(); + } + }); + } + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified - void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, auto&& func) { + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) { IteratePages(query_cpu_range, query_size, - [&func](RegionManager* manager, u64 offset, size_t size) { + [&func, is_written](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; manager->template ForEachModifiedRange( manager->GetCpuAddr() + offset, size, func); + if (is_written) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + } }); } @@ -71,6 +104,7 @@ public: void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, auto&& func) { IteratePages(query_cpu_range, query_size, [&func](RegionManager* manager, u64 offset, size_t size) { + std::scoped_lock lk{manager->lock}; manager->template ForEachModifiedRange( manager->GetCpuAddr() + offset, size, func); }); diff --git a/src/video_core/buffer_cache/region_definitions.h b/src/video_core/buffer_cache/region_definitions.h index f035704d9..76e7ee263 100644 --- a/src/video_core/buffer_cache/region_definitions.h +++ b/src/video_core/buffer_cache/region_definitions.h @@ -3,7 +3,6 @@ #pragma once -#include #include "common/bit_array.h" #include "common/types.h" @@ -20,9 +19,8 @@ constexpr u64 NUM_PAGES_PER_REGION = TRACKER_HIGHER_PAGE_SIZE / TRACKER_BYTES_PE enum class Type { CPU, GPU, - Writeable, }; using RegionBits = Common::BitArray; -} // namespace VideoCore \ No newline at end of file +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/region_manager.h b/src/video_core/buffer_cache/region_manager.h index e8ec21129..608b16fb3 100644 --- a/src/video_core/buffer_cache/region_manager.h +++ b/src/video_core/buffer_cache/region_manager.h @@ -3,9 +3,9 @@ #pragma once -#include -#include +#include "common/config.h" #include "common/div_ceil.h" +#include "common/logging/log.h" #ifdef __linux__ #include "common/adaptive_mutex.h" @@ -19,8 +19,14 @@ namespace VideoCore { +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP +using LockType = Common::AdaptiveMutex; +#else +using LockType = Common::SpinLock; +#endif + /** - * Allows tracking CPU and GPU modification of pages in a contigious 4MB virtual address region. + * Allows tracking CPU and GPU modification of pages in a contigious 16MB virtual address region. * Information is stored in bitsets for spacial locality and fast update of single pages. 
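The InvalidateRegion helper above performs the GPU-dirty check and the CPU state change under the per-region lock, then runs the flush callback only after the lock has been released, since flushing may stall on the GPU. A minimal sketch of that idiom, with plain booleans standing in for the page bitsets, follows; it is illustrative only and not part of the patch.

#include <mutex>

struct Region {
    std::mutex lock;
    bool gpu_dirty = false;
    bool cpu_dirty = false;
};

template <typename OnFlush>
void InvalidateRegionSketch(Region& region, bool try_flush, OnFlush&& on_flush) {
    const bool should_flush = [&] {
        std::scoped_lock lk{region.lock};
        if (try_flush && region.gpu_dirty) {
            // Leave the CPU state alone; the flush path re-marks it once the
            // downloaded data has been written back to guest memory.
            return true;
        }
        region.cpu_dirty = true; // Stands in for ChangeRegionState on the CPU bits.
        return false;
    }();
    if (should_flush) {
        on_flush(); // Runs without holding region.lock, so it may wait on the GPU.
    }
}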
*/ class RegionManager { @@ -30,6 +36,7 @@ public: cpu.Fill(); gpu.Clear(); writeable.Fill(); + readable.Fill(); } explicit RegionManager() = default; @@ -47,29 +54,19 @@ public: template RegionBits& GetRegionBits() noexcept { - static_assert(type != Type::Writeable); if constexpr (type == Type::CPU) { return cpu; } else if constexpr (type == Type::GPU) { return gpu; - } else if constexpr (type == Type::Writeable) { - return writeable; - } else { - static_assert(false, "Invalid type"); } } template const RegionBits& GetRegionBits() const noexcept { - static_assert(type != Type::Writeable); if constexpr (type == Type::CPU) { return cpu; } else if constexpr (type == Type::GPU) { return gpu; - } else if constexpr (type == Type::Writeable) { - return writeable; - } else { - static_assert(false, "Invalid type"); } } @@ -89,8 +86,6 @@ public: if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) { return; } - std::scoped_lock lk{lock}; - static_assert(type != Type::Writeable); RegionBits& bits = GetRegionBits(); if constexpr (enable) { @@ -99,7 +94,9 @@ public: bits.UnsetRange(start_page, end_page); } if constexpr (type == Type::CPU) { - UpdateProtection(); + UpdateProtection(); + } else if (Config::readbacks()) { + UpdateProtection(); } } @@ -121,27 +118,22 @@ public: if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) { return; } - std::scoped_lock lk{lock}; - static_assert(type != Type::Writeable); RegionBits& bits = GetRegionBits(); RegionBits mask(bits, start_page, end_page); - // TODO: this will not be needed once we handle readbacks - if constexpr (type == Type::GPU) { - mask &= ~writeable; + if constexpr (clear) { + bits.UnsetRange(start_page, end_page); + if constexpr (type == Type::CPU) { + UpdateProtection(); + } else if (Config::readbacks()) { + UpdateProtection(); + } } for (const auto& [start, end] : mask) { func(cpu_addr + start * TRACKER_BYTES_PER_PAGE, (end - start) * TRACKER_BYTES_PER_PAGE); } - - if constexpr (clear) { - bits.UnsetRange(start_page, end_page); - if constexpr (type == Type::CPU) { - UpdateProtection(); - } - } } /** @@ -151,7 +143,7 @@ public: * @param size Size in bytes of the region to query for modifications */ template - [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) noexcept { RENDERER_TRACE; const size_t start_page = SanitizeAddress(offset) / TRACKER_BYTES_PER_PAGE; const size_t end_page = @@ -159,20 +151,14 @@ public: if (start_page >= NUM_PAGES_PER_REGION || end_page <= start_page) { return false; } - // std::scoped_lock lk{lock}; // Is this needed? - static_assert(type != Type::Writeable); const RegionBits& bits = GetRegionBits(); RegionBits test(bits, start_page, end_page); - - // TODO: this will not be needed once we handle readbacks - if constexpr (type == Type::GPU) { - test &= ~writeable; - } - return test.Any(); } + LockType lock; + private: /** * Notify tracker about changes in the CPU tracking state of a word in the buffer @@ -181,31 +167,29 @@ private: * @param current_bits Current state of the word * @param new_bits New state of the word * - * @tparam add_to_tracker True when the tracker should start tracking the new pages + * @tparam track True when the tracker should start tracking the new pages */ - template + template void UpdateProtection() { RENDERER_TRACE; - RegionBits mask = cpu ^ writeable; - + RegionBits mask = is_read ? 
(~gpu ^ readable) : (cpu ^ writeable); if (mask.None()) { - return; // No changes to the CPU tracking state + return; } - - writeable = cpu; - tracker->UpdatePageWatchersForRegion(cpu_addr, mask); + if constexpr (is_read) { + readable = ~gpu; + } else { + writeable = cpu; + } + tracker->UpdatePageWatchersForRegion(cpu_addr, mask); } -#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP - Common::AdaptiveMutex lock; -#else - Common::SpinLock lock; -#endif PageManager* tracker; VAddr cpu_addr = 0; RegionBits cpu; RegionBits gpu; RegionBits writeable; + RegionBits readable; }; } // namespace VideoCore diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp index 15dbf909c..63297bfdc 100644 --- a/src/video_core/page_manager.cpp +++ b/src/video_core/page_manager.cpp @@ -13,6 +13,7 @@ #ifndef _WIN64 #include +#include "common/adaptive_mutex.h" #ifdef ENABLE_USERFAULTFD #include #include @@ -23,6 +24,7 @@ #endif #else #include +#include "common/spin_lock.h" #endif #ifdef __linux__ @@ -38,22 +40,45 @@ constexpr size_t PAGE_BITS = 12; struct PageManager::Impl { struct PageState { - u8 num_watchers{}; + u8 num_write_watchers : 7; + // At the moment only buffer cache can request read watchers. + // And buffers cannot overlap, thus only 1 can exist per page. + u8 num_read_watchers : 1; - Core::MemoryPermission Perm() const noexcept { - return num_watchers == 0 ? Core::MemoryPermission::ReadWrite - : Core::MemoryPermission::Read; + Core::MemoryPermission WritePerm() const noexcept { + return num_write_watchers == 0 ? Core::MemoryPermission::Write + : Core::MemoryPermission::None; } - template + Core::MemoryPermission ReadPerm() const noexcept { + return num_read_watchers == 0 ? Core::MemoryPermission::Read + : Core::MemoryPermission::None; + } + + Core::MemoryPermission Perms() const noexcept { + return ReadPerm() | WritePerm(); + } + + template u8 AddDelta() { - if constexpr (delta == 1) { - return ++num_watchers; - } else if constexpr (delta == -1) { - ASSERT_MSG(num_watchers > 0, "Not enough watchers"); - return --num_watchers; + if constexpr (is_read) { + if constexpr (delta == 1) { + return ++num_read_watchers; + } else if (delta == -1) { + ASSERT_MSG(num_read_watchers > 0, "Not enough watchers"); + return --num_read_watchers; + } else { + return num_read_watchers; + } } else { - return num_watchers; + if constexpr (delta == 1) { + return ++num_write_watchers; + } else if (delta == -1) { + ASSERT_MSG(num_write_watchers > 0, "Not enough watchers"); + return --num_write_watchers; + } else { + return num_write_watchers; + } } } }; @@ -176,19 +201,23 @@ struct PageManager::Impl { RENDERER_TRACE; auto* memory = Core::Memory::Instance(); auto& impl = memory->GetAddressSpace(); + ASSERT_MSG(perms != Core::MemoryPermission::Write, + "Attempted to protect region as write-only which is not a valid permission"); impl.Protect(address, size, perms); } static bool GuestFaultSignalHandler(void* context, void* fault_address) { const auto addr = reinterpret_cast(fault_address); if (Common::IsWriteError(context)) { - return rasterizer->InvalidateMemory(addr, 1); + return rasterizer->InvalidateMemory(addr, 8); + } else { + return rasterizer->ReadMemory(addr, 8); } return false; } - #endif - template + + template void UpdatePageWatchers(VAddr addr, u64 size) { RENDERER_TRACE; @@ -200,7 +229,7 @@ struct PageManager::Impl { const auto lock_end = locks.begin() + Common::DivCeil(page_end, PAGES_PER_LOCK); Common::RangeLockGuard lk(lock_start, lock_end); - auto perms = cached_pages[page].Perm(); + auto 
perms = cached_pages[page].Perms(); u64 range_begin = 0; u64 range_bytes = 0; u64 potential_range_bytes = 0; @@ -226,9 +255,9 @@ struct PageManager::Impl { PageState& state = cached_pages[page]; // Apply the change to the page state - const u8 new_count = state.AddDelta(); + const u8 new_count = state.AddDelta(); - if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] { + if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] { // If the protection changed add pending (un)protect action release_pending(); perms = new_perms; @@ -253,25 +282,23 @@ struct PageManager::Impl { release_pending(); } - template + template void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) { RENDERER_TRACE; auto start_range = mask.FirstRange(); auto end_range = mask.LastRange(); if (start_range.second == end_range.second) { - // Optimization: if all pages are contiguous, use the regular UpdatePageWatchers + // if all pages are contiguous, use the regular UpdatePageWatchers const VAddr start_addr = base_addr + (start_range.first << PAGE_BITS); const u64 size = (start_range.second - start_range.first) << PAGE_BITS; - - UpdatePageWatchers(start_addr, size); - return; + return UpdatePageWatchers(start_addr, size); } size_t base_page = (base_addr >> PAGE_BITS); ASSERT(base_page % PAGES_PER_LOCK == 0); std::scoped_lock lk(locks[base_page / PAGES_PER_LOCK]); - auto perms = cached_pages[base_page + start_range.first].Perm(); + auto perms = cached_pages[base_page + start_range.first].Perms(); u64 range_begin = 0; u64 range_bytes = 0; u64 potential_range_bytes = 0; @@ -292,9 +319,10 @@ struct PageManager::Impl { const bool update = mask.Get(page); // Apply the change to the page state - const u8 new_count = update ? state.AddDelta() : state.AddDelta<0>(); + const u8 new_count = + update ? 
state.AddDelta() : state.AddDelta<0, is_read>(); - if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] { + if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] { // If the protection changed add pending (un)protect action release_pending(); perms = new_perms; @@ -348,19 +376,23 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) { template void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const { - impl->UpdatePageWatchers(addr, size); + impl->UpdatePageWatchers(addr, size); } -template +template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const { - impl->UpdatePageWatchersForRegion(base_addr, mask); + impl->UpdatePageWatchersForRegion(base_addr, mask); } template void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const; template void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const; -template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, - RegionBits& mask) const; -template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, - RegionBits& mask) const; +template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, + RegionBits& mask) const; +template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, + RegionBits& mask) const; +template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, + RegionBits& mask) const; +template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, + RegionBits& mask) const; } // namespace VideoCore diff --git a/src/video_core/page_manager.h b/src/video_core/page_manager.h index 561087ead..4ca41cb43 100644 --- a/src/video_core/page_manager.h +++ b/src/video_core/page_manager.h @@ -37,9 +37,8 @@ public: template void UpdatePageWatchers(VAddr addr, u64 size) const; - /// Updates watches in the pages touching the specified region - /// using a mask. - template + /// Updates watches in the pages touching the specified region using a mask. + template void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const; /// Returns page aligned address. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 86adfcaa5..514de1743 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -36,7 +36,7 @@ static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) { Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, AmdGpu::Liverpool* liverpool_) : instance{instance_}, scheduler{scheduler_}, page_manager{this}, - buffer_cache{instance, scheduler, *this, liverpool_, texture_cache, page_manager}, + buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager}, texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_}, memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} { if (!Config::nullGpu()) { @@ -471,7 +471,7 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { uses_dma |= stage->uses_dma; } - if (uses_dma && !fault_process_pending) { + if (uses_dma) { // We only use fault buffer for DMA right now. 
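Since the fault-buffer/DMA handling here leans on the new read watchers, the shape of the fault dispatch added in page_manager.cpp is worth keeping in mind: write faults invalidate cached data for the touched pages, while read faults drive the readback path before the CPU access retries. The sketch below uses a stand-in interface rather than the real Rasterizer and is not part of the patch.

#include <cstdint>

// Stand-in for the rasterizer interface; the real class lives in vk_rasterizer.h.
struct RasterizerLike {
    virtual bool InvalidateMemory(std::uintptr_t addr, std::uint64_t size) = 0;
    virtual bool ReadMemory(std::uintptr_t addr, std::uint64_t size) = 0;
    virtual ~RasterizerLike() = default;
};

// Mirrors GuestFaultSignalHandler: returns true when the fault was handled and the
// faulting access can be retried, false to fall through to default signal handling.
bool DispatchGuestFault(RasterizerLike& rasterizer, void* fault_address, bool is_write) {
    const auto addr = reinterpret_cast<std::uintptr_t>(fault_address);
    if (is_write) {
        return rasterizer.InvalidateMemory(addr, 8); // drop stale cached copies
    }
    return rasterizer.ReadMemory(addr, 8); // flush pending GPU writes, then allow the read
}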
{ Common::RecursiveSharedLock lock{mapped_ranges_mutex}; @@ -945,6 +945,10 @@ void Rasterizer::InlineData(VAddr address, const void* value, u32 num_bytes, boo buffer_cache.InlineData(address, value, num_bytes, is_gds); } +void Rasterizer::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) { + buffer_cache.CopyBuffer(dst, src, num_bytes, dst_gds, src_gds); +} + u32 Rasterizer::ReadDataFromGds(u32 gds_offset) { auto* gds_buf = buffer_cache.GetGdsBuffer(); u32 value; @@ -957,11 +961,20 @@ bool Rasterizer::InvalidateMemory(VAddr addr, u64 size) { // Not GPU mapped memory, can skip invalidation logic entirely. return false; } - buffer_cache.InvalidateMemory(addr, size, false); + buffer_cache.InvalidateMemory(addr, size); texture_cache.InvalidateMemory(addr, size); return true; } +bool Rasterizer::ReadMemory(VAddr addr, u64 size) { + if (!IsMapped(addr, size)) { + // Not GPU mapped memory, can skip invalidation logic entirely. + return false; + } + buffer_cache.ReadMemory(addr, size); + return true; +} + bool Rasterizer::IsMapped(VAddr addr, u64 size) { if (size == 0) { // There is no memory, so not mapped. @@ -982,7 +995,7 @@ void Rasterizer::MapMemory(VAddr addr, u64 size) { } void Rasterizer::UnmapMemory(VAddr addr, u64 size) { - buffer_cache.InvalidateMemory(addr, size, true); + buffer_cache.InvalidateMemory(addr, size); texture_cache.UnmapMemory(addr, size); page_manager.OnGpuUnmap(addr, size); { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index fb9ca4bbe..4a978746c 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -5,6 +5,7 @@ #include #include "common/recursive_lock.h" +#include "common/shared_first_mutex.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" @@ -56,8 +57,10 @@ public: bool from_guest = false); void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); + void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds); u32 ReadDataFromGds(u32 gsd_offset); bool InvalidateMemory(VAddr addr, u64 size); + bool ReadMemory(VAddr addr, u64 size); bool IsMapped(VAddr addr, u64 size); void MapMemory(VAddr addr, u64 size); void UnmapMemory(VAddr addr, u64 size); @@ -120,7 +123,7 @@ private: AmdGpu::Liverpool* liverpool; Core::MemoryManager* memory; boost::icl::interval_set mapped_ranges; - std::shared_mutex mapped_ranges_mutex; + Common::SharedFirstMutex mapped_ranges_mutex; PipelineCache pipeline_cache; boost::container::static_vector<