From fed064931ad599f2de628cd9ad72c640da3f061b Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Mon, 5 May 2025 05:24:08 -0500 Subject: [PATCH 01/11] Core: Fix module load addresses (#2879) * Fix module map addresses Most modules are mapped starting at 0x800000000, with no gaps between mappings. * Hardcode hardware accurate base address Looking at our address space, all platforms will have this base address mapped, so there shouldn't be any problem in using it. * Clang * Swap module mapping to NoFlags, remove offset code Since real hardware has no gap between module mappings, the Fixed flag is just an annoyance to work around, and has no impact on the actual mappings. Swapping the module mappings to use flags NoFlags instead simplifies our code slightly. * Fix module mapping names On real hardware, the file extension is part of the mapping name. Easiest way to manage this is to swap the name to be `file.filename().string()` instead of `file.stem().string()` * Fix patches Completely missed this, whoops. --- src/core/address_space.h | 2 -- src/core/module.cpp | 13 +++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/core/address_space.h b/src/core/address_space.h index 7ccc2cd1e..d7f3efc75 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -19,8 +19,6 @@ enum class MemoryPermission : u32 { }; DECLARE_ENUM_FLAG_OPERATORS(MemoryPermission) -constexpr VAddr CODE_BASE_OFFSET = 0x100000000ULL; - constexpr VAddr SYSTEM_MANAGED_MIN = 0x00000400000ULL; constexpr VAddr SYSTEM_MANAGED_MAX = 0x07FFFFBFFFULL; constexpr VAddr SYSTEM_RESERVED_MIN = 0x07FFFFC000ULL; diff --git a/src/core/module.cpp b/src/core/module.cpp index cbe44457c..f31bbed6c 100644 --- a/src/core/module.cpp +++ b/src/core/module.cpp @@ -19,8 +19,7 @@ namespace Core { using EntryFunc = PS4_SYSV_ABI int (*)(size_t args, const void* argp, void* param); -static u64 LoadOffset = CODE_BASE_OFFSET; -static constexpr u64 CODE_BASE_INCR = 0x010000000u; +static constexpr u64 ModuleLoadBase = 0x800000000; static u64 GetAlignedSize(const elf_program_header& phdr) { return (phdr.p_align != 0 ? 
(phdr.p_memsz + (phdr.p_align - 1)) & ~(phdr.p_align - 1) @@ -84,7 +83,7 @@ static std::string StringToNid(std::string_view symbol) { } Module::Module(Core::MemoryManager* memory_, const std::filesystem::path& file_, u32& max_tls_index) - : memory{memory_}, file{file_}, name{file.stem().string()} { + : memory{memory_}, file{file_}, name{file.filename().string()} { elf.Open(file); if (elf.IsElfFile()) { LoadModuleToMemory(max_tls_index); @@ -113,10 +112,8 @@ void Module::LoadModuleToMemory(u32& max_tls_index) { // Map module segments (and possible TLS trampolines) void** out_addr = reinterpret_cast(&base_virtual_addr); - memory->MapMemory(out_addr, memory->SystemReservedVirtualBase() + LoadOffset, - aligned_base_size + TrampolineSize, MemoryProt::CpuReadWrite, - MemoryMapFlags::Fixed, VMAType::Code, name, true); - LoadOffset += CODE_BASE_INCR * (1 + aligned_base_size / CODE_BASE_INCR); + memory->MapMemory(out_addr, ModuleLoadBase, aligned_base_size + TrampolineSize, + MemoryProt::CpuReadWrite, MemoryMapFlags::NoFlags, VMAType::Code, name, true); LOG_INFO(Core_Linker, "Loading module {} to {}", name, fmt::ptr(*out_addr)); #ifdef ARCH_X86_64 @@ -229,7 +226,7 @@ void Module::LoadModuleToMemory(u32& max_tls_index) { LOG_INFO(Core_Linker, "program entry addr ..........: {:#018x}", entry_addr); if (MemoryPatcher::g_eboot_address == 0) { - if (name == "eboot") { + if (name == "eboot.bin") { MemoryPatcher::g_eboot_address = base_virtual_addr; MemoryPatcher::g_eboot_image_size = base_size; MemoryPatcher::OnGameLoaded(); From c7fb3ebd93a40a406e4dc6fdbfc03c00c58bec4a Mon Sep 17 00:00:00 2001 From: MajorP93 Date: Wed, 7 May 2025 02:11:32 +0200 Subject: [PATCH 02/11] shader_recompiler: Widen num_conversion bitfield (#2886) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do this in order to be able to actually fit in all possible values from AmdGpu::NumberConversion. Fixes gcc compiler warnings: warning: ‘Shader::PsColorBuffer::num_conversion’ is too small to hold all values of ‘enum class AmdGpu::NumberConversion’ --- src/shader_recompiler/runtime_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 517392b98..b8ed42f5b 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -169,10 +169,10 @@ static constexpr u32 MaxColorBuffers = 8; struct PsColorBuffer { AmdGpu::NumberFormat num_format : 4; - AmdGpu::NumberConversion num_conversion : 2; + AmdGpu::NumberConversion num_conversion : 3; AmdGpu::Liverpool::ShaderExportFormat export_format : 4; u32 needs_unorm_fixup : 1; - u32 pad : 21; + u32 pad : 20; AmdGpu::CompMapping swizzle; auto operator<=>(const PsColorBuffer&) const noexcept = default; From 1aa7eb8a422ca90a1b7cfcc45f30331139f7cccf Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Wed, 7 May 2025 23:50:16 +0200 Subject: [PATCH 03/11] add scePthreadSetaffinity and emulate affinity (#2885) * add implementation * fix preprocessor * fixes squidbus's comments * fix clang * comment became fucked up? 
* fix removed return --- src/core/libraries/kernel/threads/pthread.cpp | 70 +++++++++++++++++++ src/core/libraries/kernel/threads/pthread.h | 2 + 2 files changed, 72 insertions(+) diff --git a/src/core/libraries/kernel/threads/pthread.cpp b/src/core/libraries/kernel/threads/pthread.cpp index c4127ecf2..e791e74bf 100644 --- a/src/core/libraries/kernel/threads/pthread.cpp +++ b/src/core/libraries/kernel/threads/pthread.cpp @@ -289,7 +289,12 @@ int PS4_SYSV_ABI posix_pthread_create_name_np(PthreadT* thread, const PthreadAtt /* Create thread */ new_thread->native_thr = Core::NativeThread(); int ret = new_thread->native_thr.Create(RunThread, new_thread, &new_thread->attr); + ASSERT_MSG(ret == 0, "Failed to create thread with error {}", ret); + + if (attr != nullptr && *attr != nullptr && (*attr)->cpuset != nullptr) { + new_thread->SetAffinity((*attr)->cpuset); + } if (ret) { *thread = nullptr; } @@ -521,6 +526,69 @@ int PS4_SYSV_ABI posix_pthread_setcancelstate(PthreadCancelState state, return 0; } +int Pthread::SetAffinity(const Cpuset* cpuset) { + const auto processor_count = std::thread::hardware_concurrency(); + if (processor_count < 8) { + return 0; + } + if (cpuset == nullptr) { + return POSIX_EINVAL; + } + + u64 mask = cpuset->bits; + + uintptr_t handle = native_thr.GetHandle(); + if (handle == 0) { + return POSIX_ESRCH; + } + + // We don't use this currently because some games gets performance problems + // when applying affinity even on strong hardware + /* + #ifdef _WIN64 + DWORD_PTR affinity_mask = static_cast(mask); + if (!SetThreadAffinityMask(reinterpret_cast(handle), affinity_mask)) { + return POSIX_EINVAL; + } + + #elif defined(__linux__) + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + + u64 mask = cpuset->bits; + for (int cpu = 0; cpu < std::min(64, CPU_SETSIZE); ++cpu) { + if (mask & (1ULL << cpu)) { + CPU_SET(cpu, &cpu_set); + } + } + + int result = + pthread_setaffinity_np(static_cast(handle), sizeof(cpu_set_t), &cpu_set); + if (result != 0) { + return POSIX_EINVAL; + } + #endif + */ + return 0; +} + +int PS4_SYSV_ABI posix_pthread_setaffinity_np(PthreadT thread, size_t cpusetsize, + const Cpuset* cpusetp) { + if (thread == nullptr || cpusetp == nullptr) { + return POSIX_EINVAL; + } + thread->attr.cpusetsize = cpusetsize; + return thread->SetAffinity(cpusetp); +} + +int PS4_SYSV_ABI scePthreadSetaffinity(PthreadT thread, const Cpuset mask) { + int result = posix_pthread_setaffinity_np(thread, 0x10, &mask); + if (result != 0) { + return ErrnoToSceKernelError(result); + } + return 0; +} + void RegisterThread(Core::Loader::SymbolsResolver* sym) { // Posix LIB_FUNCTION("Z4QosVuAsA0", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_once); @@ -544,6 +612,7 @@ void RegisterThread(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("Z4QosVuAsA0", "libkernel", 1, "libkernel", 1, 1, posix_pthread_once); LIB_FUNCTION("EotR8a3ASf4", "libkernel", 1, "libkernel", 1, 1, posix_pthread_self); LIB_FUNCTION("OxhIB8LB-PQ", "libkernel", 1, "libkernel", 1, 1, posix_pthread_create); + LIB_FUNCTION("5KWrg7-ZqvE", "libkernel", 1, "libkernel", 1, 1, posix_pthread_setaffinity_np); // Orbis LIB_FUNCTION("14bOACANTBo", "libkernel", 1, "libkernel", 1, 1, ORBIS(posix_pthread_once)); @@ -566,6 +635,7 @@ void RegisterThread(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("W0Hpm2X0uPE", "libkernel", 1, "libkernel", 1, 1, ORBIS(posix_pthread_setprio)); LIB_FUNCTION("rNhWz+lvOMU", "libkernel", 1, "libkernel", 1, 1, _sceKernelSetThreadDtors); LIB_FUNCTION("6XG4B33N09g", "libkernel", 1, "libkernel", 1, 1, 
sched_yield); + LIB_FUNCTION("bt3CTBKmGyI", "libkernel", 1, "libkernel", 1, 1, scePthreadSetaffinity) } } // namespace Libraries::Kernel diff --git a/src/core/libraries/kernel/threads/pthread.h b/src/core/libraries/kernel/threads/pthread.h index 089156776..09eed11b8 100644 --- a/src/core/libraries/kernel/threads/pthread.h +++ b/src/core/libraries/kernel/threads/pthread.h @@ -332,6 +332,8 @@ struct Pthread { return true; } } + + int SetAffinity(const Cpuset* cpuset); }; using PthreadT = Pthread*; From 3b7c36e1ba435e96e16c81d11b5c8a526513ff21 Mon Sep 17 00:00:00 2001 From: Vinicius Rangel Date: Wed, 7 May 2025 19:20:55 -0300 Subject: [PATCH 04/11] Clear stack before executing guest code (#2877) * Clear stack before executing guest code * clang, don't optimize me :rotating_light: avoid ClearStack function being optimized in release builds --- src/core/tls.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/core/tls.h b/src/core/tls.h index 6edd6a297..46ca8153b 100644 --- a/src/core/tls.h +++ b/src/core/tls.h @@ -5,6 +5,8 @@ #include "common/types.h" +void* memset(void* ptr, int value, size_t num); + namespace Xbyak { class CodeGenerator; } @@ -41,9 +43,18 @@ Tcb* GetTcbBase(); /// Makes sure TLS is initialized for the thread before entering guest. void EnsureThreadInitialized(); +template +__attribute__((optnone)) void ClearStack() { + volatile void* buf = alloca(size); + memset(const_cast(buf), 0, size); + buf = nullptr; +} + template ReturnType ExecuteGuest(PS4_SYSV_ABI ReturnType (*func)(FuncArgs...), CallArgs&&... args) { EnsureThreadInitialized(); + // clear stack to avoid trash from EnsureThreadInitialized + ClearStack<13_KB>(); return func(std::forward(args)...); } From 58df609ba00e09435c79d6a6649bce6176f06f78 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios Date: Thu, 8 May 2025 19:59:12 +0300 Subject: [PATCH 05/11] Optimize games that hit unpatchable EXTRQ/INSERTQ (#2888) * Make signal handler faster * I love clang-format * Use faster decoding * MacOS CI --- src/core/cpu_patches.cpp | 259 ++++++++++++++++++++------------------- 1 file changed, 136 insertions(+), 123 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index c8106b270..8937ef04b 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -464,9 +464,8 @@ static std::pair TryPatch(u8* code, PatchModule* module) { if (needs_trampoline && instruction.length < 5) { // Trampoline is needed but instruction is too short to patch. - // Return false and length to fall back to the illegal instruction handler, - // or to signal to AOT compilation that this instruction should be skipped and - // handled at runtime. + // Return false and length to signal to AOT compilation that this instruction + // should be skipped and handled at runtime. 
return std::make_pair(false, instruction.length); } @@ -512,136 +511,137 @@ static std::pair TryPatch(u8* code, PatchModule* module) { #if defined(ARCH_X86_64) +static bool Is4ByteExtrqOrInsertq(void* code_address) { + u8* bytes = (u8*)code_address; + if (bytes[0] == 0x66 && bytes[1] == 0x0F && bytes[2] == 0x79) { + return true; // extrq + } else if (bytes[0] == 0xF2 && bytes[1] == 0x0F && bytes[2] == 0x79) { + return true; // insertq + } else { + return false; + } +} + static bool TryExecuteIllegalInstruction(void* ctx, void* code_address) { - ZydisDecodedInstruction instruction; - ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; - const auto status = - Common::Decoder::Instance()->decodeInstruction(instruction, operands, code_address); + // We need to decode the instruction to find out what it is. Normally we'd use a fully fleshed + // out decoder like Zydis, however Zydis does a bunch of stuff that impact performance that we + // don't care about. We can get information about the instruction a lot faster by writing a mini + // decoder here, since we know it is definitely an extrq or an insertq. If for some reason we + // need to interpret more instructions in the future (I don't see why we would), we can revert + // to using Zydis. + ZydisMnemonic mnemonic; + u8* bytes = (u8*)code_address; + if (bytes[0] == 0x66) { + mnemonic = ZYDIS_MNEMONIC_EXTRQ; + } else if (bytes[0] == 0xF2) { + mnemonic = ZYDIS_MNEMONIC_INSERTQ; + } else { + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + const auto status = + Common::Decoder::Instance()->decodeInstruction(instruction, operands, code_address); + LOG_ERROR(Core, "Unhandled illegal instruction at code address {}: {}", + fmt::ptr(code_address), + ZYAN_SUCCESS(status) ? 
ZydisMnemonicGetString(instruction.mnemonic) + : "Failed to decode"); + return false; + } - switch (instruction.mnemonic) { + ASSERT(bytes[1] == 0x0F && bytes[2] == 0x79); + + // Note: It's guaranteed that there's no REX prefix in these instructions checked by + // Is4ByteExtrqOrInsertq + u8 modrm = bytes[3]; + u8 rm = modrm & 0b111; + u8 reg = (modrm >> 3) & 0b111; + u8 mod = (modrm >> 6) & 0b11; + + ASSERT(mod == 0b11); // Any instruction we interpret here uses reg/reg addressing only + + int dstIndex = reg; + int srcIndex = rm; + + switch (mnemonic) { case ZYDIS_MNEMONIC_EXTRQ: { - bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && - operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; - if (immediateForm) { - LOG_CRITICAL(Core, "EXTRQ immediate form should have been patched at code address: {}", - fmt::ptr(code_address)); - return false; + const auto dst = Common::GetXmmPointer(ctx, dstIndex); + const auto src = Common::GetXmmPointer(ctx, srcIndex); + + u64 lowQWordSrc; + memcpy(&lowQWordSrc, src, sizeof(lowQWordSrc)); + + u64 lowQWordDst; + memcpy(&lowQWordDst, dst, sizeof(lowQWordDst)); + + u64 length = lowQWordSrc & 0x3F; + u64 mask; + if (length == 0) { + length = 64; // for the check below + mask = 0xFFFF'FFFF'FFFF'FFFF; } else { - ASSERT_MSG(operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER && - operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER && - operands[0].reg.value >= ZYDIS_REGISTER_XMM0 && - operands[0].reg.value <= ZYDIS_REGISTER_XMM15 && - operands[1].reg.value >= ZYDIS_REGISTER_XMM0 && - operands[1].reg.value <= ZYDIS_REGISTER_XMM15, - "Unexpected operand types for EXTRQ instruction"); - - const auto dstIndex = operands[0].reg.value - ZYDIS_REGISTER_XMM0; - const auto srcIndex = operands[1].reg.value - ZYDIS_REGISTER_XMM0; - - const auto dst = Common::GetXmmPointer(ctx, dstIndex); - const auto src = Common::GetXmmPointer(ctx, srcIndex); - - u64 lowQWordSrc; - memcpy(&lowQWordSrc, src, sizeof(lowQWordSrc)); - - u64 lowQWordDst; - memcpy(&lowQWordDst, dst, sizeof(lowQWordDst)); - - u64 length = lowQWordSrc & 0x3F; - u64 mask; - if (length == 0) { - length = 64; // for the check below - mask = 0xFFFF'FFFF'FFFF'FFFF; - } else { - mask = (1ULL << length) - 1; - } - - u64 index = (lowQWordSrc >> 8) & 0x3F; - if (length + index > 64) { - // Undefined behavior if length + index is bigger than 64 according to the spec, - // we'll warn and continue execution. - LOG_TRACE(Core, - "extrq at {} with length {} and index {} is bigger than 64, " - "undefined behavior", - fmt::ptr(code_address), length, index); - } - - lowQWordDst >>= index; - lowQWordDst &= mask; - - memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); - - Common::IncrementRip(ctx, instruction.length); - - return true; + mask = (1ULL << length) - 1; } - break; + + u64 index = (lowQWordSrc >> 8) & 0x3F; + if (length + index > 64) { + // Undefined behavior if length + index is bigger than 64 according to the spec, + // we'll warn and continue execution. 
+ LOG_TRACE(Core, + "extrq at {} with length {} and index {} is bigger than 64, " + "undefined behavior", + fmt::ptr(code_address), length, index); + } + + lowQWordDst >>= index; + lowQWordDst &= mask; + + memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); + + Common::IncrementRip(ctx, 4); + + return true; } case ZYDIS_MNEMONIC_INSERTQ: { - bool immediateForm = operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && - operands[3].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; - if (immediateForm) { - LOG_CRITICAL(Core, - "INSERTQ immediate form should have been patched at code address: {}", - fmt::ptr(code_address)); - return false; + const auto dst = Common::GetXmmPointer(ctx, dstIndex); + const auto src = Common::GetXmmPointer(ctx, srcIndex); + + u64 lowQWordSrc, highQWordSrc; + memcpy(&lowQWordSrc, src, sizeof(lowQWordSrc)); + memcpy(&highQWordSrc, (u8*)src + 8, sizeof(highQWordSrc)); + + u64 lowQWordDst; + memcpy(&lowQWordDst, dst, sizeof(lowQWordDst)); + + u64 length = highQWordSrc & 0x3F; + u64 mask; + if (length == 0) { + length = 64; // for the check below + mask = 0xFFFF'FFFF'FFFF'FFFF; } else { - ASSERT_MSG(operands[2].type == ZYDIS_OPERAND_TYPE_UNUSED && - operands[3].type == ZYDIS_OPERAND_TYPE_UNUSED, - "operands 2 and 3 must be unused for register form."); - - ASSERT_MSG(operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER && - operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER, - "operands 0 and 1 must be registers."); - - const auto dstIndex = operands[0].reg.value - ZYDIS_REGISTER_XMM0; - const auto srcIndex = operands[1].reg.value - ZYDIS_REGISTER_XMM0; - - const auto dst = Common::GetXmmPointer(ctx, dstIndex); - const auto src = Common::GetXmmPointer(ctx, srcIndex); - - u64 lowQWordSrc, highQWordSrc; - memcpy(&lowQWordSrc, src, sizeof(lowQWordSrc)); - memcpy(&highQWordSrc, (u8*)src + 8, sizeof(highQWordSrc)); - - u64 lowQWordDst; - memcpy(&lowQWordDst, dst, sizeof(lowQWordDst)); - - u64 length = highQWordSrc & 0x3F; - u64 mask; - if (length == 0) { - length = 64; // for the check below - mask = 0xFFFF'FFFF'FFFF'FFFF; - } else { - mask = (1ULL << length) - 1; - } - - u64 index = (highQWordSrc >> 8) & 0x3F; - if (length + index > 64) { - // Undefined behavior if length + index is bigger than 64 according to the spec, - // we'll warn and continue execution. - LOG_TRACE(Core, - "insertq at {} with length {} and index {} is bigger than 64, " - "undefined behavior", - fmt::ptr(code_address), length, index); - } - - lowQWordSrc &= mask; - lowQWordDst &= ~(mask << index); - lowQWordDst |= lowQWordSrc << index; - - memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); - - Common::IncrementRip(ctx, instruction.length); - - return true; + mask = (1ULL << length) - 1; } - break; + + u64 index = (highQWordSrc >> 8) & 0x3F; + if (length + index > 64) { + // Undefined behavior if length + index is bigger than 64 according to the spec, + // we'll warn and continue execution. 
+ LOG_TRACE(Core, + "insertq at {} with length {} and index {} is bigger than 64, " + "undefined behavior", + fmt::ptr(code_address), length, index); + } + + lowQWordSrc &= mask; + lowQWordDst &= ~(mask << index); + lowQWordDst |= lowQWordSrc << index; + + memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); + + Common::IncrementRip(ctx, 4); + + return true; } default: { - LOG_ERROR(Core, "Unhandled illegal instruction at code address {}: {}", - fmt::ptr(code_address), ZydisMnemonicGetString(instruction.mnemonic)); - return false; + UNREACHABLE(); } } @@ -695,9 +695,22 @@ static bool PatchesAccessViolationHandler(void* context, void* /* fault_address static bool PatchesIllegalInstructionHandler(void* context) { void* code_address = Common::GetRip(context); - if (!TryPatchJit(code_address)) { + if (Is4ByteExtrqOrInsertq(code_address)) { + // The instruction is not big enough for a relative jump, don't try to patch it and pass it + // to our illegal instruction interpreter directly return TryExecuteIllegalInstruction(context, code_address); + } else { + if (!TryPatchJit(code_address)) { + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + const auto status = + Common::Decoder::Instance()->decodeInstruction(instruction, operands, code_address); + LOG_ERROR(Core, "Failed to patch address {:x} -- mnemonic: {}", (u64)code_address, + ZYAN_SUCCESS(status) ? ZydisMnemonicGetString(instruction.mnemonic) + : "Failed to decode"); + } } + return true; } From 46b88bd10f0d6d8dc59a80866a625a75e739a0af Mon Sep 17 00:00:00 2001 From: mailwl Date: Fri, 9 May 2025 11:08:22 +0300 Subject: [PATCH 06/11] [Libs] Stubs sceSigninDialog (#2890) * [Libs] Stubs SigninDialog * clang-format * clang-format again * remove magic constant * log dialog finished status --- CMakeLists.txt | 2 + src/common/logging/filter.cpp | 1 + src/common/logging/types.h | 1 + src/core/libraries/libs.cpp | 2 + .../libraries/signin_dialog/signindialog.cpp | 64 +++++++++++++++++++ .../libraries/signin_dialog/signindialog.h | 29 +++++++++ 6 files changed, 99 insertions(+) create mode 100644 src/core/libraries/signin_dialog/signindialog.cpp create mode 100644 src/core/libraries/signin_dialog/signindialog.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f55767611..9b10d0e5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -597,6 +597,8 @@ set(MISC_LIBS src/core/libraries/screenshot/screenshot.cpp src/core/libraries/move/move.h src/core/libraries/ulobjmgr/ulobjmgr.cpp src/core/libraries/ulobjmgr/ulobjmgr.h + src/core/libraries/signin_dialog/signindialog.cpp + src/core/libraries/signin_dialog/signindialog.h ) set(DEV_TOOLS src/core/devtools/layer.cpp diff --git a/src/common/logging/filter.cpp b/src/common/logging/filter.cpp index 867d62916..622af93cc 100644 --- a/src/common/logging/filter.cpp +++ b/src/common/logging/filter.cpp @@ -137,6 +137,7 @@ bool ParseFilterRule(Filter& instance, Iterator begin, Iterator end) { SUB(Lib, NpParty) \ SUB(Lib, Zlib) \ SUB(Lib, Hmd) \ + SUB(Lib, SigninDialog) \ CLS(Frontend) \ CLS(Render) \ SUB(Render, Vulkan) \ diff --git a/src/common/logging/types.h b/src/common/logging/types.h index e5714a81a..27a87e082 100644 --- a/src/common/logging/types.h +++ b/src/common/logging/types.h @@ -104,6 +104,7 @@ enum class Class : u8 { Lib_NpParty, ///< The LibSceNpParty implementation Lib_Zlib, ///< The LibSceZlib implementation. Lib_Hmd, ///< The LibSceHmd implementation. + Lib_SigninDialog, ///< The LibSigninDialog implementation. 
Frontend, ///< Emulator UI Render, ///< Video Core Render_Vulkan, ///< Vulkan backend diff --git a/src/core/libraries/libs.cpp b/src/core/libraries/libs.cpp index 3f5baf640..3826ff793 100644 --- a/src/core/libraries/libs.cpp +++ b/src/core/libraries/libs.cpp @@ -45,6 +45,7 @@ #include "core/libraries/save_data/savedata.h" #include "core/libraries/screenshot/screenshot.h" #include "core/libraries/share_play/shareplay.h" +#include "core/libraries/signin_dialog/signindialog.h" #include "core/libraries/system/commondialog.h" #include "core/libraries/system/msgdialog.h" #include "core/libraries/system/posix.h" @@ -120,6 +121,7 @@ void InitHLELibs(Core::Loader::SymbolsResolver* sym) { Libraries::Hmd::RegisterlibSceHmd(sym); Libraries::DiscMap::RegisterlibSceDiscMap(sym); Libraries::Ulobjmgr::RegisterlibSceUlobjmgr(sym); + Libraries::SigninDialog::RegisterlibSceSigninDialog(sym); } } // namespace Libraries diff --git a/src/core/libraries/signin_dialog/signindialog.cpp b/src/core/libraries/signin_dialog/signindialog.cpp new file mode 100644 index 000000000..0e4eb63a2 --- /dev/null +++ b/src/core/libraries/signin_dialog/signindialog.cpp @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +// Generated By moduleGenerator +#include "common/logging/log.h" +#include "core/libraries/error_codes.h" +#include "core/libraries/libs.h" +#include "signindialog.h" + +namespace Libraries::SigninDialog { + +s32 PS4_SYSV_ABI sceSigninDialogInitialize() { + LOG_ERROR(Lib_SigninDialog, "(STUBBED) called"); + return ORBIS_OK; +} + +s32 PS4_SYSV_ABI sceSigninDialogOpen() { + LOG_ERROR(Lib_SigninDialog, "(STUBBED) called"); + return ORBIS_OK; +} + +Status PS4_SYSV_ABI sceSigninDialogGetStatus() { + LOG_ERROR(Lib_SigninDialog, "(STUBBED) called, return 'finished' status"); + return Status::FINISHED; +} + +Status PS4_SYSV_ABI sceSigninDialogUpdateStatus() { + LOG_ERROR(Lib_SigninDialog, "(STUBBED) called, return 'finished' status"); + return Status::FINISHED; +} + +s32 PS4_SYSV_ABI sceSigninDialogGetResult() { + LOG_ERROR(Lib_SigninDialog, "(STUBBED) called"); + return ORBIS_OK; +} + +s32 PS4_SYSV_ABI sceSigninDialogClose() { + LOG_ERROR(Lib_SigninDialog, "(STUBBED) called"); + return ORBIS_OK; +} + +s32 PS4_SYSV_ABI sceSigninDialogTerminate() { + LOG_ERROR(Lib_SigninDialog, "(STUBBED) called"); + return ORBIS_OK; +} + +void RegisterlibSceSigninDialog(Core::Loader::SymbolsResolver* sym) { + LIB_FUNCTION("mlYGfmqE3fQ", "libSceSigninDialog", 1, "libSceSigninDialog", 1, 1, + sceSigninDialogInitialize); + LIB_FUNCTION("JlpJVoRWv7U", "libSceSigninDialog", 1, "libSceSigninDialog", 1, 1, + sceSigninDialogOpen); + LIB_FUNCTION("2m077aeC+PA", "libSceSigninDialog", 1, "libSceSigninDialog", 1, 1, + sceSigninDialogGetStatus); + LIB_FUNCTION("Bw31liTFT3A", "libSceSigninDialog", 1, "libSceSigninDialog", 1, 1, + sceSigninDialogUpdateStatus); + LIB_FUNCTION("nqG7rqnYw1U", "libSceSigninDialog", 1, "libSceSigninDialog", 1, 1, + sceSigninDialogGetResult); + LIB_FUNCTION("M3OkENHcyiU", "libSceSigninDialog", 1, "libSceSigninDialog", 1, 1, + sceSigninDialogClose); + LIB_FUNCTION("LXlmS6PvJdU", "libSceSigninDialog", 1, "libSceSigninDialog", 1, 1, + sceSigninDialogTerminate); +}; + +} // namespace Libraries::SigninDialog diff --git a/src/core/libraries/signin_dialog/signindialog.h b/src/core/libraries/signin_dialog/signindialog.h new file mode 100644 index 000000000..8726ad1f6 --- /dev/null +++ b/src/core/libraries/signin_dialog/signindialog.h @@ -0,0 +1,29 @@ +// 
SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once +#include "common/types.h" + +namespace Core::Loader { +class SymbolsResolver; +} + +enum class Status : u32 { + NONE = 0, + INITIALIZED = 1, + RUNNING = 2, + FINISHED = 3, +}; + +namespace Libraries::SigninDialog { + +s32 PS4_SYSV_ABI sceSigninDialogInitialize(); +s32 PS4_SYSV_ABI sceSigninDialogOpen(); +Status PS4_SYSV_ABI sceSigninDialogGetStatus(); +Status PS4_SYSV_ABI sceSigninDialogUpdateStatus(); +s32 PS4_SYSV_ABI sceSigninDialogGetResult(); +s32 PS4_SYSV_ABI sceSigninDialogClose(); +s32 PS4_SYSV_ABI sceSigninDialogTerminate(); + +void RegisterlibSceSigninDialog(Core::Loader::SymbolsResolver* sym); +} // namespace Libraries::SigninDialog From 8e7c5a4d995106661524173914af15aeeb11511a Mon Sep 17 00:00:00 2001 From: kalaposfos13 <153381648+kalaposfos13@users.noreply.github.com> Date: Fri, 9 May 2025 17:33:32 +0200 Subject: [PATCH 07/11] Remove deprecated include (#2893) --- src/core/libraries/libc_internal/printf.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/libraries/libc_internal/printf.h b/src/core/libraries/libc_internal/printf.h index fe63481a0..9c22e922c 100644 --- a/src/core/libraries/libc_internal/printf.h +++ b/src/core/libraries/libc_internal/printf.h @@ -56,7 +56,6 @@ #include #include -#include #include #include #include From b130fe6ed59277ff66ff8579ce3aa14452f2416c Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Fri, 9 May 2025 08:43:20 -0700 Subject: [PATCH 08/11] vulkan: Handle incompatible depth format using null binding. (#2892) Co-authored-by: kalaposfos13 <153381648+kalaposfos13@users.noreply.github.com> --- src/shader_recompiler/info.h | 5 +++ .../ir/passes/resource_tracking_pass.cpp | 6 +++ src/video_core/amdgpu/resource.h | 13 +++++++ .../renderer_vulkan/vk_rasterizer.cpp | 5 ++- src/video_core/texture_cache/image_view.h | 2 - .../texture_cache/texture_cache.cpp | 37 ++++++++++++------- src/video_core/texture_cache/texture_cache.h | 4 ++ 7 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 784f8b4d2..12e48c8e4 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -281,6 +281,11 @@ constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept // Fall back to null image if unbound. 
return AmdGpu::Image::Null(); } + const auto data_fmt = image.GetDataFmt(); + if (is_depth && data_fmt != AmdGpu::DataFormat::Format16 && + data_fmt != AmdGpu::DataFormat::Format32) { + return AmdGpu::Image::NullDepth(); + } return image; } diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 1de255e4d..cc0bf83d3 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -363,6 +363,12 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& LOG_ERROR(Render_Vulkan, "Shader compiled with unbound image!"); image = AmdGpu::Image::Null(); } + const auto data_fmt = image.GetDataFmt(); + if (inst_info.is_depth && data_fmt != AmdGpu::DataFormat::Format16 && + data_fmt != AmdGpu::DataFormat::Format32) { + LOG_ERROR(Render_Vulkan, "Shader compiled using non-depth image with depth instruction!"); + image = AmdGpu::Image::NullDepth(); + } ASSERT(image.GetType() != AmdGpu::ImageType::Invalid); const bool is_written = inst.GetOpcode() == IR::Opcode::ImageWrite; diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index c387c7bf2..9060074fb 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -219,6 +219,19 @@ struct Image { return image; } + static constexpr Image NullDepth() { + Image image{}; + image.data_format = u64(DataFormat::Format32); + image.num_format = u64(NumberFormat::Float); + image.dst_sel_x = u64(CompSwizzle::Red); + image.dst_sel_y = u64(CompSwizzle::Green); + image.dst_sel_z = u64(CompSwizzle::Blue); + image.dst_sel_w = u64(CompSwizzle::Alpha); + image.tiling_index = u64(TilingMode::Texture_MicroTiled); + image.type = u64(ImageType::Color2D); + return image; + } + bool Valid() const { return (type & 0x8u) != 0; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 4caa781b9..e7b42a34b 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -618,8 +618,9 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin if (instance.IsNullDescriptorSupported()) { image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); } else { - auto& null_image = texture_cache.GetImageView(VideoCore::NULL_IMAGE_VIEW_ID); - image_infos.emplace_back(VK_NULL_HANDLE, *null_image.image_view, + auto& null_image_view = + texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc.view_info); + image_infos.emplace_back(VK_NULL_HANDLE, *null_image_view.image_view, vk::ImageLayout::eGeneral); } } else { diff --git a/src/video_core/texture_cache/image_view.h b/src/video_core/texture_cache/image_view.h index 23c703d23..6a17490bf 100644 --- a/src/video_core/texture_cache/image_view.h +++ b/src/video_core/texture_cache/image_view.h @@ -34,8 +34,6 @@ struct ImageViewInfo { struct Image; -constexpr Common::SlotId NULL_IMAGE_VIEW_ID{0}; - struct ImageView { ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info, Image& image, ImageId image_id); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 047bb3dfe..82f4d6413 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -8,6 +8,7 @@ #include "common/debug.h" #include "video_core/buffer_cache/buffer_cache.h" 
#include "video_core/page_manager.h" +#include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/texture_cache/host_compatibility.h" @@ -23,31 +24,41 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& BufferCache& buffer_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, tile_manager{instance, scheduler} { + // Create basic null image at fixed image ID. + const auto null_id = GetNullImage(vk::Format::eR8G8B8A8Unorm); + ASSERT(null_id.index == NULL_IMAGE_ID.index); +} + +TextureCache::~TextureCache() = default; + +ImageId TextureCache::GetNullImage(const vk::Format format) { + const auto existing_image = null_images.find(format); + if (existing_image != null_images.end()) { + return existing_image->second; + } + ImageInfo info{}; - info.pixel_format = vk::Format::eR8G8B8A8Unorm; + info.pixel_format = format; info.type = vk::ImageType::e2D; - info.tiling_idx = u32(AmdGpu::TilingMode::Texture_MicroTiled); + info.tiling_idx = static_cast(AmdGpu::TilingMode::Texture_MicroTiled); info.num_bits = 32; info.UpdateSize(); + const ImageId null_id = slot_images.insert(instance, scheduler, info); - ASSERT(null_id.index == NULL_IMAGE_ID.index); auto& img = slot_images[null_id]; + const vk::Image& null_image = img.image; - Vulkan::SetObjectName(instance.GetDevice(), null_image, "Null Image"); + Vulkan::SetObjectName(instance.GetDevice(), null_image, + fmt::format("Null Image ({})", vk::to_string(format))); + img.flags = ImageFlagBits::Empty; img.track_addr = img.info.guest_address; img.track_addr_end = img.info.guest_address + img.info.guest_size; - ImageViewInfo view_info; - const auto null_view_id = - slot_image_views.insert(instance, view_info, slot_images[null_id], null_id); - ASSERT(null_view_id.index == NULL_IMAGE_VIEW_ID.index); - const vk::ImageView& null_image_view = slot_image_views[null_view_id].image_view.get(); - Vulkan::SetObjectName(instance.GetDevice(), null_image_view, "Null Image View"); + null_images.emplace(format, null_id); + return null_id; } -TextureCache::~TextureCache() = default; - void TextureCache::MarkAsMaybeDirty(ImageId image_id, Image& image) { if (image.hash == 0) { // Initialize hash @@ -296,7 +307,7 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { const auto& info = desc.info; if (info.guest_address == 0) [[unlikely]] { - return NULL_IMAGE_ID; + return GetNullImage(info.pixel_format); } std::scoped_lock lock{mutex}; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f262768ea..b6bf88958 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -246,6 +246,9 @@ private: } } + /// Gets or creates a null image for a particular format. 
+ ImageId GetNullImage(vk::Format format); + /// Create an image from the given parameters [[nodiscard]] ImageId InsertImage(const ImageInfo& info, VAddr cpu_addr); @@ -285,6 +288,7 @@ private: Common::SlotVector slot_images; Common::SlotVector slot_image_views; tsl::robin_map samplers; + tsl::robin_map null_images; PageTable page_table; std::mutex mutex; From 8d7cbf9943f1b8476bee7bde758b77d0d4d4edff Mon Sep 17 00:00:00 2001 From: Missake212 Date: Fri, 9 May 2025 17:01:34 +0100 Subject: [PATCH 09/11] Adding opcode IMAGE_SAMPLE_B_O (#2894) * Adding opcode IMAGE_SAMPLE_B_O: * fix clang (my first time !) --- src/shader_recompiler/frontend/translate/vector_memory.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index cfc01c58f..5639bc56a 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -143,6 +143,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { case Opcode::IMAGE_SAMPLE_C_LZ: case Opcode::IMAGE_SAMPLE_O: case Opcode::IMAGE_SAMPLE_L_O: + case Opcode::IMAGE_SAMPLE_B_O: case Opcode::IMAGE_SAMPLE_LZ_O: case Opcode::IMAGE_SAMPLE_C_O: case Opcode::IMAGE_SAMPLE_C_LZ_O: From a1439b15cf572a862dfd01dea1dbe71c66b473d7 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Fri, 9 May 2025 10:04:37 -0700 Subject: [PATCH 10/11] gnm: Implement sceGnmDrawIndexIndirectMulti (#2889) --- src/core/libraries/gnmdriver/gnmdriver.cpp | 38 +++++++++++++++---- src/core/libraries/gnmdriver/gnmdriver.h | 4 +- src/video_core/amdgpu/liverpool.cpp | 37 ++++++++++++++---- src/video_core/amdgpu/pm4_cmds.h | 26 +++++++++++-- .../renderer_vulkan/vk_instance.cpp | 1 + 5 files changed, 86 insertions(+), 20 deletions(-) diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 25ac4921c..f2f40e0e3 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -505,9 +505,10 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti(u32* cmdbuf, u32 size, u32 da u32 flags) { LOG_TRACE(Lib_GnmDriver, "called"); - if ((!sceKernelIsNeoMode() || !UseNeoCompatSequences) && !cmdbuf && (size == 16) && - (shader_stage < ShaderStages::Max) && (vertex_sgpr_offset < 0x10u) && - (instance_sgpr_offset < 0x10u)) { + if ((!sceKernelIsNeoMode() || !UseNeoCompatSequences) && cmdbuf && (size == 16) && + (vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u) && + (shader_stage == ShaderStages::Vs || shader_stage == ShaderStages::Es || + shader_stage == ShaderStages::Ls)) { cmdbuf = WriteHeader(cmdbuf, 2); cmdbuf = WriteBody(cmdbuf, 0u); @@ -535,10 +536,33 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti(u32* cmdbuf, u32 size, u32 da return -1; } -int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); - UNREACHABLE(); - return ORBIS_OK; +int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti(u32* cmdbuf, u32 size, u32 data_offset, u32 max_count, + u32 shader_stage, u32 vertex_sgpr_offset, + u32 instance_sgpr_offset, u32 flags) { + LOG_TRACE(Lib_GnmDriver, "called"); + + if (cmdbuf && (size == 11) && (vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u) && + (shader_stage == ShaderStages::Vs || shader_stage == ShaderStages::Es || + shader_stage == ShaderStages::Ls)) { + + const auto predicate = flags & 1 ? 
PM4Predicate::PredEnable : PM4Predicate::PredDisable; + cmdbuf = WriteHeader( + cmdbuf, 6, PM4ShaderType::ShaderGraphics, predicate); + + const auto sgpr_offset = indirect_sgpr_offsets[shader_stage]; + + cmdbuf[0] = data_offset; + cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset; + cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset; + cmdbuf[3] = max_count; + cmdbuf[4] = sizeof(DrawIndexedIndirectArgs); + cmdbuf[5] = sceKernelIsNeoMode() ? flags & 0xe0000000u : 0; + + cmdbuf += 6; + WriteTrailingNop<3>(cmdbuf); + return ORBIS_OK; + } + return -1; } int PS4_SYSV_ABI sceGnmDrawIndexMultiInstanced() { diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index 94d06c85f..a3d4968d3 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -51,7 +51,9 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti(u32* cmdbuf, u32 size, u32 da u32 max_count, u64 count_addr, u32 shader_stage, u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags); -int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti(); +int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti(u32* cmdbuf, u32 size, u32 data_offset, u32 max_count, + u32 shader_stage, u32 vertex_sgpr_offset, + u32 instance_sgpr_offset, u32 flags); int PS4_SYSV_ABI sceGnmDrawIndexMultiInstanced(); s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset, u32 index_count, u32 flags); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 967b952c6..4c8e3367a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -455,14 +455,14 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); const auto offset = draw_indirect->data_offset; - const auto size = sizeof(DrawIndirectArgs); + const auto stride = sizeof(DrawIndirectArgs); if (DebugState.DumpingCurrentReg()) { DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin(fmt::format("gfx:{}:DrawIndirect", cmd_address)); - rasterizer->DrawIndirect(false, indirect_args_addr, offset, size, 1, 0); + rasterizer->DrawIndirect(false, indirect_args_addr, offset, stride, 1, 0); rasterizer->ScopeMarkerEnd(); } break; @@ -471,7 +471,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); const auto offset = draw_index_indirect->data_offset; - const auto size = sizeof(DrawIndexedIndirectArgs); + const auto stride = sizeof(DrawIndexedIndirectArgs); if (DebugState.DumpingCurrentReg()) { DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); } @@ -479,25 +479,46 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); rasterizer->ScopeMarkerBegin( fmt::format("gfx:{}:DrawIndexIndirect", cmd_address)); - rasterizer->DrawIndirect(true, indirect_args_addr, offset, size, 1, 0); + rasterizer->DrawIndirect(true, indirect_args_addr, offset, stride, 1, 0); rasterizer->ScopeMarkerEnd(); } break; } - case PM4ItOpcode::DrawIndexIndirectCountMulti: { + case PM4ItOpcode::DrawIndexIndirectMulti: { const auto* draw_index_indirect = reinterpret_cast(header); const auto offset = draw_index_indirect->data_offset; if (DebugState.DumpingCurrentReg()) { DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); } + if (rasterizer) { + const auto cmd_address = reinterpret_cast(header); + 
rasterizer->ScopeMarkerBegin( + fmt::format("gfx:{}:DrawIndexIndirectMulti", cmd_address)); + rasterizer->DrawIndirect(true, indirect_args_addr, offset, + draw_index_indirect->stride, + draw_index_indirect->count, 0); + rasterizer->ScopeMarkerEnd(); + } + break; + } + case PM4ItOpcode::DrawIndexIndirectCountMulti: { + const auto* draw_index_indirect = + reinterpret_cast(header); + const auto offset = draw_index_indirect->data_offset; + if (DebugState.DumpingCurrentReg()) { + DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs); + } if (rasterizer) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin( fmt::format("gfx:{}:DrawIndexIndirectCountMulti", cmd_address)); - rasterizer->DrawIndirect( - true, indirect_args_addr, offset, draw_index_indirect->stride, - draw_index_indirect->count, draw_index_indirect->countAddr); + rasterizer->DrawIndirect(true, indirect_args_addr, offset, + draw_index_indirect->stride, + draw_index_indirect->count, + draw_index_indirect->count_indirect_enable.Value() + ? draw_index_indirect->count_addr + : 0); rasterizer->ScopeMarkerEnd(); } break; diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index ae1d32e00..6b55f5b65 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -860,6 +860,24 @@ struct PM4CmdDrawIndexIndirect { }; struct PM4CmdDrawIndexIndirectMulti { + PM4Type3Header header; ///< header + u32 data_offset; ///< Byte aligned offset where the required data structure starts + union { + u32 dw2; + BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the + ///< BaseVertexLocation it fetched from memory + }; + union { + u32 dw3; + BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the + ///< StartInstanceLocation it fetched from memory + }; + u32 count; ///< Count of data structures to loop through before going to next packet + u32 stride; ///< Stride in memory from one data structure to the next + u32 draw_initiator; ///< Draw Initiator Register +}; + +struct PM4CmdDrawIndexIndirectCountMulti { PM4Type3Header header; ///< header u32 data_offset; ///< Byte aligned offset where the required data structure starts union { @@ -874,14 +892,14 @@ struct PM4CmdDrawIndexIndirectMulti { }; union { u32 dw4; - BitField<0, 16, u32> drawIndexLoc; ///< register offset to write the Draw Index count + BitField<0, 16, u32> draw_index_loc; ///< register offset to write the Draw Index count BitField<30, 1, u32> - countIndirectEnable; ///< Indicates the data structure count is in memory + count_indirect_enable; ///< Indicates the data structure count is in memory BitField<31, 1, u32> - drawIndexEnable; ///< Enables writing of Draw Index count to DRAW_INDEX_LOC + draw_index_enable; ///< Enables writing of Draw Index count to DRAW_INDEX_LOC }; u32 count; ///< Count of data structures to loop through before going to next packet - u64 countAddr; ///< DWord aligned Address[31:2]; Valid if countIndirectEnable is set + u64 count_addr; ///< DWord aligned Address[31:2]; Valid if countIndirectEnable is set u32 stride; ///< Stride in memory from one data structure to the next u32 draw_initiator; ///< Draw Initiator Register }; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 99f225d79..1004d850f 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -338,6 +338,7 @@ bool Instance::CreateDevice() { .geometryShader = 
features.geometryShader, .tessellationShader = features.tessellationShader, .logicOp = features.logicOp, + .multiDrawIndirect = features.multiDrawIndirect, .depthBiasClamp = features.depthBiasClamp, .fillModeNonSolid = features.fillModeNonSolid, .depthBounds = features.depthBounds, From 6477dc4f1e699981919022ac69fef59813a9ad94 Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Fri, 9 May 2025 14:33:04 -0500 Subject: [PATCH 11/11] Core: Memory Fixes (#2872) * Fix VirtualQuery behavior on low addresses. * Fix VirtualQuery struct Somewhere in our BitField and array use, the size of our VirtualQuery struct became larger than the struct used on real hardware. Fixing this fixes some data corruption visible in the name parameter during my tests. * Default name to anon On real hardware, nameless mappings are given the name "anon:address" where address appears to be the address that made the memory call. For simplicity sake, I'll stick to the name "anon" for now. * Place an upper bound on returns from SearchFree Right now, this upper bound is set based on the limitations of our GPU buffer cache and page table. Someone with more experience in that area of code should probably fix that at some point. * More anons * Clang * Fix name in sceKernelMapNamedDirectMemory * strncpy instead of strcpy Hardcoded the constant size for now, I need to review how real hardware behaves here to determine if anything else is necessary for this to be accurate. * Fix name behavior All memory naming functions restrict the name size to a 31 character limit, and return `ORBIS_KERNEL_ERROR_ENAMETOOLONG` if that limit is exceeded. Since this value is constant for all functions involving names, I've defined it as a constant in kernel's memory.h, and used that in place of any hardcoded 32 character limits. * Error logging Hopefully this helps in catching the UFC regression? * Increase address space upper bound Probably needs heavy testing, especially on Mac/Windows. This increases the address space, as needed to accommodate strange memory behaviors seen in UFC. * VirtualQuery fix Due to limitations of certain platforms, we initialize our vma_map with 3 separate free mappings. As such, we need to use a while loop here to accurately query mappings with high addresses * Fix mappings to high addresses The PS4's GPU can only handle 40bit addresses. Our texture cache and buffer cache were designed around these limits, and mapping to higher addresses would cause segmentation faults and access violations. To fix these crashes, only map to the GPU if the mapping is fully contained within the address space the GPU should access. I'm open to suggestions on how to make this cleaner * Revert "Increase address space upper bound" This reverts commit 3d50eeeebb6aa40e38d6f87e6480235c917843f3. * Revert VirtualQuery while loop Windows wasn't happy with this, again. Will try to debug and properly fix this when I have a good chance. * Fix asserts FindVMA, due to the way it's programmed, never actually returns vma_map.end(), the furthest it ever returns is the last valid memory area. All those asserts we involving vma_map.end() never actually trigger due to this. This commit removes redundant asserts, adds messages to asserts that were lacking them, and fixes all asserts designed to detect out of bounds memory accesses so they actually trigger. I've also fixed some potential memory safety issues. * Proper error behavior in QueryProtection Might as well handle this properly while I'm here. 
* Clang * More information about ReserveVirtualRange results Should help debug issues like the one in The Order: 1886 (CUSA00076) * Fix assert message * Update assert message Extra space * Fix my bug Oh hey, finally something that's my fault. * Fix rasterizer unmaps Should use adjusted_size here, otherwise we could unmap too much. Thanks to diegolix29 for spotting this. * Fix edge case in MapMemory Code comments explain everything. This should fix some memory asserts. * Fix fix Avoid running the code path if it's unnecessary, since there are many additional edge cases to handle when the VMA map is small. * Fix fix fix Should prevent infinite loops, haven't tested properly yet though. * Split logging for inputs and out_addr in ReserveVirtualRange Addresses review comments. --- src/core/libraries/kernel/memory.cpp | 54 ++++---- src/core/libraries/kernel/memory.h | 16 +-- src/core/memory.cpp | 177 ++++++++++++++++++++------- src/core/memory.h | 8 +- 4 files changed, 179 insertions(+), 76 deletions(-) diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp index 8a0c91479..495ddc52f 100644 --- a/src/core/libraries/kernel/memory.cpp +++ b/src/core/libraries/kernel/memory.cpp @@ -126,9 +126,6 @@ s32 PS4_SYSV_ABI sceKernelAvailableDirectMemorySize(u64 searchStart, u64 searchE s32 PS4_SYSV_ABI sceKernelVirtualQuery(const void* addr, int flags, OrbisVirtualQueryInfo* info, size_t infoSize) { LOG_INFO(Kernel_Vmm, "called addr = {}, flags = {:#x}", fmt::ptr(addr), flags); - if (!addr) { - return ORBIS_KERNEL_ERROR_EACCES; - } auto* memory = Core::Memory::Instance(); return memory->VirtualQuery(std::bit_cast(addr), flags, info); } @@ -136,7 +133,6 @@ s32 PS4_SYSV_ABI sceKernelVirtualQuery(const void* addr, int flags, OrbisVirtual s32 PS4_SYSV_ABI sceKernelReserveVirtualRange(void** addr, u64 len, int flags, u64 alignment) { LOG_INFO(Kernel_Vmm, "addr = {}, len = {:#x}, flags = {:#x}, alignment = {:#x}", fmt::ptr(*addr), len, flags, alignment); - if (addr == nullptr) { LOG_ERROR(Kernel_Vmm, "Address is invalid!"); return ORBIS_KERNEL_ERROR_EINVAL; @@ -155,9 +151,12 @@ s32 PS4_SYSV_ABI sceKernelReserveVirtualRange(void** addr, u64 len, int flags, u auto* memory = Core::Memory::Instance(); const VAddr in_addr = reinterpret_cast(*addr); const auto map_flags = static_cast(flags); - memory->Reserve(addr, in_addr, len, map_flags, alignment); - return ORBIS_OK; + s32 result = memory->Reserve(addr, in_addr, len, map_flags, alignment); + if (result == 0) { + LOG_INFO(Kernel_Vmm, "out_addr = {}", fmt::ptr(*addr)); + } + return result; } int PS4_SYSV_ABI sceKernelMapNamedDirectMemory(void** addr, u64 len, int prot, int flags, @@ -172,10 +171,12 @@ int PS4_SYSV_ABI sceKernelMapNamedDirectMemory(void** addr, u64 len, int prot, i LOG_ERROR(Kernel_Vmm, "Map size is either zero or not 16KB aligned!"); return ORBIS_KERNEL_ERROR_EINVAL; } + if (!Common::Is16KBAligned(directMemoryStart)) { LOG_ERROR(Kernel_Vmm, "Start address is not 16KB aligned!"); return ORBIS_KERNEL_ERROR_EINVAL; } + if (alignment != 0) { if ((!std::has_single_bit(alignment) && !Common::Is16KBAligned(alignment))) { LOG_ERROR(Kernel_Vmm, "Alignment value is invalid!"); @@ -183,14 +184,19 @@ int PS4_SYSV_ABI sceKernelMapNamedDirectMemory(void** addr, u64 len, int prot, i } } + if (std::strlen(name) >= ORBIS_KERNEL_MAXIMUM_NAME_LENGTH) { + LOG_ERROR(Kernel_Vmm, "name exceeds 32 bytes!"); + return ORBIS_KERNEL_ERROR_ENAMETOOLONG; + } + const VAddr in_addr = reinterpret_cast(*addr); const auto mem_prot = static_cast(prot); 
const auto map_flags = static_cast(flags); auto* memory = Core::Memory::Instance(); const auto ret = - memory->MapMemory(addr, in_addr, len, mem_prot, map_flags, Core::VMAType::Direct, "", false, - directMemoryStart, alignment); + memory->MapMemory(addr, in_addr, len, mem_prot, map_flags, Core::VMAType::Direct, name, + false, directMemoryStart, alignment); LOG_INFO(Kernel_Vmm, "out_addr = {}", fmt::ptr(*addr)); return ret; @@ -199,7 +205,8 @@ int PS4_SYSV_ABI sceKernelMapNamedDirectMemory(void** addr, u64 len, int prot, i int PS4_SYSV_ABI sceKernelMapDirectMemory(void** addr, u64 len, int prot, int flags, s64 directMemoryStart, u64 alignment) { LOG_INFO(Kernel_Vmm, "called, redirected to sceKernelMapNamedDirectMemory"); - return sceKernelMapNamedDirectMemory(addr, len, prot, flags, directMemoryStart, alignment, ""); + return sceKernelMapNamedDirectMemory(addr, len, prot, flags, directMemoryStart, alignment, + "anon"); } s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addr_in_out, std::size_t len, int prot, @@ -210,17 +217,16 @@ s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addr_in_out, std::size_t return ORBIS_KERNEL_ERROR_EINVAL; } - static constexpr size_t MaxNameSize = 32; - if (std::strlen(name) > MaxNameSize) { - LOG_ERROR(Kernel_Vmm, "name exceeds 32 bytes!"); - return ORBIS_KERNEL_ERROR_ENAMETOOLONG; - } - if (name == nullptr) { LOG_ERROR(Kernel_Vmm, "name is invalid!"); return ORBIS_KERNEL_ERROR_EFAULT; } + if (std::strlen(name) >= ORBIS_KERNEL_MAXIMUM_NAME_LENGTH) { + LOG_ERROR(Kernel_Vmm, "name exceeds 32 bytes!"); + return ORBIS_KERNEL_ERROR_ENAMETOOLONG; + } + const VAddr in_addr = reinterpret_cast(*addr_in_out); const auto mem_prot = static_cast(prot); const auto map_flags = static_cast(flags); @@ -236,7 +242,7 @@ s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addr_in_out, std::size_t s32 PS4_SYSV_ABI sceKernelMapFlexibleMemory(void** addr_in_out, std::size_t len, int prot, int flags) { - return sceKernelMapNamedFlexibleMemory(addr_in_out, len, prot, flags, ""); + return sceKernelMapNamedFlexibleMemory(addr_in_out, len, prot, flags, "anon"); } int PS4_SYSV_ABI sceKernelQueryMemoryProtection(void* addr, void** start, void** end, u32* prot) { @@ -304,7 +310,7 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn case MemoryOpTypes::ORBIS_KERNEL_MAP_OP_MAP_DIRECT: { result = sceKernelMapNamedDirectMemory(&entries[i].start, entries[i].length, entries[i].protection, flags, - static_cast(entries[i].offset), 0, ""); + static_cast(entries[i].offset), 0, "anon"); LOG_INFO(Kernel_Vmm, "entry = {}, operation = {}, len = {:#x}, offset = {:#x}, type = {}, " "result = {}", @@ -326,7 +332,7 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn } case MemoryOpTypes::ORBIS_KERNEL_MAP_OP_MAP_FLEXIBLE: { result = sceKernelMapNamedFlexibleMemory(&entries[i].start, entries[i].length, - entries[i].protection, flags, ""); + entries[i].protection, flags, "anon"); LOG_INFO(Kernel_Vmm, "entry = {}, operation = {}, len = {:#x}, type = {}, " "result = {}", @@ -356,16 +362,16 @@ s32 PS4_SYSV_ABI sceKernelBatchMap2(OrbisKernelBatchMapEntry* entries, int numEn } s32 PS4_SYSV_ABI sceKernelSetVirtualRangeName(const void* addr, size_t len, const char* name) { - static constexpr size_t MaxNameSize = 32; - if (std::strlen(name) > MaxNameSize) { - LOG_ERROR(Kernel_Vmm, "name exceeds 32 bytes!"); - return ORBIS_KERNEL_ERROR_ENAMETOOLONG; - } - if (name == nullptr) { LOG_ERROR(Kernel_Vmm, "name is invalid!"); return 
ORBIS_KERNEL_ERROR_EFAULT; } + + if (std::strlen(name) >= ORBIS_KERNEL_MAXIMUM_NAME_LENGTH) { + LOG_ERROR(Kernel_Vmm, "name exceeds 32 bytes!"); + return ORBIS_KERNEL_ERROR_ENAMETOOLONG; + } + auto* memory = Core::Memory::Instance(); memory->NameVirtualRange(std::bit_cast(addr), len, name); return ORBIS_OK; diff --git a/src/core/libraries/kernel/memory.h b/src/core/libraries/kernel/memory.h index 400b6c3fc..6acb559d1 100644 --- a/src/core/libraries/kernel/memory.h +++ b/src/core/libraries/kernel/memory.h @@ -47,6 +47,8 @@ enum MemoryOpTypes : u32 { ORBIS_KERNEL_MAP_OP_TYPE_PROTECT = 4 }; +constexpr u32 ORBIS_KERNEL_MAXIMUM_NAME_LENGTH = 32; + struct OrbisQueryInfo { uintptr_t start; uintptr_t end; @@ -59,14 +61,12 @@ struct OrbisVirtualQueryInfo { size_t offset; s32 protection; s32 memory_type; - union { - BitField<0, 1, u32> is_flexible; - BitField<1, 1, u32> is_direct; - BitField<2, 1, u32> is_stack; - BitField<3, 1, u32> is_pooled; - BitField<4, 1, u32> is_committed; - }; - std::array name; + u32 is_flexible : 1; + u32 is_direct : 1; + u32 is_stack : 1; + u32 is_pooled : 1; + u32 is_committed : 1; + char name[ORBIS_KERNEL_MAXIMUM_NAME_LENGTH]; }; struct OrbisKernelBatchMapEntry { diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 494ffa70c..9861e813a 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -75,7 +75,8 @@ u64 MemoryManager::ClampRangeSize(VAddr virtual_addr, u64 size) { // Clamp size to the remaining size of the current VMA. auto vma = FindVMA(virtual_addr); - ASSERT_MSG(vma != vma_map.end(), "Attempted to access invalid GPU address {:#x}", virtual_addr); + ASSERT_MSG(vma->second.Contains(virtual_addr, 0), + "Attempted to access invalid GPU address {:#x}", virtual_addr); u64 clamped_size = vma->second.base + vma->second.size - virtual_addr; ++vma; @@ -96,6 +97,8 @@ u64 MemoryManager::ClampRangeSize(VAddr virtual_addr, u64 size) { bool MemoryManager::TryWriteBacking(void* address, const void* data, u32 num_bytes) { const VAddr virtual_addr = std::bit_cast(address); const auto& vma = FindVMA(virtual_addr)->second; + ASSERT_MSG(vma.Contains(virtual_addr, 0), + "Attempting to access out of bounds memory at address {:#x}", virtual_addr); if (vma.type != VMAType::Direct) { return false; } @@ -145,10 +148,12 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, auto mapping_end = mapping_start + size; // Find the first free, large enough dmem area in the range. - while ((!dmem_area->second.is_free || dmem_area->second.GetEnd() < mapping_end) && - dmem_area != dmem_map.end()) { + while (!dmem_area->second.is_free || dmem_area->second.GetEnd() < mapping_end) { // The current dmem_area isn't suitable, move to the next one. dmem_area++; + if (dmem_area == dmem_map.end()) { + break; + } // Update local variables based on the new dmem_area mapping_start = Common::AlignUp(dmem_area->second.base, alignment); @@ -172,7 +177,6 @@ void MemoryManager::Free(PAddr phys_addr, size_t size) { std::scoped_lock lk{mutex}; auto dmem_area = CarveDmemArea(phys_addr, size); - ASSERT(dmem_area != dmem_map.end() && dmem_area->second.size >= size); // Release any dmem mappings that reference this physical block. 
@@ -216,12 +220,18 @@ int MemoryManager::PoolReserve(void** out_addr, VAddr virtual_addr, size_t size,
             vma = FindVMA(mapped_addr)->second;
         }
         const size_t remaining_size = vma.base + vma.size - mapped_addr;
-        ASSERT_MSG(vma.type == VMAType::Free && remaining_size >= size);
+        ASSERT_MSG(vma.type == VMAType::Free && remaining_size >= size,
+                   "Memory region {:#x} to {:#x} is not large enough to reserve {:#x} to {:#x}",
+                   vma.base, vma.base + vma.size, virtual_addr, virtual_addr + size);
     }

     // Find the first free area starting with provided virtual address.
     if (False(flags & MemoryMapFlags::Fixed)) {
         mapped_addr = SearchFree(mapped_addr, size, alignment);
+        if (mapped_addr == -1) {
+            // No suitable memory areas to map to
+            return ORBIS_KERNEL_ERROR_ENOMEM;
+        }
     }

     // Add virtual memory area
@@ -229,7 +239,7 @@ int MemoryManager::PoolReserve(void** out_addr, VAddr virtual_addr, size_t size,
     auto& new_vma = new_vma_handle->second;
     new_vma.disallow_merge = True(flags & MemoryMapFlags::NoCoalesce);
     new_vma.prot = MemoryProt::NoAccess;
-    new_vma.name = "";
+    new_vma.name = "anon";
     new_vma.type = VMAType::PoolReserved;
     MergeAdjacent(vma_map, new_vma_handle);

@@ -247,19 +257,25 @@ int MemoryManager::Reserve(void** out_addr, VAddr virtual_addr, size_t size, Mem
     // Fixed mapping means the virtual address must exactly match the provided one.
     if (True(flags & MemoryMapFlags::Fixed)) {
-        auto& vma = FindVMA(mapped_addr)->second;
+        auto vma = FindVMA(mapped_addr)->second;
         // If the VMA is mapped, unmap the region first.
         if (vma.IsMapped()) {
             UnmapMemoryImpl(mapped_addr, size);
             vma = FindVMA(mapped_addr)->second;
         }
         const size_t remaining_size = vma.base + vma.size - mapped_addr;
-        ASSERT_MSG(vma.type == VMAType::Free && remaining_size >= size);
+        ASSERT_MSG(vma.type == VMAType::Free && remaining_size >= size,
+                   "Memory region {:#x} to {:#x} is not large enough to reserve {:#x} to {:#x}",
+                   vma.base, vma.base + vma.size, virtual_addr, virtual_addr + size);
     }

     // Find the first free area starting with provided virtual address.
     if (False(flags & MemoryMapFlags::Fixed)) {
         mapped_addr = SearchFree(mapped_addr, size, alignment);
+        if (mapped_addr == -1) {
+            // No suitable memory areas to map to
+            return ORBIS_KERNEL_ERROR_ENOMEM;
+        }
     }

     // Add virtual memory area
@@ -267,7 +283,7 @@ int MemoryManager::Reserve(void** out_addr, VAddr virtual_addr, size_t size, Mem
     auto& new_vma = new_vma_handle->second;
     new_vma.disallow_merge = True(flags & MemoryMapFlags::NoCoalesce);
     new_vma.prot = MemoryProt::NoAccess;
-    new_vma.name = "";
+    new_vma.name = "anon";
     new_vma.type = VMAType::Reserved;
     MergeAdjacent(vma_map, new_vma_handle);

@@ -288,7 +304,9 @@ int MemoryManager::PoolCommit(VAddr virtual_addr, size_t size, MemoryProt prot)
     // This should return SCE_KERNEL_ERROR_ENOMEM but shouldn't normally happen.
     const auto& vma = FindVMA(mapped_addr)->second;
     const size_t remaining_size = vma.base + vma.size - mapped_addr;
-    ASSERT_MSG(!vma.IsMapped() && remaining_size >= size);
+    ASSERT_MSG(!vma.IsMapped() && remaining_size >= size,
+               "Memory region {:#x} to {:#x} isn't free enough to map region {:#x} to {:#x}",
+               vma.base, vma.base + vma.size, virtual_addr, virtual_addr + size);

     // Perform the mapping.
     void* out_addr = impl.Map(mapped_addr, size, alignment, -1, false);
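PoolReserve, Reserve, MapMemory, and MapFile all pick up the same contract in this patch: SearchFree signals failure with a -1 sentinel and the caller returns ENOMEM instead of asserting. The sketch below shows that shape with simplified stand-in types; the error constant's value is arbitrary here and is not the real ORBIS error code.

    // Sketch only: FreeRange, SearchFreeSketch, and ReserveSketch are illustrative stand-ins.
    #include <cstdint>
    #include <vector>

    using VAddr = std::uint64_t;
    constexpr int kErrNoMem = -2; // arbitrary stand-in for ORBIS_KERNEL_ERROR_ENOMEM

    struct FreeRange {
        VAddr base;
        std::uint64_t size;
    };

    VAddr SearchFreeSketch(const std::vector<FreeRange>& free_ranges, std::uint64_t size) {
        for (const auto& range : free_ranges) {
            if (range.size >= size) {
                return range.base; // first fit
            }
        }
        return static_cast<VAddr>(-1); // no suitable memory areas to map to
    }

    int ReserveSketch(const std::vector<FreeRange>& free_ranges, std::uint64_t size, VAddr* out) {
        const VAddr mapped_addr = SearchFreeSketch(free_ranges, size);
        if (mapped_addr == static_cast<VAddr>(-1)) {
            return kErrNoMem; // propagate an error instead of asserting
        }
        *out = mapped_addr;
        return 0;
    }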
@@ -302,7 +320,10 @@ int MemoryManager::PoolCommit(VAddr virtual_addr, size_t size, MemoryProt prot)
     new_vma.is_exec = false;
     new_vma.phys_base = 0;

-    rasterizer->MapMemory(mapped_addr, size);
+    if (IsValidGpuMapping(mapped_addr, size)) {
+        rasterizer->MapMemory(mapped_addr, size);
+    }
+
     return ORBIS_OK;
 }

@@ -325,15 +346,34 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M
     // Fixed mapping means the virtual address must exactly match the provided one.
     if (True(flags & MemoryMapFlags::Fixed)) {
-        // This should return SCE_KERNEL_ERROR_ENOMEM but shouldn't normally happen.
-        const auto& vma = FindVMA(mapped_addr)->second;
-        const size_t remaining_size = vma.base + vma.size - mapped_addr;
-        ASSERT_MSG(!vma.IsMapped() && remaining_size >= size);
+        auto vma = FindVMA(mapped_addr)->second;
+        size_t remaining_size = vma.base + vma.size - mapped_addr;
+        // There's a possible edge case where we're mapping to a partially reserved range.
+        // To account for this, unmap any reserved areas within this mapping range first.
+        auto unmap_addr = mapped_addr;
+        auto unmap_size = size;
+        while (!vma.IsMapped() && unmap_addr < mapped_addr + size && remaining_size < size) {
+            auto unmapped = UnmapBytesFromEntry(unmap_addr, vma, unmap_size);
+            unmap_addr += unmapped;
+            unmap_size -= unmapped;
+            vma = FindVMA(unmap_addr)->second;
+        }
+
+        // This should return SCE_KERNEL_ERROR_ENOMEM but rarely happens.
+        vma = FindVMA(mapped_addr)->second;
+        remaining_size = vma.base + vma.size - mapped_addr;
+        ASSERT_MSG(!vma.IsMapped() && remaining_size >= size,
+                   "Memory region {:#x} to {:#x} isn't free enough to map region {:#x} to {:#x}",
+                   vma.base, vma.base + vma.size, virtual_addr, virtual_addr + size);
     }

     // Find the first free area starting with provided virtual address.
     if (False(flags & MemoryMapFlags::Fixed)) {
         mapped_addr = SearchFree(mapped_addr, size, alignment);
+        if (mapped_addr == -1) {
+            // No suitable memory areas to map to
+            return ORBIS_KERNEL_ERROR_ENOMEM;
+        }
     }

     // Perform the mapping.
@@ -353,7 +393,10 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M
     if (type == VMAType::Flexible) {
         flexible_usage += size;
     }
-    rasterizer->MapMemory(mapped_addr, size);
+
+    if (IsValidGpuMapping(mapped_addr, size)) {
+        rasterizer->MapMemory(mapped_addr, size);
+    }

     return ORBIS_OK;
 }
@@ -366,12 +409,18 @@ int MemoryManager::MapFile(void** out_addr, VAddr virtual_addr, size_t size, Mem
     // Find first free area to map the file.
     if (False(flags & MemoryMapFlags::Fixed)) {
         mapped_addr = SearchFree(mapped_addr, size_aligned, 1);
+        if (mapped_addr == -1) {
+            // No suitable memory areas to map to
+            return ORBIS_KERNEL_ERROR_ENOMEM;
+        }
     }

     if (True(flags & MemoryMapFlags::Fixed)) {
         const auto& vma = FindVMA(virtual_addr)->second;
         const size_t remaining_size = vma.base + vma.size - virtual_addr;
-        ASSERT_MSG(!vma.IsMapped() && remaining_size >= size);
+        ASSERT_MSG(!vma.IsMapped() && remaining_size >= size,
+                   "Memory region {:#x} to {:#x} isn't free enough to map region {:#x} to {:#x}",
+                   vma.base, vma.base + vma.size, virtual_addr, virtual_addr + size);
     }

     // Map the file.
@@ -404,7 +453,9 @@ void MemoryManager::PoolDecommit(VAddr virtual_addr, size_t size) {
     const auto start_in_vma = virtual_addr - vma_base_addr;
     const auto type = vma_base.type;

-    rasterizer->UnmapMemory(virtual_addr, size);
+    if (IsValidGpuMapping(virtual_addr, size)) {
+        rasterizer->UnmapMemory(virtual_addr, size);
+    }

     // Mark region as free and attempt to coalesce it with neighbours.
     const auto new_it = CarveVMA(virtual_addr, size);
@@ -444,7 +495,10 @@ u64 MemoryManager::UnmapBytesFromEntry(VAddr virtual_addr, VirtualMemoryArea vma
     if (type == VMAType::Flexible) {
         flexible_usage -= adjusted_size;
     }
-    rasterizer->UnmapMemory(virtual_addr, adjusted_size);
+
+    if (IsValidGpuMapping(virtual_addr, adjusted_size)) {
+        rasterizer->UnmapMemory(virtual_addr, adjusted_size);
+    }

     // Mark region as free and attempt to coalesce it with neighbours.
     const auto new_it = CarveVMA(virtual_addr, adjusted_size);
@@ -471,6 +525,8 @@ s32 MemoryManager::UnmapMemoryImpl(VAddr virtual_addr, u64 size) {
     do {
         auto it = FindVMA(virtual_addr + unmapped_bytes);
         auto& vma_base = it->second;
+        ASSERT_MSG(vma_base.Contains(virtual_addr + unmapped_bytes, 0),
+                   "Address {:#x} is out of bounds", virtual_addr + unmapped_bytes);
         auto unmapped = UnmapBytesFromEntry(virtual_addr + unmapped_bytes, vma_base,
                                             size - unmapped_bytes);
         ASSERT_MSG(unmapped > 0, "Failed to unmap memory, progress is impossible");
@@ -485,7 +541,10 @@ int MemoryManager::QueryProtection(VAddr addr, void** start, void** end, u32* pr
     const auto it = FindVMA(addr);
     const auto& vma = it->second;

-    ASSERT_MSG(vma.type != VMAType::Free, "Provided address is not mapped");
+    if (!vma.Contains(addr, 0) || vma.IsFree()) {
+        LOG_ERROR(Kernel_Vmm, "Address {:#x} is not mapped", addr);
+        return ORBIS_KERNEL_ERROR_EACCES;
+    }

     if (start != nullptr) {
         *start = reinterpret_cast(vma.base);
@@ -555,6 +614,8 @@ s32 MemoryManager::Protect(VAddr addr, size_t size, MemoryProt prot) {
     do {
         auto it = FindVMA(addr + protected_bytes);
         auto& vma_base = it->second;
+        ASSERT_MSG(vma_base.Contains(addr + protected_bytes, 0), "Address {:#x} is out of bounds",
+                   addr + protected_bytes);
         auto result = 0;
         result = ProtectBytes(addr + protected_bytes, vma_base, size - protected_bytes, prot);
         if (result < 0) {
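QueryProtection above now degrades gracefully: an address that does not land inside a mapped VMA produces EACCES rather than tripping an assert. A rough sketch of that fail-soft lookup over a sorted map of ranges; the types and error value are simplified stand-ins for the emulator's VMA map.

    // Sketch only: Range and QueryProtectionSketch are illustrative stand-ins.
    #include <cstdint>
    #include <map>

    constexpr int kErrAccess = -2; // arbitrary stand-in for ORBIS_KERNEL_ERROR_EACCES

    struct Range {
        std::uint64_t base;
        std::uint64_t size;
        bool mapped;
    };

    int QueryProtectionSketch(const std::map<std::uint64_t, Range>& vmas, std::uint64_t addr,
                              Range* out) {
        auto it = vmas.upper_bound(addr);
        if (it == vmas.begin()) {
            return kErrAccess; // no range starts at or below addr
        }
        --it; // candidate range starting at or below addr
        const Range& range = it->second;
        if (!range.mapped || addr >= range.base + range.size) {
            return kErrAccess; // address is not inside a mapped region
        }
        *out = range;
        return 0;
    }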
@@ -571,8 +632,16 @@ int MemoryManager::VirtualQuery(VAddr addr, int flags,
                                 ::Libraries::Kernel::OrbisVirtualQueryInfo* info) {
     std::scoped_lock lk{mutex};

-    auto it = FindVMA(addr);
-    if (it->second.type == VMAType::Free && flags == 1) {
+    // FindVMA on addresses before the vma_map return garbage data.
+    auto query_addr =
+        addr < impl.SystemManagedVirtualBase() ? impl.SystemManagedVirtualBase() : addr;
+    if (addr < query_addr && flags == 0) {
+        LOG_WARNING(Kernel_Vmm, "VirtualQuery on free memory region");
+        return ORBIS_KERNEL_ERROR_EACCES;
+    }
+    auto it = FindVMA(query_addr);
+
+    while (it->second.type == VMAType::Free && flags == 1 && it != --vma_map.end()) {
         ++it;
     }
     if (it->second.type == VMAType::Free) {
@@ -585,15 +654,17 @@ int MemoryManager::VirtualQuery(VAddr addr, int flags,
     info->end = vma.base + vma.size;
     info->offset = vma.phys_base;
     info->protection = static_cast(vma.prot);
-    info->is_flexible.Assign(vma.type == VMAType::Flexible);
-    info->is_direct.Assign(vma.type == VMAType::Direct);
-    info->is_stack.Assign(vma.type == VMAType::Stack);
-    info->is_pooled.Assign(vma.type == VMAType::PoolReserved || vma.type == VMAType::Pooled);
-    info->is_committed.Assign(vma.IsMapped());
-    vma.name.copy(info->name.data(), std::min(info->name.size(), vma.name.size()));
+    info->is_flexible = vma.type == VMAType::Flexible ? 1 : 0;
+    info->is_direct = vma.type == VMAType::Direct ? 1 : 0;
+    info->is_stack = vma.type == VMAType::Stack ? 1 : 0;
+    info->is_pooled = vma.type == VMAType::PoolReserved || vma.type == VMAType::Pooled ? 1 : 0;
+    info->is_committed = vma.IsMapped() ? 1 : 0;
+
+    strncpy(info->name, vma.name.data(), ::Libraries::Kernel::ORBIS_KERNEL_MAXIMUM_NAME_LENGTH);
+
     if (vma.type == VMAType::Direct) {
         const auto dmem_it = FindDmemArea(vma.phys_base);
-        ASSERT(dmem_it != dmem_map.end());
+        ASSERT_MSG(vma.phys_base <= dmem_it->second.GetEnd(), "vma.phys_base is not in dmem_map!");
         info->memory_type = dmem_it->second.memory_type;
     } else {
         info->memory_type = ::Libraries::Kernel::SCE_KERNEL_WB_ONION;
@@ -607,11 +678,11 @@ int MemoryManager::DirectMemoryQuery(PAddr addr, bool find_next,
     std::scoped_lock lk{mutex};

     auto dmem_area = FindDmemArea(addr);
-    while (dmem_area != dmem_map.end() && dmem_area->second.is_free && find_next) {
+    while (dmem_area != --dmem_map.end() && dmem_area->second.is_free && find_next) {
         dmem_area++;
     }

-    if (dmem_area == dmem_map.end() || dmem_area->second.is_free) {
+    if (dmem_area->second.is_free) {
         LOG_ERROR(Core, "Unable to find allocated direct memory region to query!");
         return ORBIS_KERNEL_ERROR_EACCES;
     }
@@ -691,36 +762,56 @@ VAddr MemoryManager::SearchFree(VAddr virtual_addr, size_t size, u32 alignment)
         virtual_addr = min_search_address;
     }

+    // If the requested address is beyond the maximum our code can handle, throw an assert
+    auto max_search_address = impl.UserVirtualBase() + impl.UserVirtualSize();
+    ASSERT_MSG(virtual_addr <= max_search_address, "Input address {:#x} is out of bounds",
+               virtual_addr);
+
     auto it = FindVMA(virtual_addr);
-    ASSERT_MSG(it != vma_map.end(), "Specified mapping address was not found!");

     // If the VMA is free and contains the requested mapping we are done.
     if (it->second.IsFree() && it->second.Contains(virtual_addr, size)) {
         return virtual_addr;
     }
+
     // Search for the first free VMA that fits our mapping.
-    const auto is_suitable = [&] {
+    while (it != vma_map.end()) {
         if (!it->second.IsFree()) {
-            return false;
+            it++;
+            continue;
         }
+
         const auto& vma = it->second;
         virtual_addr = Common::AlignUp(vma.base, alignment);
         // Sometimes the alignment itself might be larger than the VMA.
         if (virtual_addr > vma.base + vma.size) {
-            return false;
+            it++;
+            continue;
         }
+
+        // Make sure the address is within our defined bounds
+        if (virtual_addr >= max_search_address) {
+            // There are no free mappings within our safely usable address space.
+            break;
+        }
+
+        // If there's enough space in the VMA, return the address.
         const size_t remaining_size = vma.base + vma.size - virtual_addr;
-        return remaining_size >= size;
-    };
-    while (!is_suitable()) {
-        ++it;
+        if (remaining_size >= size) {
+            return virtual_addr;
+        }
+        it++;
     }
-    return virtual_addr;
+
+    // Couldn't find a suitable VMA, return an error.
+    LOG_ERROR(Kernel_Vmm, "Couldn't find a free mapping for address {:#x}, size {:#x}",
+              virtual_addr, size);
+    return -1;
 }

 MemoryManager::VMAHandle MemoryManager::CarveVMA(VAddr virtual_addr, size_t size) {
     auto vma_handle = FindVMA(virtual_addr);
-    ASSERT_MSG(vma_handle != vma_map.end(), "Virtual address not in vm_map");
+    ASSERT_MSG(vma_handle->second.Contains(virtual_addr, 0), "Virtual address not in vm_map");

     const VirtualMemoryArea& vma = vma_handle->second;
     ASSERT_MSG(vma.base <= virtual_addr, "Adding a mapping to already mapped region");
@@ -749,7 +840,7 @@ MemoryManager::VMAHandle MemoryManager::CarveVMA(VAddr virtual_addr, size_t size

 MemoryManager::DMemHandle MemoryManager::CarveDmemArea(PAddr addr, size_t size) {
     auto dmem_handle = FindDmemArea(addr);
-    ASSERT_MSG(dmem_handle != dmem_map.end(), "Physical address not in dmem_map");
+    ASSERT_MSG(addr <= dmem_handle->second.GetEnd(), "Physical address not in dmem_map");

     const DirectMemoryArea& area = dmem_handle->second;
     ASSERT_MSG(area.base <= addr, "Adding an allocation to already allocated region");
@@ -804,7 +895,7 @@ int MemoryManager::GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut,

     auto dmem_area = FindDmemArea(addr);

-    if (dmem_area == dmem_map.end() || dmem_area->second.is_free) {
+    if (addr > dmem_area->second.GetEnd() || dmem_area->second.is_free) {
         LOG_ERROR(Core, "Unable to find allocated direct memory region to check type!");
         return ORBIS_KERNEL_ERROR_ENOENT;
     }
diff --git a/src/core/memory.h b/src/core/memory.h
index a6a55e288..3a204eb96 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -157,6 +157,12 @@ public:
         return impl.SystemReservedVirtualBase();
     }

+    bool IsValidGpuMapping(VAddr virtual_addr, u64 size) {
+        // The PS4's GPU can only handle 40 bit addresses.
+        const VAddr max_gpu_address{0x10000000000};
+        return virtual_addr + size < max_gpu_address;
+    }
+
     bool IsValidAddress(const void* addr) const noexcept {
         const VAddr virtual_addr = reinterpret_cast(addr);
         const auto end_it = std::prev(vma_map.end());
@@ -186,7 +192,7 @@ public:
     int PoolCommit(VAddr virtual_addr, size_t size, MemoryProt prot);

     int MapMemory(void** out_addr, VAddr virtual_addr, size_t size, MemoryProt prot,
-                  MemoryMapFlags flags, VMAType type, std::string_view name = "",
+                  MemoryMapFlags flags, VMAType type, std::string_view name = "anon",
                   bool is_exec = false, PAddr phys_addr = -1, u64 alignment = 0);

     int MapFile(void** out_addr, VAddr virtual_addr, size_t size, MemoryProt prot,
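The IsValidGpuMapping helper added to memory.h is what gates every rasterizer MapMemory/UnmapMemory call in the hunks above. Below is a self-contained copy of the check with a tiny usage example; the constant and comparison mirror the patch, while main() and the sample addresses are illustrative only.

    // Standalone illustration of the 40-bit GPU address guard; main() is illustrative only.
    #include <cstdint>
    #include <cstdio>

    using VAddr = std::uint64_t;

    bool IsValidGpuMapping(VAddr virtual_addr, std::uint64_t size) {
        // The PS4's GPU can only handle 40 bit addresses.
        const VAddr max_gpu_address{0x10000000000};
        return virtual_addr + size < max_gpu_address;
    }

    int main() {
        std::printf("%d\n", IsValidGpuMapping(0x800000000, 0x10000));      // 1: well below the limit
        std::printf("%d\n", IsValidGpuMapping(0xFF00000000, 0x200000000)); // 0: crosses the 40-bit limit
    }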