From 6eeced5b93c9dd3131cc55e3e31a32784ee73f98 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Thu, 3 Apr 2025 12:08:05 -0700 Subject: [PATCH] cpu_patches: Remove CPU patches for macOS and bump minimum OS version to 15.4 --- CMakeLists.txt | 3 +- src/core/cpu_patches.cpp | 240 +++------------------------------------ src/core/cpu_patches.h | 6 - src/core/tls.cpp | 4 - 4 files changed, 17 insertions(+), 236 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b3aba5d13..228a664eb 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) if(APPLE) list(APPEND ADDITIONAL_LANGUAGES OBJC) - set(CMAKE_OSX_DEPLOYMENT_TARGET 14) + # Starting with 15.4, Rosetta 2 has support for all the necessary instruction sets. + set(CMAKE_OSX_DEPLOYMENT_TARGET 15.4) endif() if (NOT CMAKE_BUILD_TYPE) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index a9f6c67a8..118485228 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -22,10 +22,6 @@ #include #else #include -#ifdef __APPLE__ -#include -#include -#endif #endif using namespace Xbyak::util; @@ -129,67 +125,7 @@ static Xbyak::Reg AllocateScratchRegister( UNREACHABLE_MSG("Out of scratch registers!"); } -#ifdef __APPLE__ - -static pthread_key_t stack_pointer_slot; -static pthread_key_t patch_stack_slot; -static std::once_flag patch_context_slots_init_flag; -static constexpr u32 patch_stack_size = 0x1000; - -static_assert(sizeof(void*) == sizeof(u64), - "Cannot fit a register inside a thread local storage slot."); - -static void FreePatchStack(void* patch_stack) { - // Subtract back to the bottom of the stack for free. - std::free(static_cast(patch_stack) - patch_stack_size); -} - -static void InitializePatchContextSlots() { - ASSERT_MSG(pthread_key_create(&stack_pointer_slot, nullptr) == 0, - "Unable to allocate thread-local register for stack pointer."); - ASSERT_MSG(pthread_key_create(&patch_stack_slot, FreePatchStack) == 0, - "Unable to allocate thread-local register for patch stack."); -} - -void InitializeThreadPatchStack() { - std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots); - - pthread_setspecific(patch_stack_slot, - static_cast(std::malloc(patch_stack_size)) + patch_stack_size); -} - -/// Saves the stack pointer to thread local storage and loads the patch stack. -static void SaveStack(Xbyak::CodeGenerator& c) { - std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots); - - // Save original stack pointer and load patch stack. - c.putSeg(gs); - c.mov(qword[reinterpret_cast(stack_pointer_slot * sizeof(void*))], rsp); - c.putSeg(gs); - c.mov(rsp, qword[reinterpret_cast(patch_stack_slot * sizeof(void*))]); -} - -/// Restores the stack pointer from thread local storage. -static void RestoreStack(Xbyak::CodeGenerator& c) { - std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots); - - // Save patch stack pointer and load original stack. - c.putSeg(gs); - c.mov(qword[reinterpret_cast(patch_stack_slot * sizeof(void*))], rsp); - c.putSeg(gs); - c.mov(rsp, qword[reinterpret_cast(stack_pointer_slot * sizeof(void*))]); -} - -/// Validates that the dst register is supported given the SaveStack/RestoreStack implementation. -static void ValidateDst(const Xbyak::Reg& dst) { - // No restrictions. -} - -#else - -void InitializeThreadPatchStack() { - // No-op -} +#if !defined(__APPLE__) // NOTE: Since stack pointer here is subtracted through safe zone and not saved anywhere, // it must not be modified during the instruction. Otherwise, we will not be able to find @@ -212,8 +148,6 @@ static void ValidateDst(const Xbyak::Reg& dst) { ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "Stack pointer not supported as destination."); } -#endif - /// Switches to the patch stack, saves registers, and restores the original stack. static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { // Uses a more robust solution for saving registers on MacOS to avoid potential stack corruption @@ -472,147 +406,6 @@ static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerat RestoreRegisters(c, {scratch}); } -#ifdef __APPLE__ - -static __attribute__((sysv_abi)) void PerformVCVTPH2PS(float* out, const half_float::half* in, - const u32 count) { - for (u32 i = 0; i < count; i++) { - out[i] = half_float::half_cast(in[i]); - } -} - -static void GenerateVCVTPH2PS(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - - const auto float_count = dst.getBit() / 32; - const auto byte_count = float_count * 4; - - SaveContext(c, true); - - // Allocate stack space for outputs and load into first parameter. - c.sub(rsp, byte_count); - c.mov(rdi, rsp); - - if (src->isXMM()) { - // Allocate stack space for inputs and load into second parameter. - c.sub(rsp, byte_count); - c.mov(rsi, rsp); - - // Move input to the allocated space. - c.movdqu(ptr[rsp], *reinterpret_cast(src.get())); - } else { - c.lea(rsi, src->getAddress()); - } - - // Load float count into third parameter. - c.mov(rdx, float_count); - - c.mov(rax, reinterpret_cast(PerformVCVTPH2PS)); - c.call(rax); - - if (src->isXMM()) { - // Clean up after inputs space. - c.add(rsp, byte_count); - } - - // Load outputs into destination register and clean up space. - if (dst.isYMM()) { - c.vmovdqu(*reinterpret_cast(&dst), ptr[rsp]); - } else { - c.movdqu(*reinterpret_cast(&dst), ptr[rsp]); - } - c.add(rsp, byte_count); - - RestoreContext(c, dst, true); -} - -using SingleToHalfFloatConverter = half_float::half (*)(float); -static const SingleToHalfFloatConverter SingleToHalfFloatConverters[4] = { - half_float::half_cast, - half_float::half_cast, - half_float::half_cast, - half_float::half_cast, -}; - -static __attribute__((sysv_abi)) void PerformVCVTPS2PH(half_float::half* out, const float* in, - const u32 count, const u8 rounding_mode) { - const auto conversion_func = SingleToHalfFloatConverters[rounding_mode]; - - for (u32 i = 0; i < count; i++) { - out[i] = conversion_func(in[i]); - } -} - -static void GenerateVCVTPS2PH(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakOperand(operands[0]); - const auto src = ZydisToXbyakRegisterOperand(operands[1]); - const auto ctrl = ZydisToXbyakImmediateOperand(operands[2]); - - const auto float_count = src.getBit() / 32; - const auto byte_count = float_count * 4; - - SaveContext(c, true); - - if (dst->isXMM()) { - // Allocate stack space for outputs and load into first parameter. - c.sub(rsp, byte_count); - c.mov(rdi, rsp); - } else { - c.lea(rdi, dst->getAddress()); - } - - // Allocate stack space for inputs and load into second parameter. - c.sub(rsp, byte_count); - c.mov(rsi, rsp); - - // Move input to the allocated space. - if (src.isYMM()) { - c.vmovdqu(ptr[rsp], *reinterpret_cast(&src)); - } else { - c.movdqu(ptr[rsp], *reinterpret_cast(&src)); - } - - // Load float count into third parameter. - c.mov(rdx, float_count); - - // Load rounding mode into fourth parameter. - if (ctrl & 4) { - // Load from MXCSR.RC. - c.stmxcsr(ptr[rsp - 4]); - c.mov(rcx, ptr[rsp - 4]); - c.shr(rcx, 13); - c.and_(rcx, 3); - } else { - c.mov(rcx, ctrl & 3); - } - - c.mov(rax, reinterpret_cast(PerformVCVTPS2PH)); - c.call(rax); - - // Clean up after inputs space. - c.add(rsp, byte_count); - - if (dst->isXMM()) { - // Load outputs into destination register and clean up space. - c.movdqu(*reinterpret_cast(dst.get()), ptr[rsp]); - c.add(rsp, byte_count); - } - - RestoreContext(c, *dst, true); -} - -static bool FilterRosetta2Only(const ZydisDecodedOperand*) { - int ret = 0; - size_t size = sizeof(ret); - if (sysctlbyname("sysctl.proc_translated", &ret, &size, nullptr, 0) != 0) { - return false; - } - return ret; -} - -#else // __APPLE__ - static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { const auto& dst_op = operands[0]; const auto& src_op = operands[1]; @@ -657,18 +450,18 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe #endif } -#endif // __APPLE__ +static bool FilterNoBMI1(const ZydisDecodedOperand*) { + Cpu cpu; + return !cpu.has(Cpu::tBMI1); +} + +#endif // !defined(__APPLE__) static bool FilterNoSSE4a(const ZydisDecodedOperand*) { Cpu cpu; return !cpu.has(Cpu::tSSE4a); } -static bool FilterNoBMI1(const ZydisDecodedOperand*) { - Cpu cpu; - return !cpu.has(Cpu::tBMI1); -} - static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; @@ -940,16 +733,19 @@ struct PatchInfo { }; static const std::unordered_map Patches = { + // SSE4a + {ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}}, + {ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}}, + +#if !defined(__APPLE__) + // TLS access #if defined(_WIN32) // Windows needs a trampoline. {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}}, -#elif !defined(__APPLE__) +#else {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}}, #endif - {ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}}, - {ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}}, - // BMI1 {ZYDIS_MNEMONIC_ANDN, {FilterNoBMI1, GenerateANDN, true}}, {ZYDIS_MNEMONIC_BEXTR, {FilterNoBMI1, GenerateBEXTR, true}}, @@ -957,13 +753,7 @@ static const std::unordered_map Patches = { {ZYDIS_MNEMONIC_BLSMSK, {FilterNoBMI1, GenerateBLSMSK, true}}, {ZYDIS_MNEMONIC_BLSR, {FilterNoBMI1, GenerateBLSR, true}}, {ZYDIS_MNEMONIC_TZCNT, {FilterNoBMI1, GenerateTZCNT, true}}, - -#ifdef __APPLE__ - // Patches for instruction sets not supported by Rosetta 2. - // F16C - {ZYDIS_MNEMONIC_VCVTPH2PS, {FilterRosetta2Only, GenerateVCVTPH2PS, true}}, - {ZYDIS_MNEMONIC_VCVTPS2PH, {FilterRosetta2Only, GenerateVCVTPS2PH, true}}, -#endif +#endif // !defined(__APPLE__) }; static std::once_flag init_flag; diff --git a/src/core/cpu_patches.h b/src/core/cpu_patches.h index 1ccac073a..7a0546046 100644 --- a/src/core/cpu_patches.h +++ b/src/core/cpu_patches.h @@ -7,12 +7,6 @@ namespace Core { -/// Initializes a stack for the current thread for use by patch implementations. -void InitializeThreadPatchStack(); - -/// Cleans up the patch stack for the current thread. -void CleanupThreadPatchStack(); - /// Registers a module for patching, providing an area to generate trampoline code. void RegisterPatchModule(void* module_ptr, u64 module_size, void* trampoline_area_ptr, u64 trampoline_area_size); diff --git a/src/core/tls.cpp b/src/core/tls.cpp index 9b3178171..5dd05b4a0 100644 --- a/src/core/tls.cpp +++ b/src/core/tls.cpp @@ -5,7 +5,6 @@ #include "common/arch.h" #include "common/assert.h" #include "common/types.h" -#include "core/cpu_patches.h" #include "core/libraries/kernel/threads/pthread.h" #include "core/tls.h" @@ -198,9 +197,6 @@ thread_local std::once_flag init_tls_flag; void EnsureThreadInitialized() { std::call_once(init_tls_flag, [] { -#ifdef ARCH_X86_64 - InitializeThreadPatchStack(); -#endif SetTcbBase(Libraries::Kernel::g_curthread->tcb); }); }