cpu_patches: Remove CPU patches for macOS and bump minimum OS version to 15.4

This commit is contained in:
squidbus 2025-04-03 12:08:05 -07:00
parent afd0251dd2
commit 6eeced5b93
4 changed files with 17 additions and 236 deletions

View File

@ -9,7 +9,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED True)
if(APPLE) if(APPLE)
list(APPEND ADDITIONAL_LANGUAGES OBJC) list(APPEND ADDITIONAL_LANGUAGES OBJC)
set(CMAKE_OSX_DEPLOYMENT_TARGET 14) # Starting with 15.4, Rosetta 2 has support for all the necessary instruction sets.
set(CMAKE_OSX_DEPLOYMENT_TARGET 15.4)
endif() endif()
if (NOT CMAKE_BUILD_TYPE) if (NOT CMAKE_BUILD_TYPE)

View File

@ -22,10 +22,6 @@
#include <windows.h> #include <windows.h>
#else #else
#include <pthread.h> #include <pthread.h>
#ifdef __APPLE__
#include <half.hpp>
#include <sys/sysctl.h>
#endif
#endif #endif
using namespace Xbyak::util; using namespace Xbyak::util;
@ -129,67 +125,7 @@ static Xbyak::Reg AllocateScratchRegister(
UNREACHABLE_MSG("Out of scratch registers!"); UNREACHABLE_MSG("Out of scratch registers!");
} }
#ifdef __APPLE__ #if !defined(__APPLE__)
// pthread key whose per-thread value receives the stack pointer stored by SaveStack.
static pthread_key_t stack_pointer_slot;
// pthread key whose per-thread value holds that thread's patch stack pointer; it is created
// with FreePatchStack as its destructor.
static pthread_key_t patch_stack_slot;
// Guards the one-time creation of the two keys above.
static std::once_flag patch_context_slots_init_flag;
// Size in bytes of each per-thread patch stack allocation.
static constexpr u32 patch_stack_size = 0x1000;
// Generated code moves rsp directly in and out of the slots, so a slot must be register-sized.
static_assert(sizeof(void*) == sizeof(u64),
"Cannot fit a register inside a thread local storage slot.");
static void FreePatchStack(void* patch_stack) {
// Subtract back to the bottom of the stack for free.
std::free(static_cast<u8*>(patch_stack) - patch_stack_size);
}
static void InitializePatchContextSlots() {
ASSERT_MSG(pthread_key_create(&stack_pointer_slot, nullptr) == 0,
"Unable to allocate thread-local register for stack pointer.");
ASSERT_MSG(pthread_key_create(&patch_stack_slot, FreePatchStack) == 0,
"Unable to allocate thread-local register for patch stack.");
}
void InitializeThreadPatchStack() {
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
pthread_setspecific(patch_stack_slot,
static_cast<u8*>(std::malloc(patch_stack_size)) + patch_stack_size);
}
/// Saves the stack pointer to thread local storage and loads the patch stack.
/// Only emits instructions into `c`; the sole work done at generation time is making sure
/// the pthread keys exist.
static void SaveStack(Xbyak::CodeGenerator& c) {
// The key values are baked into the emitted addresses below, so the keys must exist first.
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
// Save original stack pointer and load patch stack.
// Each mov is preceded by a gs segment prefix and addresses offset key * sizeof(void*).
// NOTE(review): presumably relies on the platform keeping pthread-specific values in a
// gs-relative array indexed by key -- confirm against the target OS.
c.putSeg(gs);
c.mov(qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))], rsp);
c.putSeg(gs);
c.mov(rsp, qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))]);
}
/// Restores the stack pointer from thread local storage.
/// Mirror image of SaveStack: only emits instructions into `c`.
static void RestoreStack(Xbyak::CodeGenerator& c) {
// The key values are baked into the emitted addresses below, so the keys must exist first.
std::call_once(patch_context_slots_init_flag, InitializePatchContextSlots);
// Save patch stack pointer and load original stack.
// The current rsp is written back to the patch stack slot before rsp is reloaded from the
// stack pointer slot; both accesses carry a gs segment prefix.
c.putSeg(gs);
c.mov(qword[reinterpret_cast<void*>(patch_stack_slot * sizeof(void*))], rsp);
c.putSeg(gs);
c.mov(rsp, qword[reinterpret_cast<void*>(stack_pointer_slot * sizeof(void*))]);
}
/// Checks that `dst` is usable as a destination under this SaveStack/RestoreStack scheme.
/// This implementation places no restrictions on the destination register.
static void ValidateDst([[maybe_unused]] const Xbyak::Reg& dst) {}
#else
/// Intentionally empty: this variant sets up no per-thread patch stack.
void InitializeThreadPatchStack() {}
// NOTE: Since stack pointer here is subtracted through safe zone and not saved anywhere, // NOTE: Since stack pointer here is subtracted through safe zone and not saved anywhere,
// it must not be modified during the instruction. Otherwise, we will not be able to find // it must not be modified during the instruction. Otherwise, we will not be able to find
@ -212,8 +148,6 @@ static void ValidateDst(const Xbyak::Reg& dst) {
ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "Stack pointer not supported as destination."); ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "Stack pointer not supported as destination.");
} }
#endif
/// Switches to the patch stack, saves registers, and restores the original stack. /// Switches to the patch stack, saves registers, and restores the original stack.
static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) { static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list<Xbyak::Reg> regs) {
// Uses a more robust solution for saving registers on MacOS to avoid potential stack corruption // Uses a more robust solution for saving registers on MacOS to avoid potential stack corruption
@ -472,147 +406,6 @@ static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerat
RestoreRegisters(c, {scratch}); RestoreRegisters(c, {scratch});
} }
#ifdef __APPLE__
/// Widens `count` half-precision values read from `in` into single-precision values in `out`.
/// Declared with the System V ABI because generated code calls it with the arguments loaded
/// into rdi, rsi and rdx.
static __attribute__((sysv_abi)) void PerformVCVTPH2PS(float* out, const half_float::half* in,
                                                       const u32 count) {
    const half_float::half* const end = in + count;
    while (in != end) {
        *out++ = half_float::half_cast<float>(*in++);
    }
}
/// Emits a software replacement for VCVTPH2PS that calls PerformVCVTPH2PS.
/// operands[0] is the destination register; operands[1] is the source, either an XMM
/// register or a memory operand.
static void GenerateVCVTPH2PS(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
const auto src = ZydisToXbyakOperand(operands[1]);
// One output float per 32 bits of the destination register.
const auto float_count = dst.getBit() / 32;
const auto byte_count = float_count * 4;
SaveContext(c, true);
// Allocate stack space for outputs and load into first parameter.
c.sub(rsp, byte_count);
c.mov(rdi, rsp);
if (src->isXMM()) {
// Allocate stack space for inputs and load into second parameter.
// The input scratch area is sized like the output area (byte_count).
c.sub(rsp, byte_count);
c.mov(rsi, rsp);
// Move input to the allocated space.
c.movdqu(ptr[rsp], *reinterpret_cast<Xbyak::Xmm*>(src.get()));
} else {
// Memory source: pass its effective address directly as the second parameter.
c.lea(rsi, src->getAddress());
}
// Load float count into third parameter.
c.mov(rdx, float_count);
// Call the helper through rax using its absolute address.
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPH2PS));
c.call(rax);
if (src->isXMM()) {
// Clean up after inputs space.
c.add(rsp, byte_count);
}
// Load outputs into destination register and clean up space.
// NOTE(review): the XMM path uses legacy-encoded movdqu, which leaves the upper bits of the
// containing YMM register unchanged, whereas the VEX-encoded original zeroes them -- confirm
// whether RestoreContext makes this irrelevant.
if (dst.isYMM()) {
c.vmovdqu(*reinterpret_cast<const Xbyak::Ymm*>(&dst), ptr[rsp]);
} else {
c.movdqu(*reinterpret_cast<const Xbyak::Xmm*>(&dst), ptr[rsp]);
}
c.add(rsp, byte_count);
// dst is passed along; presumably so that the freshly written destination is not
// overwritten by the restore -- confirm against RestoreContext.
RestoreContext(c, dst, true);
}
/// Signature of a function converting one single-precision value to half precision.
using SingleToHalfFloatConverter = half_float::half (*)(float);
/// Conversion routines indexed by a 2-bit rounding mode: 0 = to nearest, 1 = toward negative
/// infinity, 2 = toward positive infinity, 3 = toward zero. GenerateVCVTPS2PH derives the
/// index from either the instruction immediate or bits 13-14 of MXCSR.
static const SingleToHalfFloatConverter SingleToHalfFloatConverters[4] = {
half_float::half_cast<half_float::half, std::round_to_nearest, float>,
half_float::half_cast<half_float::half, std::round_toward_neg_infinity, float>,
half_float::half_cast<half_float::half, std::round_toward_infinity, float>,
half_float::half_cast<half_float::half, std::round_toward_zero, float>,
};
/// Narrows `count` single-precision values read from `in` into half-precision values in `out`,
/// using the converter selected by `rounding_mode` from SingleToHalfFloatConverters.
/// `rounding_mode` is used as a table index without range checking.
static __attribute__((sysv_abi)) void PerformVCVTPS2PH(half_float::half* out, const float* in,
                                                       const u32 count, const u8 rounding_mode) {
    const SingleToHalfFloatConverter convert = SingleToHalfFloatConverters[rounding_mode];
    for (const float* const end = in + count; in != end; ++in, ++out) {
        *out = convert(*in);
    }
}
/// Emits a software replacement for VCVTPS2PH that calls PerformVCVTPS2PH.
/// operands[0] is the destination (XMM register or memory), operands[1] the source register
/// and operands[2] the immediate control byte.
static void GenerateVCVTPS2PH(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
const auto dst = ZydisToXbyakOperand(operands[0]);
const auto src = ZydisToXbyakRegisterOperand(operands[1]);
const auto ctrl = ZydisToXbyakImmediateOperand(operands[2]);
// One input float per 32 bits of the source register.
const auto float_count = src.getBit() / 32;
const auto byte_count = float_count * 4;
SaveContext(c, true);
if (dst->isXMM()) {
// Allocate stack space for outputs and load into first parameter.
// NOTE(review): the helper writes only float_count half values (half of byte_count) and the
// buffer is not cleared first, so for an XMM source the later 16-byte movdqu also loads 8
// bytes that were never written -- confirm whether the upper destination bits should be zero.
c.sub(rsp, byte_count);
c.mov(rdi, rsp);
} else {
// Memory destination: the helper writes straight to the effective address.
c.lea(rdi, dst->getAddress());
}
// Allocate stack space for inputs and load into second parameter.
c.sub(rsp, byte_count);
c.mov(rsi, rsp);
// Move input to the allocated space.
if (src.isYMM()) {
c.vmovdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Ymm*>(&src));
} else {
c.movdqu(ptr[rsp], *reinterpret_cast<const Xbyak::Xmm*>(&src));
}
// Load float count into third parameter.
c.mov(rdx, float_count);
// Load rounding mode into fourth parameter.
// Bit 2 of the control byte selects between MXCSR and the immediate's low two bits.
if (ctrl & 4) {
// Load from MXCSR.RC.
// MXCSR is stored just below rsp and the rounding-control field (bits 13-14) is extracted.
// NOTE(review): the reload targets the 64-bit rcx, so it presumably reads 8 bytes and
// overlaps the first 4 bytes of the input buffer; the shift and mask discard those bits.
c.stmxcsr(ptr[rsp - 4]);
c.mov(rcx, ptr[rsp - 4]);
c.shr(rcx, 13);
c.and_(rcx, 3);
} else {
c.mov(rcx, ctrl & 3);
}
// Call the helper through rax using its absolute address.
c.mov(rax, reinterpret_cast<u64>(PerformVCVTPS2PH));
c.call(rax);
// Clean up after inputs space.
c.add(rsp, byte_count);
if (dst->isXMM()) {
// Load outputs into destination register and clean up space.
c.movdqu(*reinterpret_cast<Xbyak::Xmm*>(dst.get()), ptr[rsp]);
c.add(rsp, byte_count);
}
RestoreContext(c, *dst, true);
}
/// Patch filter that reports whether the "sysctl.proc_translated" sysctl holds a non-zero
/// value. A failed query is treated as false.
static bool FilterRosetta2Only(const ZydisDecodedOperand*) {
    int translated = 0;
    size_t length = sizeof(translated);
    const int status = sysctlbyname("sysctl.proc_translated", &translated, &length, nullptr, 0);
    return status == 0 && translated != 0;
}
#else // __APPLE__
static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { static bool FilterTcbAccess(const ZydisDecodedOperand* operands) {
const auto& dst_op = operands[0]; const auto& dst_op = operands[0];
const auto& src_op = operands[1]; const auto& src_op = operands[1];
@ -657,18 +450,18 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe
#endif #endif
} }
#endif // __APPLE__ static bool FilterNoBMI1(const ZydisDecodedOperand*) {
Cpu cpu;
return !cpu.has(Cpu::tBMI1);
}
#endif // !defined(__APPLE__)
static bool FilterNoSSE4a(const ZydisDecodedOperand*) { static bool FilterNoSSE4a(const ZydisDecodedOperand*) {
Cpu cpu; Cpu cpu;
return !cpu.has(Cpu::tSSE4a); return !cpu.has(Cpu::tSSE4a);
} }
/// Patch filter: true when the host CPU does not report BMI1 support.
static bool FilterNoBMI1(const ZydisDecodedOperand*) {
    const bool has_bmi1 = Cpu().has(Cpu::tBMI1);
    return !has_bmi1;
}
static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE &&
operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE;
@ -940,16 +733,19 @@ struct PatchInfo {
}; };
static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = { static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
// SSE4a
{ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}},
{ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}},
#if !defined(__APPLE__)
// TLS access
#if defined(_WIN32) #if defined(_WIN32)
// Windows needs a trampoline. // Windows needs a trampoline.
{ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}}, {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}},
#elif !defined(__APPLE__) #else
{ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}}, {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}},
#endif #endif
{ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}},
{ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}},
// BMI1 // BMI1
{ZYDIS_MNEMONIC_ANDN, {FilterNoBMI1, GenerateANDN, true}}, {ZYDIS_MNEMONIC_ANDN, {FilterNoBMI1, GenerateANDN, true}},
{ZYDIS_MNEMONIC_BEXTR, {FilterNoBMI1, GenerateBEXTR, true}}, {ZYDIS_MNEMONIC_BEXTR, {FilterNoBMI1, GenerateBEXTR, true}},
@ -957,13 +753,7 @@ static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
{ZYDIS_MNEMONIC_BLSMSK, {FilterNoBMI1, GenerateBLSMSK, true}}, {ZYDIS_MNEMONIC_BLSMSK, {FilterNoBMI1, GenerateBLSMSK, true}},
{ZYDIS_MNEMONIC_BLSR, {FilterNoBMI1, GenerateBLSR, true}}, {ZYDIS_MNEMONIC_BLSR, {FilterNoBMI1, GenerateBLSR, true}},
{ZYDIS_MNEMONIC_TZCNT, {FilterNoBMI1, GenerateTZCNT, true}}, {ZYDIS_MNEMONIC_TZCNT, {FilterNoBMI1, GenerateTZCNT, true}},
#endif // !defined(__APPLE__)
#ifdef __APPLE__
// Patches for instruction sets not supported by Rosetta 2.
// F16C
{ZYDIS_MNEMONIC_VCVTPH2PS, {FilterRosetta2Only, GenerateVCVTPH2PS, true}},
{ZYDIS_MNEMONIC_VCVTPS2PH, {FilterRosetta2Only, GenerateVCVTPS2PH, true}},
#endif
}; };
static std::once_flag init_flag; static std::once_flag init_flag;

View File

@ -7,12 +7,6 @@
namespace Core { namespace Core {
/// Initializes a stack for the current thread for use by patch implementations.
void InitializeThreadPatchStack();
/// Cleans up the patch stack for the current thread.
/// NOTE(review): no definition of this function is visible in this view -- confirm that one
/// exists, or whether the declaration is unused.
void CleanupThreadPatchStack();
/// Registers a module for patching, providing an area to generate trampoline code. /// Registers a module for patching, providing an area to generate trampoline code.
void RegisterPatchModule(void* module_ptr, u64 module_size, void* trampoline_area_ptr, void RegisterPatchModule(void* module_ptr, u64 module_size, void* trampoline_area_ptr,
u64 trampoline_area_size); u64 trampoline_area_size);

View File

@ -5,7 +5,6 @@
#include "common/arch.h" #include "common/arch.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/types.h" #include "common/types.h"
#include "core/cpu_patches.h"
#include "core/libraries/kernel/threads/pthread.h" #include "core/libraries/kernel/threads/pthread.h"
#include "core/tls.h" #include "core/tls.h"
@ -198,9 +197,6 @@ thread_local std::once_flag init_tls_flag;
void EnsureThreadInitialized() { void EnsureThreadInitialized() {
std::call_once(init_tls_flag, [] { std::call_once(init_tls_flag, [] {
#ifdef ARCH_X86_64
InitializeThreadPatchStack();
#endif
SetTcbBase(Libraries::Kernel::g_curthread->tcb); SetTcbBase(Libraries::Kernel::g_curthread->tcb);
}); });
} }