From fd26ad65774afa8b510574422a3d186b99c53335 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Thu, 3 Apr 2025 12:13:45 -0700 Subject: [PATCH] cpu_patches: Remove BMI1 patches These are now only good for very old Intel CPUs that: * Still do not currently function due to other CPU instruction issues. * Will probably be too slow to run shadPS4 well. --- src/core/cpu_patches.cpp | 362 +-------------------------------------- src/core/tls.cpp | 4 +- 2 files changed, 3 insertions(+), 363 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 118485228..c8106b270 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -77,335 +77,6 @@ static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& opera return ptr[expression]; } -static u64 ZydisToXbyakImmediateOperand(const ZydisDecodedOperand& operand) { - ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_IMMEDIATE, - "Expected immediate operand, got type: {}", static_cast(operand.type)); - return operand.imm.value.u; -} - -static std::unique_ptr ZydisToXbyakOperand(const ZydisDecodedOperand& operand) { - switch (operand.type) { - case ZYDIS_OPERAND_TYPE_REGISTER: { - return std::make_unique(ZydisToXbyakRegisterOperand(operand)); - } - case ZYDIS_OPERAND_TYPE_MEMORY: { - return std::make_unique(ZydisToXbyakMemoryOperand(operand)); - } - default: - UNREACHABLE_MSG("Unsupported operand type: {}", static_cast(operand.type)); - } -} - -static bool OperandUsesRegister(const Xbyak::Operand* operand, int index) { - if (operand->isREG()) { - return operand->getIdx() == index; - } - if (operand->isMEM()) { - const Xbyak::RegExp& reg_exp = operand->getAddress().getRegExp(); - return reg_exp.getBase().getIdx() == index || reg_exp.getIndex().getIdx() == index; - } - UNREACHABLE_MSG("Unsupported operand kind: {}", static_cast(operand->getKind())); -} - -static bool IsRegisterAllocated( - const std::initializer_list& allocated_registers, const int index) { - return std::ranges::find_if(allocated_registers.begin(), allocated_registers.end(), - [index](const Xbyak::Operand* operand) { - return OperandUsesRegister(operand, index); - }) != allocated_registers.end(); -} - -static Xbyak::Reg AllocateScratchRegister( - const std::initializer_list allocated_registers, const u32 bits) { - for (int index = Xbyak::Operand::R8; index <= Xbyak::Operand::R15; index++) { - if (!IsRegisterAllocated(allocated_registers, index)) { - return Xbyak::Reg32e(index, static_cast(bits)); - } - } - UNREACHABLE_MSG("Out of scratch registers!"); -} - -#if !defined(__APPLE__) - -// NOTE: Since stack pointer here is subtracted through safe zone and not saved anywhere, -// it must not be modified during the instruction. Otherwise, we will not be able to find -// and load registers back from where they were saved. Thus, a limitation is placed on -// instructions, that they must not use the stack pointer register as a destination. - -/// Saves the stack pointer to thread local storage and loads the patch stack. -static void SaveStack(Xbyak::CodeGenerator& c) { - c.lea(rsp, ptr[rsp - 128]); // red zone -} - -/// Restores the stack pointer from thread local storage. -static void RestoreStack(Xbyak::CodeGenerator& c) { - c.lea(rsp, ptr[rsp + 128]); // red zone -} - -/// Validates that the dst register is supported given the SaveStack/RestoreStack implementation. -static void ValidateDst(const Xbyak::Reg& dst) { - // Stack pointer is not preserved, so it can't be used as a dst. - ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "Stack pointer not supported as destination."); -} - -/// Switches to the patch stack, saves registers, and restores the original stack. -static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { - // Uses a more robust solution for saving registers on MacOS to avoid potential stack corruption - // if games decide to not follow the ABI and use the red zone. - SaveStack(c); - for (const auto& reg : regs) { - c.push(reg.cvt64()); - } - RestoreStack(c); -} - -/// Switches to the patch stack, restores registers, and restores the original stack. -static void RestoreRegisters(Xbyak::CodeGenerator& c, - const std::initializer_list regs) { - SaveStack(c); - for (const auto& reg : regs) { - c.pop(reg.cvt64()); - } - RestoreStack(c); -} - -/// Switches to the patch stack and stores all registers. -static void SaveContext(Xbyak::CodeGenerator& c, bool save_flags = false) { - SaveStack(c); - for (int reg = Xbyak::Operand::RAX; reg <= Xbyak::Operand::R15; reg++) { - c.push(Xbyak::Reg64(reg)); - } - c.lea(rsp, ptr[rsp - 32 * 16]); - for (int reg = 0; reg <= 15; reg++) { - c.vmovdqu(ptr[rsp + 32 * reg], Xbyak::Ymm(reg)); - } - if (save_flags) { - c.pushfq(); - } -} - -/// Restores all registers and restores the original stack. -/// If the destination is a register, it is not restored to preserve the output. -static void RestoreContext(Xbyak::CodeGenerator& c, const Xbyak::Operand& dst, - bool restore_flags = false) { - if (restore_flags) { - c.popfq(); - } - for (int reg = 15; reg >= 0; reg--) { - if ((!dst.isXMM() && !dst.isYMM()) || dst.getIdx() != reg) { - c.vmovdqu(Xbyak::Ymm(reg), ptr[rsp + 32 * reg]); - } - } - c.lea(rsp, ptr[rsp + 32 * 16]); - for (int reg = Xbyak::Operand::R15; reg >= Xbyak::Operand::RAX; reg--) { - if (!dst.isREG() || dst.getIdx() != reg) { - c.pop(Xbyak::Reg64(reg)); - } else { - c.lea(rsp, ptr[rsp + 8]); - } - } - RestoreStack(c); -} - -static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); - const auto src2 = ZydisToXbyakOperand(operands[2]); - ValidateDst(dst); - - // Check if src2 is a memory operand or a register different to dst. - // In those cases, we don't need to use a temporary register and are free to modify dst. - // In cases where dst and src2 are the same register, a temporary needs to be used to avoid - // modifying src2. - bool src2_uses_dst = false; - if (src2->isMEM()) { - const auto base = src2->getAddress().getRegExp().getBase().getIdx(); - const auto index = src2->getAddress().getRegExp().getIndex().getIdx(); - src2_uses_dst = base == dst.getIdx() || index == dst.getIdx(); - } else { - ASSERT(src2->isREG()); - src2_uses_dst = src2->getReg() == dst; - } - - if (!src2_uses_dst) { - if (dst != src1) - c.mov(dst, src1); - c.not_(dst); - c.and_(dst, *src2); - } else { - const auto scratch = AllocateScratchRegister({&dst, &src1, src2.get()}, dst.getBit()); - - SaveRegisters(c, {scratch}); - - c.mov(scratch, src1); - c.not_(scratch); - c.and_(scratch, *src2); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); - } -} - -static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - const auto start_len = ZydisToXbyakRegisterOperand(operands[2]); - ValidateDst(dst); - - const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast(start_len.getBit())); - const auto scratch1 = - AllocateScratchRegister({&dst, src.get(), &start_len, &shift}, dst.getBit()); - const auto scratch2 = - AllocateScratchRegister({&dst, src.get(), &start_len, &shift, &scratch1}, dst.getBit()); - - if (dst.getIdx() == shift.getIdx()) { - SaveRegisters(c, {scratch1, scratch2}); - } else { - SaveRegisters(c, {scratch1, scratch2, shift}); - } - - c.mov(scratch1, *src); - if (shift.getIdx() != start_len.getIdx()) { - c.mov(shift, start_len); - } - - c.shr(scratch1, shift.cvt8()); - c.shr(shift, 8); - c.mov(scratch2, 1); - c.shl(scratch2, shift.cvt8()); - c.dec(scratch2); - - c.mov(dst, scratch1); - c.and_(dst, scratch2); - - if (dst.getIdx() == shift.getIdx()) { - RestoreRegisters(c, {scratch2, scratch1}); - } else { - RestoreRegisters(c, {shift, scratch2, scratch1}); - } -} - -static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - ValidateDst(dst); - - const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); - - SaveRegisters(c, {scratch}); - - // BLSI sets CF to zero if source is zero, otherwise it sets CF to one. - Xbyak::Label clear_carry, end; - - c.mov(scratch, *src); - c.neg(scratch); // NEG, like BLSI, clears CF if the source is zero and sets it otherwise - c.jnc(clear_carry); - - c.and_(scratch, *src); - c.stc(); // setting/clearing carry needs to happen after the AND because that clears CF - c.jmp(end); - - c.L(clear_carry); - c.and_(scratch, *src); - // We don't need to clear carry here since AND does that for us - - c.L(end); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); -} - -static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - ValidateDst(dst); - - const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); - - SaveRegisters(c, {scratch}); - - Xbyak::Label clear_carry, end; - - // BLSMSK sets CF to zero if source is NOT zero, otherwise it sets CF to one. - c.mov(scratch, *src); - c.test(scratch, scratch); - c.jnz(clear_carry); - - c.dec(scratch); - c.xor_(scratch, *src); - c.stc(); - c.jmp(end); - - c.L(clear_carry); - c.dec(scratch); - c.xor_(scratch, *src); - // We don't need to clear carry here since XOR does that for us - - c.L(end); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); -} - -static void GenerateTZCNT(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - ValidateDst(dst); - - Xbyak::Label src_zero, end; - - c.cmp(*src, 0); - c.je(src_zero); - - // If src is not zero, functions like a BSF, but also clears the CF - c.bsf(dst, *src); - c.clc(); - c.jmp(end); - - c.L(src_zero); - c.mov(dst, operands[0].size); - // Since dst is not zero, also set ZF to zero. Testing dst with itself when we know - // it isn't zero is a good way to do this. - // Use cvt32 to avoid REX/Operand size prefixes. - c.test(dst.cvt32(), dst.cvt32()); - // When source is zero, TZCNT also sets CF. - c.stc(); - - c.L(end); -} - -static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { - const auto dst = ZydisToXbyakRegisterOperand(operands[0]); - const auto src = ZydisToXbyakOperand(operands[1]); - ValidateDst(dst); - - const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); - - SaveRegisters(c, {scratch}); - - Xbyak::Label clear_carry, end; - - // BLSR sets CF to zero if source is NOT zero, otherwise it sets CF to one. - c.mov(scratch, *src); - c.test(scratch, scratch); - c.jnz(clear_carry); - - c.dec(scratch); - c.and_(scratch, *src); - c.stc(); - c.jmp(end); - - c.L(clear_carry); - c.dec(scratch); - c.and_(scratch, *src); - // We don't need to clear carry here since AND does that for us - - c.L(end); - c.mov(dst, scratch); - - RestoreRegisters(c, {scratch}); -} - static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { const auto& dst_op = operands[0]; const auto& src_op = operands[1]; @@ -450,13 +121,6 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe #endif } -static bool FilterNoBMI1(const ZydisDecodedOperand*) { - Cpu cpu; - return !cpu.has(Cpu::tBMI1); -} - -#endif // !defined(__APPLE__) - static bool FilterNoSSE4a(const ZydisDecodedOperand*) { Cpu cpu; return !cpu.has(Cpu::tSSE4a); @@ -737,23 +401,12 @@ static const std::unordered_map Patches = { {ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}}, {ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}}, -#if !defined(__APPLE__) - // TLS access #if defined(_WIN32) // Windows needs a trampoline. {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}}, -#else +#elif !defined(__APPLE__) {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}}, #endif - - // BMI1 - {ZYDIS_MNEMONIC_ANDN, {FilterNoBMI1, GenerateANDN, true}}, - {ZYDIS_MNEMONIC_BEXTR, {FilterNoBMI1, GenerateBEXTR, true}}, - {ZYDIS_MNEMONIC_BLSI, {FilterNoBMI1, GenerateBLSI, true}}, - {ZYDIS_MNEMONIC_BLSMSK, {FilterNoBMI1, GenerateBLSMSK, true}}, - {ZYDIS_MNEMONIC_BLSR, {FilterNoBMI1, GenerateBLSR, true}}, - {ZYDIS_MNEMONIC_TZCNT, {FilterNoBMI1, GenerateTZCNT, true}}, -#endif // !defined(__APPLE__) }; static std::once_flag init_flag; @@ -1070,18 +723,7 @@ void RegisterPatchModule(void* module_ptr, u64 module_size, void* trampoline_are } void PrePatchInstructions(u64 segment_addr, u64 segment_size) { -#if defined(__APPLE__) - // HACK: For some reason patching in the signal handler at the start of a page does not work - // under Rosetta 2. Patch any instructions at the start of a page ahead of time. - if (!Patches.empty()) { - auto* code_page = reinterpret_cast(Common::AlignUp(segment_addr, 0x1000)); - const auto* end_page = code_page + Common::AlignUp(segment_size, 0x1000); - while (code_page < end_page) { - TryPatchJit(code_page); - code_page += 0x1000; - } - } -#elif !defined(_WIN32) +#if !defined(_WIN32) && !defined(__APPLE__) // Linux and others have an FS segment pointing to valid memory, so continue to do full // ahead-of-time patching for now until a better solution is worked out. if (!Patches.empty()) { diff --git a/src/core/tls.cpp b/src/core/tls.cpp index 5dd05b4a0..e13c683e1 100644 --- a/src/core/tls.cpp +++ b/src/core/tls.cpp @@ -196,9 +196,7 @@ Tcb* GetTcbBase() { thread_local std::once_flag init_tls_flag; void EnsureThreadInitialized() { - std::call_once(init_tls_flag, [] { - SetTcbBase(Libraries::Kernel::g_curthread->tcb); - }); + std::call_once(init_tls_flag, [] { SetTcbBase(Libraries::Kernel::g_curthread->tcb); }); } } // namespace Core