diff --git a/CMakeLists.txt b/CMakeLists.txt index 378b8f78d..84146bb01 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,7 +210,10 @@ set(GNM_LIB src/core/libraries/gnmdriver/gnmdriver.cpp src/core/libraries/gnmdriver/gnm_error.h ) -set(KERNEL_LIB src/core/libraries/kernel/threads/condvar.cpp +set(KERNEL_LIB src/core/libraries/kernel/sync/mutex.cpp + src/core/libraries/kernel/sync/mutex.h + src/core/libraries/kernel/sync/semaphore.h + src/core/libraries/kernel/threads/condvar.cpp src/core/libraries/kernel/threads/event_flag.cpp src/core/libraries/kernel/threads/exception.cpp src/core/libraries/kernel/threads/exception.h @@ -875,6 +878,10 @@ target_link_libraries(shadps4 PRIVATE Boost::headers GPUOpen::VulkanMemoryAlloca target_compile_definitions(shadps4 PRIVATE IMGUI_USER_CONFIG="imgui/imgui_config.h") target_compile_definitions(Dear_ImGui PRIVATE IMGUI_USER_CONFIG="${PROJECT_SOURCE_DIR}/src/imgui/imgui_config.h") +if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + target_compile_definitions(shadps4 PRIVATE ENABLE_USERFAULTFD) +endif() + if (APPLE) option(USE_SYSTEM_VULKAN_LOADER "Enables using the system Vulkan loader instead of directly linking with MoltenVK. Useful for loading validation layers." OFF) if (USE_SYSTEM_VULKAN_LOADER) diff --git a/src/common/ntapi.cpp b/src/common/ntapi.cpp index ffdedb17f..e0ff1cef0 100644 --- a/src/common/ntapi.cpp +++ b/src/common/ntapi.cpp @@ -6,7 +6,6 @@ #include "ntapi.h" NtClose_t NtClose = nullptr; -NtDelayExecution_t NtDelayExecution = nullptr; NtSetInformationFile_t NtSetInformationFile = nullptr; NtCreateThread_t NtCreateThread = nullptr; NtTerminateThread_t NtTerminateThread = nullptr; @@ -18,7 +17,6 @@ void Initialize() { // http://stackoverflow.com/a/31411628/4725495 NtClose = (NtClose_t)GetProcAddress(nt_handle, "NtClose"); - NtDelayExecution = (NtDelayExecution_t)GetProcAddress(nt_handle, "NtDelayExecution"); NtSetInformationFile = (NtSetInformationFile_t)GetProcAddress(nt_handle, "NtSetInformationFile"); NtCreateThread = (NtCreateThread_t)GetProcAddress(nt_handle, "NtCreateThread"); diff --git a/src/common/ntapi.h b/src/common/ntapi.h index 743174061..cb1ba7f1c 100644 --- a/src/common/ntapi.h +++ b/src/common/ntapi.h @@ -408,7 +408,7 @@ typedef struct _TEB { /* win32/win64 */ #ifdef _WIN64 PVOID SystemReserved1[30]; /* /0190 */ #else - PVOID SystemReserved1[26]; /* 10c/ used for krnl386 private data in Wine */ + PVOID SystemReserved1[26]; /* 10c/ */ #endif char PlaceholderCompatibilityMode; /* 174/0280 */ BOOLEAN PlaceholderHydrationAlwaysExplicit; /* 175/0281 */ @@ -430,13 +430,13 @@ typedef struct _TEB { /* win32/win64 */ BYTE SpareBytes1[23]; /* 1b9/ */ ULONG TxFsContext; /* 1d0/ */ #endif - GDI_TEB_BATCH GdiTebBatch; /* 1d4/02f0 used for ntdll private data in Wine */ + GDI_TEB_BATCH GdiTebBatch; /* 1d4/02f0 */ CLIENT_ID RealClientId; /* 6b4/07d8 */ HANDLE GdiCachedProcessHandle; /* 6bc/07e8 */ ULONG GdiClientPID; /* 6c0/07f0 */ ULONG GdiClientTID; /* 6c4/07f4 */ PVOID GdiThreadLocaleInfo; /* 6c8/07f8 */ - ULONG_PTR Win32ClientInfo[62]; /* 6cc/0800 used for user32 private data in Wine */ + ULONG_PTR Win32ClientInfo[62]; /* 6cc/0800 */ PVOID glDispatchTable[233]; /* 7c4/09f0 */ PVOID glReserved1[29]; /* b68/1138 */ PVOID glReserved2; /* bdc/1220 */ @@ -511,8 +511,6 @@ static_assert(offsetof(TEB, DeallocationStack) == typedef u64(__stdcall* NtClose_t)(HANDLE Handle); -typedef u64(__stdcall* NtDelayExecution_t)(BOOL Alertable, PLARGE_INTEGER DelayInterval); - typedef u64(__stdcall* NtSetInformationFile_t)(HANDLE FileHandle, PIO_STATUS_BLOCK IoStatusBlock, PVOID FileInformation, ULONG Length, FILE_INFORMATION_CLASS FileInformationClass); @@ -525,7 +523,6 @@ typedef u64(__stdcall* NtCreateThread_t)(PHANDLE ThreadHandle, ACCESS_MASK Desir typedef u64(__stdcall* NtTerminateThread_t)(HANDLE ThreadHandle, u64 ExitStatus); extern NtClose_t NtClose; -extern NtDelayExecution_t NtDelayExecution; extern NtSetInformationFile_t NtSetInformationFile; extern NtCreateThread_t NtCreateThread; extern NtTerminateThread_t NtTerminateThread; diff --git a/src/common/thread.cpp b/src/common/thread.cpp index 46df68c38..c87aea6ef 100644 --- a/src/common/thread.cpp +++ b/src/common/thread.cpp @@ -147,6 +147,10 @@ void SetCurrentThreadName(const char* name) { SetThreadDescription(GetCurrentThread(), UTF8ToUTF16W(name).data()); } +void SetThreadName(void* thread, const char* name) { + SetThreadDescription(thread, UTF8ToUTF16W(name).data()); +} + #else // !MSVC_VER, so must be POSIX threads // MinGW with the POSIX threading model does not support pthread_setname_np @@ -170,11 +174,19 @@ void SetCurrentThreadName(const char* name) { pthread_setname_np(pthread_self(), name); #endif } + +void SetThreadName(void* thread, const char* name) { + // TODO +} #endif #if defined(_WIN32) void SetCurrentThreadName(const char*) { - // Do Nothing on MingW + // Do Nothing on MinGW +} + +void SetThreadName(void* thread, const char* name) { + // Do Nothing on MinGW } #endif diff --git a/src/common/thread.h b/src/common/thread.h index fd962f8e5..175ba9445 100644 --- a/src/common/thread.h +++ b/src/common/thread.h @@ -23,6 +23,8 @@ void SetCurrentThreadPriority(ThreadPriority new_priority); void SetCurrentThreadName(const char* name); +void SetThreadName(void* thread, const char* name); + class AccurateTimer { std::chrono::nanoseconds target_interval{}; std::chrono::nanoseconds total_wait{}; diff --git a/src/core/devices/logger.cpp b/src/core/devices/logger.cpp index bf5a28382..6f104509c 100644 --- a/src/core/devices/logger.cpp +++ b/src/core/devices/logger.cpp @@ -15,6 +15,7 @@ s64 Logger::write(const void* buf, size_t nbytes) { log(static_cast(buf), nbytes); return nbytes; } + size_t Logger::writev(const Libraries::Kernel::SceKernelIovec* iov, int iovcnt) { for (int i = 0; i < iovcnt; i++) { log(static_cast(iov[i].iov_base), iov[i].iov_len); diff --git a/src/core/libraries/kernel/sync/mutex.cpp b/src/core/libraries/kernel/sync/mutex.cpp new file mode 100644 index 000000000..c5e3eba1d --- /dev/null +++ b/src/core/libraries/kernel/sync/mutex.cpp @@ -0,0 +1,52 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "mutex.h" + +#include "common/assert.h" + +namespace Libraries::Kernel { + +TimedMutex::TimedMutex() { +#ifdef _WIN64 + mtx = CreateMutex(nullptr, false, nullptr); + ASSERT(mtx); +#endif +} + +TimedMutex::~TimedMutex() { +#ifdef _WIN64 + CloseHandle(mtx); +#endif +} + +void TimedMutex::lock() { +#ifdef _WIN64 + for (;;) { + u64 res = WaitForSingleObjectEx(mtx, INFINITE, true); + if (res == WAIT_OBJECT_0) { + return; + } + } +#else + mtx.lock(); +#endif +} + +bool TimedMutex::try_lock() { +#ifdef _WIN64 + return WaitForSingleObjectEx(mtx, 0, true) == WAIT_OBJECT_0; +#else + return mtx.try_lock(); +#endif +} + +void TimedMutex::unlock() { +#ifdef _WIN64 + ReleaseMutex(mtx); +#else + mtx.unlock(); +#endif +} + +} // namespace Libraries::Kernel \ No newline at end of file diff --git a/src/core/libraries/kernel/sync/mutex.h b/src/core/libraries/kernel/sync/mutex.h new file mode 100644 index 000000000..f14a920b4 --- /dev/null +++ b/src/core/libraries/kernel/sync/mutex.h @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/types.h" + +#ifdef _WIN64 +#include +#else +#include +#endif + +namespace Libraries::Kernel { + +class TimedMutex { +public: + TimedMutex(); + ~TimedMutex(); + + void lock(); + bool try_lock(); + + void unlock(); + + template + bool try_lock_for(const std::chrono::duration& rel_time) { +#ifdef _WIN64 + constexpr auto zero = std::chrono::duration::zero(); + const auto now = std::chrono::steady_clock::now(); + + std::chrono::steady_clock::time_point abs_time = now; + if (rel_time > zero) { + constexpr auto max = (std::chrono::steady_clock::time_point::max)(); + if (abs_time < max - rel_time) { + abs_time += rel_time; + } else { + abs_time = max; + } + } + + return try_lock_until(abs_time); +#else + return mtx.try_lock_for(rel_time); +#endif + } + + template + bool try_lock_until(const std::chrono::time_point& abs_time) { +#ifdef _WIN64 + for (;;) { + const auto now = Clock::now(); + if (abs_time <= now) { + return false; + } + + const auto rel_ms = std::chrono::ceil(abs_time - now); + u64 res = WaitForSingleObjectEx(mtx, static_cast(rel_ms.count()), true); + if (res == WAIT_OBJECT_0) { + return true; + } else if (res == WAIT_TIMEOUT) { + return false; + } + } +#else + return mtx.try_lock_until(abs_time); +#endif + } + +private: +#ifdef _WIN64 + HANDLE mtx; +#else + std::timed_mutex mtx; +#endif +}; + +} // namespace Libraries::Kernel \ No newline at end of file diff --git a/src/core/libraries/kernel/sync/semaphore.h b/src/core/libraries/kernel/sync/semaphore.h new file mode 100644 index 000000000..884b08968 --- /dev/null +++ b/src/core/libraries/kernel/sync/semaphore.h @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +#include "common/assert.h" +#include "common/types.h" + +#ifdef _WIN64 +#include +#elif defined(__APPLE__) +#include +#else +#include +#endif + +namespace Libraries::Kernel { + +template +class Semaphore { +public: + Semaphore(s32 initialCount) +#if !defined(_WIN64) && !defined(__APPLE__) + : sem{initialCount} +#endif + { +#ifdef _WIN64 + sem = CreateSemaphore(nullptr, initialCount, max, nullptr); + ASSERT(sem); +#elif defined(__APPLE__) + sem = dispatch_semaphore_create(initialCount); + ASSERT(sem); +#endif + } + + ~Semaphore() { +#ifdef _WIN64 + CloseHandle(sem); +#elif defined(__APPLE__) + dispatch_release(sem); +#endif + } + + void release() { +#ifdef _WIN64 + ReleaseSemaphore(sem, 1, nullptr); +#elif defined(__APPLE__) + dispatch_semaphore_signal(sem); +#else + sem.release(); +#endif + } + + void acquire() { +#ifdef _WIN64 + for (;;) { + u64 res = WaitForSingleObjectEx(sem, INFINITE, true); + if (res == WAIT_OBJECT_0) { + return; + } + } +#elif defined(__APPLE__) + for (;;) { + const auto res = dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER); + if (res == 0) { + return; + } + } +#else + sem.acquire(); +#endif + } + + bool try_acquire() { +#ifdef _WIN64 + return WaitForSingleObjectEx(sem, 0, true) == WAIT_OBJECT_0; +#elif defined(__APPLE__) + return dispatch_semaphore_wait(sem, DISPATCH_TIME_NOW) == 0; +#else + return sem.try_acquire(); +#endif + } + + template + bool try_acquire_for(const std::chrono::duration& rel_time) { +#ifdef _WIN64 + const auto rel_time_ms = std::chrono::ceil(rel_time); + const u64 timeout_ms = static_cast(rel_time_ms.count()); + + if (timeout_ms == 0) { + return false; + } + + return WaitForSingleObjectEx(sem, timeout_ms, true) == WAIT_OBJECT_0; +#elif defined(__APPLE__) + const auto rel_time_ns = std::chrono::ceil(rel_time).count(); + const auto timeout = dispatch_time(DISPATCH_TIME_NOW, rel_time_ns); + return dispatch_semaphore_wait(sem, timeout) == 0; +#else + return sem.try_acquire_for(rel_time); +#endif + } + + template + bool try_acquire_until(const std::chrono::time_point& abs_time) { +#ifdef _WIN64 + const auto now = Clock::now(); + if (now >= abs_time) { + return false; + } + + const auto rel_time = std::chrono::ceil(abs_time - now); + const u64 timeout_ms = static_cast(rel_time.count()); + if (timeout_ms == 0) { + return false; + } + + u64 res = WaitForSingleObjectEx(sem, static_cast(timeout_ms), true); + return res == WAIT_OBJECT_0; +#elif defined(__APPLE__) + auto abs_s = std::chrono::time_point_cast(abs_time); + auto abs_ns = std::chrono::time_point_cast(abs_time) - + std::chrono::time_point_cast(abs_s); + const timespec abs_timespec = { + .tv_sec = abs_s.time_since_epoch().count(), + .tv_nsec = abs_ns.count(), + }; + const auto timeout = dispatch_walltime(&abs_timespec, 0); + return dispatch_semaphore_wait(sem, timeout) == 0; +#else + return sem.try_acquire_until(abs_time); +#endif + } + +private: +#ifdef _WIN64 + HANDLE sem; +#elif defined(__APPLE__) + dispatch_semaphore_t sem; +#else + std::counting_semaphore sem; +#endif +}; + +using BinarySemaphore = Semaphore<1>; +using CountingSemaphore = Semaphore<0x7FFFFFFF /*ORBIS_KERNEL_SEM_VALUE_MAX*/>; + +} // namespace Libraries::Kernel \ No newline at end of file diff --git a/src/core/libraries/kernel/threads/condvar.cpp b/src/core/libraries/kernel/threads/condvar.cpp index cbe8f6ca7..2927899d9 100644 --- a/src/core/libraries/kernel/threads/condvar.cpp +++ b/src/core/libraries/kernel/threads/condvar.cpp @@ -191,7 +191,7 @@ int PthreadCond::Signal() { PthreadMutex* mp = td->mutex_obj; has_user_waiters = SleepqRemove(sq, td); - std::binary_semaphore* waddr = nullptr; + BinarySemaphore* waddr = nullptr; if (mp->m_owner == curthread) { if (curthread->nwaiter_defer >= Pthread::MaxDeferWaiters) { curthread->WakeAll(); @@ -211,7 +211,7 @@ int PthreadCond::Signal() { struct BroadcastArg { Pthread* curthread; - std::binary_semaphore* waddrs[Pthread::MaxDeferWaiters]; + BinarySemaphore* waddrs[Pthread::MaxDeferWaiters]; int count; }; diff --git a/src/core/libraries/kernel/threads/event_flag.cpp b/src/core/libraries/kernel/threads/event_flag.cpp index 39925153c..ce75bed9e 100644 --- a/src/core/libraries/kernel/threads/event_flag.cpp +++ b/src/core/libraries/kernel/threads/event_flag.cpp @@ -118,7 +118,6 @@ public: } m_bits |= bits; - m_cond_var.notify_all(); } diff --git a/src/core/libraries/kernel/threads/pthread.cpp b/src/core/libraries/kernel/threads/pthread.cpp index a562c51b2..b2fe09934 100644 --- a/src/core/libraries/kernel/threads/pthread.cpp +++ b/src/core/libraries/kernel/threads/pthread.cpp @@ -380,6 +380,7 @@ int PS4_SYSV_ABI posix_sched_get_priority_min() { int PS4_SYSV_ABI posix_pthread_rename_np(PthreadT thread, const char* name) { LOG_INFO(Kernel_Pthread, "name = {}", name); + Common::SetThreadName(reinterpret_cast(thread->native_thr.GetHandle()), name); thread->name = name; return ORBIS_OK; } diff --git a/src/core/libraries/kernel/threads/pthread.h b/src/core/libraries/kernel/threads/pthread.h index 9d71c75e8..456c2ef37 100644 --- a/src/core/libraries/kernel/threads/pthread.h +++ b/src/core/libraries/kernel/threads/pthread.h @@ -11,6 +11,8 @@ #include #include "common/enum.h" +#include "core/libraries/kernel/sync/mutex.h" +#include "core/libraries/kernel/sync/semaphore.h" #include "core/libraries/kernel/time.h" #include "core/thread.h" #include "core/tls.h" @@ -44,7 +46,7 @@ enum class PthreadMutexProt : u32 { }; struct PthreadMutex { - std::timed_mutex m_lock; + TimedMutex m_lock; PthreadMutexFlags m_flags; Pthread* m_owner; int m_count; @@ -288,14 +290,14 @@ struct Pthread { int report_events; int event_mask; std::string name; - std::binary_semaphore wake_sema{0}; + BinarySemaphore wake_sema{0}; SleepQueue* sleepqueue; void* wchan; PthreadMutex* mutex_obj; bool will_sleep; bool has_user_waiters; int nwaiter_defer; - std::binary_semaphore* defer_waiters[MaxDeferWaiters]; + BinarySemaphore* defer_waiters[MaxDeferWaiters]; bool InCritical() const noexcept { return locklevel > 0 || critical_count > 0; diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index e3c7e9092..5aa04f251 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ b/src/core/libraries/kernel/threads/semaphore.cpp @@ -6,6 +6,8 @@ #include #include +#include "core/libraries/kernel/sync/semaphore.h" + #include "common/logging/log.h" #include "core/libraries/kernel/kernel.h" #include "core/libraries/kernel/orbis_error.h" @@ -21,7 +23,7 @@ constexpr int ORBIS_KERNEL_SEM_VALUE_MAX = 0x7FFFFFFF; struct PthreadSem { explicit PthreadSem(s32 value_) : semaphore{value_}, value{value_} {} - std::counting_semaphore semaphore; + CountingSemaphore semaphore; std::atomic value; }; @@ -75,7 +77,7 @@ public: it = wait_list.erase(it); token_count -= waiter->need_count; waiter->was_signaled = true; - waiter->cv.notify_one(); + waiter->sem.release(); } return true; @@ -88,7 +90,7 @@ public: } for (auto* waiter : wait_list) { waiter->was_cancled = true; - waiter->cv.notify_one(); + waiter->sem.release(); } wait_list.clear(); token_count = set_count < 0 ? init_count : set_count; @@ -99,21 +101,21 @@ public: std::scoped_lock lk{mutex}; for (auto* waiter : wait_list) { waiter->was_deleted = true; - waiter->cv.notify_one(); + waiter->sem.release(); } wait_list.clear(); } public: struct WaitingThread { - std::condition_variable cv; + BinarySemaphore sem; u32 priority; s32 need_count; bool was_signaled{}; bool was_deleted{}; bool was_cancled{}; - explicit WaitingThread(s32 need_count, bool is_fifo) : need_count{need_count} { + explicit WaitingThread(s32 need_count, bool is_fifo) : sem{0}, need_count{need_count} { // Retrieve calling thread priority for sorting into waiting threads list. if (!is_fifo) { priority = g_curthread->attr.prio; @@ -134,24 +136,26 @@ public: } int Wait(std::unique_lock& lk, u32* timeout) { + lk.unlock(); if (!timeout) { // Wait indefinitely until we are woken up. - cv.wait(lk); + sem.acquire(); + lk.lock(); return GetResult(false); } // Wait until timeout runs out, recording how much remaining time there was. const auto start = std::chrono::high_resolution_clock::now(); - const auto signaled = cv.wait_for(lk, std::chrono::microseconds(*timeout), - [this] { return was_signaled; }); + sem.try_acquire_for(std::chrono::microseconds(*timeout)); const auto end = std::chrono::high_resolution_clock::now(); const auto time = std::chrono::duration_cast(end - start).count(); - if (signaled) { + lk.lock(); + if (was_signaled) { *timeout -= time; } else { *timeout = 0; } - return GetResult(!signaled); + return GetResult(!was_signaled); } }; diff --git a/src/core/libraries/kernel/time.cpp b/src/core/libraries/kernel/time.cpp index b586431ab..2565b8078 100644 --- a/src/core/libraries/kernel/time.cpp +++ b/src/core/libraries/kernel/time.cpp @@ -52,7 +52,22 @@ u64 PS4_SYSV_ABI sceKernelReadTsc() { int PS4_SYSV_ABI sceKernelUsleep(u32 microseconds) { #ifdef _WIN64 - std::this_thread::sleep_for(std::chrono::microseconds(microseconds)); + const auto start_time = std::chrono::high_resolution_clock::now(); + auto total_wait_time = std::chrono::microseconds(microseconds); + + while (total_wait_time.count() > 0) { + auto wait_time = std::chrono::ceil(total_wait_time).count(); + u64 res = SleepEx(static_cast(wait_time), true); + if (res == WAIT_IO_COMPLETION) { + auto elapsedTime = std::chrono::high_resolution_clock::now() - start_time; + auto elapsedMicroseconds = + std::chrono::duration_cast(elapsedTime).count(); + total_wait_time = std::chrono::microseconds(microseconds - elapsedMicroseconds); + } else { + break; + } + } + return 0; #else timespec start; diff --git a/src/core/module.cpp b/src/core/module.cpp index ef34f25c1..70afb932c 100644 --- a/src/core/module.cpp +++ b/src/core/module.cpp @@ -470,8 +470,8 @@ OrbisKernelModuleInfoEx Module::GetModuleInfoEx() const { .tls_align = tls.align, .init_proc_addr = base_virtual_addr + dynamic_info.init_virtual_addr, .fini_proc_addr = base_virtual_addr + dynamic_info.fini_virtual_addr, - .eh_frame_hdr_addr = eh_frame_hdr_addr, - .eh_frame_addr = eh_frame_addr, + .eh_frame_hdr_addr = base_virtual_addr + eh_frame_hdr_addr, + .eh_frame_addr = base_virtual_addr + eh_frame_addr, .eh_frame_hdr_size = eh_frame_hdr_size, .eh_frame_size = eh_frame_size, .segments = info.segments, diff --git a/src/emulator.cpp b/src/emulator.cpp index 60d6e18d7..8a7c04cf4 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -266,7 +266,7 @@ void Emulator::Run(const std::filesystem::path& file) { } void Emulator::LoadSystemModules(const std::filesystem::path& file, std::string game_serial) { - constexpr std::array ModulesToLoad{ + constexpr std::array ModulesToLoad{ {{"libSceNgs2.sprx", &Libraries::Ngs2::RegisterlibSceNgs2}, {"libSceFiber.sprx", &Libraries::Fiber::RegisterlibSceFiber}, {"libSceUlt.sprx", nullptr}, diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index d8c0a17bd..b578f0c52 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -326,7 +326,9 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) { Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto& buffer = ctx.texture_buffers[handle]; const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id); - const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset); + const Id coord = + ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift), + buffer.coord_offset); Id texel = buffer.is_storage ? ctx.OpImageRead(buffer.result_type, tex_buffer, coord) : ctx.OpImageFetch(buffer.result_type, tex_buffer, coord); if (buffer.is_integer) { @@ -372,7 +374,9 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { const auto& buffer = ctx.texture_buffers[handle]; const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id); - const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset); + const Id coord = + ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift), + buffer.coord_offset); if (buffer.is_integer) { value = ctx.OpBitcast(buffer.result_type, value); } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 4ce9f4221..5c7278c6b 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -207,6 +207,8 @@ void EmitContext::DefineBufferOffsets() { push_data_block, ConstU32(half), ConstU32(comp))}; const Id value{OpLoad(U32[1], ptr)}; tex_buffer.coord_offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(6U)); + tex_buffer.coord_shift = + OpBitFieldUExtract(U32[1], value, ConstU32(offset + 6U), ConstU32(2U)); Name(tex_buffer.coord_offset, fmt::format("texbuf{}_off", binding)); } } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 1c5da946d..4e5e7dd3b 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -223,6 +223,7 @@ public: struct TextureBufferDefinition { Id id; Id coord_offset; + Id coord_shift; u32 binding; Id image_type; Id result_type; diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h index ee9f5c805..080b0eb22 100644 --- a/src/shader_recompiler/frontend/fetch_shader.h +++ b/src/shader_recompiler/frontend/fetch_shader.h @@ -58,19 +58,6 @@ struct FetchShaderData { }) != attributes.end(); } - [[nodiscard]] std::pair GetDrawOffsets(const AmdGpu::Liverpool::Regs& regs, - const Info& info) const { - u32 vertex_offset = regs.index_offset; - u32 instance_offset = 0; - if (vertex_offset == 0 && vertex_offset_sgpr != -1) { - vertex_offset = info.user_data[vertex_offset_sgpr]; - } - if (instance_offset_sgpr != -1) { - instance_offset = info.user_data[instance_offset_sgpr]; - } - return {vertex_offset, instance_offset}; - } - bool operator==(const FetchShaderData& other) const { return attributes == other.attributes && vertex_offset_sgpr == other.vertex_offset_sgpr && instance_offset_sgpr == other.instance_offset_sgpr; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index de8b9da87..75ad957b3 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -50,6 +50,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) { return S_OR_B64(NegateMode::None, false, inst); case Opcode::S_XOR_B32: return S_XOR_B32(inst); + case Opcode::S_NOT_B32: + return S_NOT_B32(inst); case Opcode::S_XOR_B64: return S_OR_B64(NegateMode::None, true, inst); case Opcode::S_ANDN2_B32: @@ -94,6 +96,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) { return S_BREV_B32(inst); case Opcode::S_BCNT1_I32_B64: return S_BCNT1_I32_B64(inst); + case Opcode::S_FF1_I32_B64: + return S_FF1_I32_B64(inst); case Opcode::S_AND_SAVEEXEC_B64: return S_SAVEEXEC_B64(NegateMode::None, false, inst); case Opcode::S_ORN2_SAVEEXEC_B64: @@ -301,6 +305,10 @@ void Translator::S_AND_B64(NegateMode negate, const GcnInst& inst) { ASSERT_MSG(-s32(operand.code) + SignedConstIntNegMin - 1 == -1, "SignedConstIntNeg must be -1"); return ir.Imm1(true); + case OperandField::LiteralConst: + ASSERT_MSG(operand.code == 0 || operand.code == std::numeric_limits::max(), + "Unsupported literal {:#x}", operand.code); + return ir.Imm1(operand.code & 1); default: UNREACHABLE(); } @@ -382,6 +390,13 @@ void Translator::S_XOR_B32(const GcnInst& inst) { ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); } +void Translator::S_NOT_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 result{ir.BitwiseNot(src0)}; + SetDst(inst.dst[0], result); + ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); +} + void Translator::S_LSHL_B32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; @@ -560,6 +575,12 @@ void Translator::S_BCNT1_I32_B64(const GcnInst& inst) { ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); } +void Translator::S_FF1_I32_B64(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 result{ir.Select(ir.IEqual(src0, ir.Imm32(0U)), ir.Imm32(-1), ir.FindILsb(src0))}; + SetDst(inst.dst[0], result); +} + void Translator::S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst) { // This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs) // However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 3b89372bd..dd379d8ea 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -96,6 +96,7 @@ public: void S_MUL_I32(const GcnInst& inst); void S_BFE_U32(const GcnInst& inst); void S_ABSDIFF_I32(const GcnInst& inst); + void S_NOT_B32(const GcnInst& inst); // SOPK void S_MOVK(const GcnInst& inst); @@ -109,6 +110,7 @@ public: void S_NOT_B64(const GcnInst& inst); void S_BREV_B32(const GcnInst& inst); void S_BCNT1_I32_B64(const GcnInst& inst); + void S_FF1_I32_B64(const GcnInst& inst); void S_GETPC_B64(u32 pc, const GcnInst& inst); void S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index d382d0e7c..494bbb4bb 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -105,6 +105,11 @@ struct PushData { ASSERT(offset < 256 && binding < buf_offsets.size()); buf_offsets[binding] = offset; } + + void AddTexelOffset(u32 binding, u32 multiplier, u32 texel_offset) { + ASSERT(texel_offset < 64 && multiplier < 16); + buf_offsets[binding] = texel_offset | ((std::bit_width(multiplier) - 1) << 6); + } }; static_assert(sizeof(PushData) <= 128, "PushData size is greater than minimum size guaranteed by Vulkan spec"); diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index 740b89dda..2a3bd62f4 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -9,7 +9,6 @@ #include "frontend/fetch_shader.h" #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/info.h" -#include "shader_recompiler/ir/passes/srt.h" namespace Shader { @@ -22,8 +21,12 @@ struct VsAttribSpecialization { struct BufferSpecialization { u16 stride : 14; u16 is_storage : 1; + u32 size = 0; - auto operator<=>(const BufferSpecialization&) const = default; + bool operator==(const BufferSpecialization& other) const { + return stride == other.stride && is_storage == other.is_storage && + (size >= other.is_storage || is_storage); + } }; struct TextureBufferSpecialization { @@ -57,7 +60,7 @@ struct StageSpecialization { const Shader::Info* info; RuntimeInfo runtime_info; - Gcn::FetchShaderData fetch_shader_data{}; + std::optional fetch_shader_data{}; boost::container::small_vector vs_attribs; std::bitset bitset{}; boost::container::small_vector buffers; @@ -69,15 +72,14 @@ struct StageSpecialization { explicit StageSpecialization(const Info& info_, RuntimeInfo runtime_info_, const Profile& profile_, Backend::Bindings start_) : info{&info_}, runtime_info{runtime_info_}, start{start_} { - if (const auto fetch_shader = Gcn::ParseFetchShader(info_)) { - fetch_shader_data = *fetch_shader; - if (info_.stage == Stage::Vertex && !profile_.support_legacy_vertex_attributes) { - // Specialize shader on VS input number types to follow spec. - ForEachSharp(vs_attribs, fetch_shader_data.attributes, - [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { - spec.num_class = AmdGpu::GetNumberClass(sharp.GetNumberFmt()); - }); - } + fetch_shader_data = Gcn::ParseFetchShader(info_); + if (info_.stage == Stage::Vertex && fetch_shader_data && + !profile_.support_legacy_vertex_attributes) { + // Specialize shader on VS input number types to follow spec. + ForEachSharp(vs_attribs, fetch_shader_data->attributes, + [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { + spec.num_class = AmdGpu::GetNumberClass(sharp.GetNumberFmt()); + }); } u32 binding{}; if (info->has_readconst) { @@ -87,6 +89,9 @@ struct StageSpecialization { [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { spec.stride = sharp.GetStride(); spec.is_storage = desc.IsStorage(sharp); + if (!spec.is_storage) { + spec.size = sharp.GetSize(); + } }); ForEachSharp(binding, tex_buffers, info->texture_buffers, [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 1bbd77f82..a4eae8e7a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -163,8 +163,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { } case PM4ItOpcode::IndirectBufferConst: { const auto* indirect_buffer = reinterpret_cast(header); - auto task = ProcessCeUpdate( - {indirect_buffer->Address(), indirect_buffer->ib_size}); + auto task = + ProcessCeUpdate({indirect_buffer->Address(), indirect_buffer->ib_size}); while (!task.handle.done()) { task.handle.resume(); @@ -565,7 +565,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); - if (dma_data->dst_addr_lo == 0x3022C) { + if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) { break; } if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) { @@ -700,7 +700,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { } case PM4ItOpcode::DmaData: { const auto* dma_data = reinterpret_cast(header); - if (dma_data->dst_addr_lo == 0x3022C) { + if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) { break; } if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) { diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index b1bf77f8a..3dab95db7 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -42,7 +42,7 @@ public: struct Traits { using Entry = BufferId; - static constexpr size_t AddressSpaceBits = 39; + static constexpr size_t AddressSpaceBits = 40; static constexpr size_t FirstLevelBits = 14; static constexpr size_t PageBits = CACHING_PAGEBITS; }; diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h index 375701c4c..a59bcfff5 100644 --- a/src/video_core/buffer_cache/memory_tracker_base.h +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -14,7 +14,7 @@ namespace VideoCore { class MemoryTracker { public: - static constexpr size_t MAX_CPU_PAGE_BITS = 39; + static constexpr size_t MAX_CPU_PAGE_BITS = 40; static constexpr size_t HIGHER_PAGE_BITS = 22; static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp index d26a7067a..80b91b825 100644 --- a/src/video_core/page_manager.cpp +++ b/src/video_core/page_manager.cpp @@ -29,7 +29,7 @@ namespace VideoCore { constexpr size_t PAGESIZE = 4_KB; constexpr size_t PAGEBITS = 12; -#if ENABLE_USERFAULTFD +#ifdef ENABLE_USERFAULTFD struct PageManager::Impl { Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} { uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 47713f0ff..82a029b95 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -279,6 +279,8 @@ bool PipelineCache::RefreshGraphicsKey() { ++remapped_cb; } + fetch_shader = std::nullopt; + Shader::Backend::Bindings binding{}; const auto& TryBindStageRemap = [&](Shader::Stage stage_in, Shader::Stage stage_out) -> bool { const auto stage_in_idx = static_cast(stage_in); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 620e5f103..4e858c0d3 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -171,6 +171,22 @@ RenderState Rasterizer::PrepareRenderState(u32 mrt_mask) { return state; } +[[nodiscard]] std::pair GetDrawOffsets( + const AmdGpu::Liverpool::Regs& regs, const Shader::Info& info, + const std::optional& fetch_shader) { + u32 vertex_offset = regs.index_offset; + u32 instance_offset = 0; + if (fetch_shader) { + if (vertex_offset == 0 && fetch_shader->vertex_offset_sgpr != -1) { + vertex_offset = info.user_data[fetch_shader->vertex_offset_sgpr]; + } + if (fetch_shader->instance_offset_sgpr != -1) { + instance_offset = info.user_data[fetch_shader->instance_offset_sgpr]; + } + } + return {vertex_offset, instance_offset}; +} + void Rasterizer::Draw(bool is_indexed, u32 index_offset) { RENDERER_TRACE; @@ -198,7 +214,7 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) { BeginRendering(*pipeline, state); UpdateDynamicState(*pipeline); - const auto [vertex_offset, instance_offset] = fetch_shader->GetDrawOffsets(regs, vs_info); + const auto [vertex_offset, instance_offset] = GetDrawOffsets(regs, vs_info, fetch_shader); const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle()); @@ -532,12 +548,13 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( vsharp.base_address, vsharp.GetSize(), desc.is_written, true, buffer_id); const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3; - ASSERT_MSG(fmt_stride == vsharp.GetStride(), + const u32 buf_stride = vsharp.GetStride(); + ASSERT_MSG(buf_stride % fmt_stride == 0, "Texel buffer stride must match format stride"); const u32 offset_aligned = Common::AlignDown(offset, alignment); const u32 adjust = offset - offset_aligned; ASSERT(adjust % fmt_stride == 0); - push_data.AddOffset(binding.buffer, adjust / fmt_stride); + push_data.AddTexelOffset(binding.buffer, buf_stride / fmt_stride, adjust / fmt_stride); buffer_view = vk_buffer->View(offset_aligned, vsharp.GetSize() + adjust, desc.is_written, vsharp.GetDataFmt(), vsharp.GetNumberFmt()); diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 7430168d0..9823cb4dc 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -392,7 +392,8 @@ std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o const auto* detiler = GetDetiler(image); if (!detiler) { if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled && - image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled) { + image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled && + image.info.tiling_mode != AmdGpu::TilingMode::Depth_MacroTiled) { LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})", vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode)); }