Merge remote-tracking branch 'upstream/main'

This commit is contained in:
rainmakerv2 2024-12-07 06:51:29 +08:00
commit e65c2f6f6f
32 changed files with 437 additions and 69 deletions

View File

@ -210,7 +210,10 @@ set(GNM_LIB src/core/libraries/gnmdriver/gnmdriver.cpp
src/core/libraries/gnmdriver/gnm_error.h
)
set(KERNEL_LIB src/core/libraries/kernel/threads/condvar.cpp
set(KERNEL_LIB src/core/libraries/kernel/sync/mutex.cpp
src/core/libraries/kernel/sync/mutex.h
src/core/libraries/kernel/sync/semaphore.h
src/core/libraries/kernel/threads/condvar.cpp
src/core/libraries/kernel/threads/event_flag.cpp
src/core/libraries/kernel/threads/exception.cpp
src/core/libraries/kernel/threads/exception.h
@ -875,6 +878,10 @@ target_link_libraries(shadps4 PRIVATE Boost::headers GPUOpen::VulkanMemoryAlloca
target_compile_definitions(shadps4 PRIVATE IMGUI_USER_CONFIG="imgui/imgui_config.h")
target_compile_definitions(Dear_ImGui PRIVATE IMGUI_USER_CONFIG="${PROJECT_SOURCE_DIR}/src/imgui/imgui_config.h")
if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
target_compile_definitions(shadps4 PRIVATE ENABLE_USERFAULTFD)
endif()
if (APPLE)
option(USE_SYSTEM_VULKAN_LOADER "Enables using the system Vulkan loader instead of directly linking with MoltenVK. Useful for loading validation layers." OFF)
if (USE_SYSTEM_VULKAN_LOADER)

View File

@ -6,7 +6,6 @@
#include "ntapi.h"
NtClose_t NtClose = nullptr;
NtDelayExecution_t NtDelayExecution = nullptr;
NtSetInformationFile_t NtSetInformationFile = nullptr;
NtCreateThread_t NtCreateThread = nullptr;
NtTerminateThread_t NtTerminateThread = nullptr;
@ -18,7 +17,6 @@ void Initialize() {
// http://stackoverflow.com/a/31411628/4725495
NtClose = (NtClose_t)GetProcAddress(nt_handle, "NtClose");
NtDelayExecution = (NtDelayExecution_t)GetProcAddress(nt_handle, "NtDelayExecution");
NtSetInformationFile =
(NtSetInformationFile_t)GetProcAddress(nt_handle, "NtSetInformationFile");
NtCreateThread = (NtCreateThread_t)GetProcAddress(nt_handle, "NtCreateThread");

View File

@ -408,7 +408,7 @@ typedef struct _TEB { /* win32/win64 */
#ifdef _WIN64
PVOID SystemReserved1[30]; /* /0190 */
#else
PVOID SystemReserved1[26]; /* 10c/ used for krnl386 private data in Wine */
PVOID SystemReserved1[26]; /* 10c/ */
#endif
char PlaceholderCompatibilityMode; /* 174/0280 */
BOOLEAN PlaceholderHydrationAlwaysExplicit; /* 175/0281 */
@ -430,13 +430,13 @@ typedef struct _TEB { /* win32/win64 */
BYTE SpareBytes1[23]; /* 1b9/ */
ULONG TxFsContext; /* 1d0/ */
#endif
GDI_TEB_BATCH GdiTebBatch; /* 1d4/02f0 used for ntdll private data in Wine */
GDI_TEB_BATCH GdiTebBatch; /* 1d4/02f0 */
CLIENT_ID RealClientId; /* 6b4/07d8 */
HANDLE GdiCachedProcessHandle; /* 6bc/07e8 */
ULONG GdiClientPID; /* 6c0/07f0 */
ULONG GdiClientTID; /* 6c4/07f4 */
PVOID GdiThreadLocaleInfo; /* 6c8/07f8 */
ULONG_PTR Win32ClientInfo[62]; /* 6cc/0800 used for user32 private data in Wine */
ULONG_PTR Win32ClientInfo[62]; /* 6cc/0800 */
PVOID glDispatchTable[233]; /* 7c4/09f0 */
PVOID glReserved1[29]; /* b68/1138 */
PVOID glReserved2; /* bdc/1220 */
@ -511,8 +511,6 @@ static_assert(offsetof(TEB, DeallocationStack) ==
typedef u64(__stdcall* NtClose_t)(HANDLE Handle);
typedef u64(__stdcall* NtDelayExecution_t)(BOOL Alertable, PLARGE_INTEGER DelayInterval);
typedef u64(__stdcall* NtSetInformationFile_t)(HANDLE FileHandle, PIO_STATUS_BLOCK IoStatusBlock,
PVOID FileInformation, ULONG Length,
FILE_INFORMATION_CLASS FileInformationClass);
@ -525,7 +523,6 @@ typedef u64(__stdcall* NtCreateThread_t)(PHANDLE ThreadHandle, ACCESS_MASK Desir
typedef u64(__stdcall* NtTerminateThread_t)(HANDLE ThreadHandle, u64 ExitStatus);
extern NtClose_t NtClose;
extern NtDelayExecution_t NtDelayExecution;
extern NtSetInformationFile_t NtSetInformationFile;
extern NtCreateThread_t NtCreateThread;
extern NtTerminateThread_t NtTerminateThread;

View File

@ -147,6 +147,10 @@ void SetCurrentThreadName(const char* name) {
SetThreadDescription(GetCurrentThread(), UTF8ToUTF16W(name).data());
}
// Sets the description of the thread identified by the native handle `thread`.
// `name` is UTF-8; it is converted to UTF-16 before being handed to the OS.
void SetThreadName(void* thread, const char* name) {
    const auto wide_name = UTF8ToUTF16W(name);
    SetThreadDescription(thread, wide_name.data());
}
#else // !MSVC_VER, so must be POSIX threads
// MinGW with the POSIX threading model does not support pthread_setname_np
@ -170,11 +174,19 @@ void SetCurrentThreadName(const char* name) {
pthread_setname_np(pthread_self(), name);
#endif
}
// Names the thread identified by `thread` on POSIX builds.
// Currently a stub: both arguments are ignored and nothing is renamed.
void SetThreadName(void* thread, const char* name) {
// TODO: not implemented for POSIX threads yet; the call is a no-op.
}
#endif
#if defined(_WIN32)
void SetCurrentThreadName(const char*) {
// Do Nothing on MingW
// Do Nothing on MinGW
}
// MinGW variant of SetThreadName: deliberately empty, `thread` and `name` are unused.
void SetThreadName(void* thread, const char* name) {
// Do Nothing on MinGW
}
#endif

View File

@ -23,6 +23,8 @@ void SetCurrentThreadPriority(ThreadPriority new_priority);
void SetCurrentThreadName(const char* name);
void SetThreadName(void* thread, const char* name);
class AccurateTimer {
std::chrono::nanoseconds target_interval{};
std::chrono::nanoseconds total_wait{};

View File

@ -15,6 +15,7 @@ s64 Logger::write(const void* buf, size_t nbytes) {
log(static_cast<const char*>(buf), nbytes);
return nbytes;
}
size_t Logger::writev(const Libraries::Kernel::SceKernelIovec* iov, int iovcnt) {
for (int i = 0; i < iovcnt; i++) {
log(static_cast<const char*>(iov[i].iov_base), iov[i].iov_len);

View File

@ -0,0 +1,52 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "mutex.h"
#include "common/assert.h"
namespace Libraries::Kernel {
// Creates the underlying lock. On Windows this is an unnamed Win32 mutex that is
// not initially owned; on other platforms the std::timed_mutex member needs no setup.
TimedMutex::TimedMutex() {
#ifdef _WIN64
    mtx = ::CreateMutex(/*lpMutexAttributes=*/nullptr, /*bInitialOwner=*/false,
                        /*lpName=*/nullptr);
    // A null handle means the OS could not create the mutex.
    ASSERT(mtx);
#endif
}
// Closes the Win32 mutex handle. Nothing to do when backed by std::timed_mutex.
TimedMutex::~TimedMutex() {
#ifdef _WIN64
    ::CloseHandle(mtx);
#endif
}
// Blocks until the calling thread owns the mutex.
//
// On Windows the wait is alertable, so it can return early to run a queued APC;
// in that case the wait is simply restarted.
void TimedMutex::lock() {
#ifdef _WIN64
    for (;;) {
        const DWORD res = WaitForSingleObjectEx(mtx, INFINITE, true);
        // WAIT_ABANDONED also hands ownership to the caller (the previous owner
        // terminated without releasing). Waiting again would take the recursive
        // Win32 mutex a second time, and a single unlock() would never free it.
        if (res == WAIT_OBJECT_0 || res == WAIT_ABANDONED) {
            return;
        }
        // A failed wait can never succeed on retry; stop instead of spinning forever.
        ASSERT(res != WAIT_FAILED);
    }
#else
    mtx.lock();
#endif
}
// Attempts to take the mutex without blocking.
// Returns true when the calling thread now owns the mutex, false otherwise.
bool TimedMutex::try_lock() {
#ifdef _WIN64
    const DWORD res = WaitForSingleObjectEx(mtx, 0, true);
    // WAIT_ABANDONED means ownership was granted as well; reporting failure here
    // would leave the mutex locked with no one aware of owning it.
    return res == WAIT_OBJECT_0 || res == WAIT_ABANDONED;
#else
    return mtx.try_lock();
#endif
}
// Gives up ownership of the mutex held by the calling thread.
void TimedMutex::unlock() {
#ifndef _WIN64
    mtx.unlock();
#else
    ReleaseMutex(mtx);
#endif
}
} // namespace Libraries::Kernel

View File

@ -0,0 +1,80 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <chrono>
#include "common/types.h"
#ifdef _WIN64
#include <windows.h>
#else
#include <mutex>
#endif
namespace Libraries::Kernel {
/// Mutex with timed-lock support.
///
/// On Windows it wraps a Win32 mutex HANDLE and performs every wait in alertable
/// mode (bAlertable = true); a wait that returns only because an APC ran is retried.
/// On every other platform it forwards to std::timed_mutex.
class TimedMutex {
public:
    TimedMutex();
    ~TimedMutex();

    // The Windows variant owns a raw HANDLE that the destructor closes; a copy would
    // close it twice. std::timed_mutex is not copyable either, so this keeps all
    // platforms consistent.
    TimedMutex(const TimedMutex&) = delete;
    TimedMutex& operator=(const TimedMutex&) = delete;

    void lock();
    bool try_lock();
    void unlock();

    /// Tries to take the mutex, waiting at most `rel_time`.
    /// @return true if the mutex was acquired, false if the time ran out first.
    template <class Rep, class Period>
    bool try_lock_for(const std::chrono::duration<Rep, Period>& rel_time) {
#ifdef _WIN64
        constexpr auto zero = std::chrono::duration<Rep, Period>::zero();
        const auto now = std::chrono::steady_clock::now();
        std::chrono::steady_clock::time_point abs_time = now;
        if (rel_time > zero) {
            // Saturate instead of overflowing when the deadline is past the clock's range.
            constexpr auto max = (std::chrono::steady_clock::time_point::max)();
            if (abs_time < max - rel_time) {
                abs_time += rel_time;
            } else {
                abs_time = max;
            }
        }
        return try_lock_until(abs_time);
#else
        return mtx.try_lock_for(rel_time);
#endif
    }

    /// Tries to take the mutex, waiting until `abs_time` at the latest.
    /// A deadline that has already passed still results in one non-blocking attempt.
    /// @return true if the mutex was acquired, false otherwise.
    template <class Clock, class Duration>
    bool try_lock_until(const std::chrono::time_point<Clock, Duration>& abs_time) {
#ifdef _WIN64
        // The timeout parameter is a 32-bit DWORD and 0xFFFFFFFF (INFINITE) means
        // "no timeout", so one wait may cover at most INFINITE - 1 milliseconds.
        constexpr std::chrono::milliseconds max_slice{
            static_cast<std::chrono::milliseconds::rep>(INFINITE) - 1};
        for (;;) {
            const auto now = Clock::now();
            if (abs_time <= now) {
                return try_lock();
            }
            const auto remaining_ms =
                std::chrono::ceil<std::chrono::milliseconds>(abs_time - now);
            const bool clamped = remaining_ms > max_slice;
            const auto slice = clamped ? max_slice : remaining_ms;
            const DWORD res =
                WaitForSingleObjectEx(mtx, static_cast<DWORD>(slice.count()), true);
            // WAIT_ABANDONED grants ownership too; waiting again would acquire twice.
            if (res == WAIT_OBJECT_0 || res == WAIT_ABANDONED) {
                return true;
            }
            if (res == WAIT_FAILED || (res == WAIT_TIMEOUT && !clamped)) {
                return false;
            }
            // Woken by an APC, or only a clamped slice elapsed: re-evaluate the deadline.
        }
#else
        return mtx.try_lock_until(abs_time);
#endif
    }

private:
#ifdef _WIN64
    HANDLE mtx;
#else
    std::timed_mutex mtx;
#endif
};
} // namespace Libraries::Kernel

View File

@ -0,0 +1,151 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <atomic>
#include <chrono>
#include "common/assert.h"
#include "common/types.h"
#ifdef _WIN64
#include <windows.h>
#elif defined(__APPLE__)
#include <dispatch/dispatch.h>
#else
#include <semaphore>
#endif
namespace Libraries::Kernel {
/// Counting semaphore with an upper bound of `max`.
///
/// Backed by a Win32 semaphore on Windows (all waits are alertable and are resumed
/// after an APC interrupts them), by a libdispatch semaphore on Apple platforms and
/// by std::counting_semaphore everywhere else.
template <s64 max>
class Semaphore {
public:
    /// Creates the semaphore with `initialCount` available permits.
    Semaphore(s32 initialCount)
#if !defined(_WIN64) && !defined(__APPLE__)
        : sem{initialCount}
#endif
    {
#ifdef _WIN64
        sem = CreateSemaphore(nullptr, initialCount, max, nullptr);
        ASSERT(sem);
#elif defined(__APPLE__)
        sem = dispatch_semaphore_create(initialCount);
        ASSERT(sem);
#endif
    }

    ~Semaphore() {
#ifdef _WIN64
        CloseHandle(sem);
#elif defined(__APPLE__)
        // NOTE(review): libdispatch is documented to trap when a semaphore is disposed
        // while its count is below the creation value - confirm callers balance
        // acquire/release before destruction.
        dispatch_release(sem);
#endif
    }

    // The native handle is released exactly once by the destructor, so copies must
    // not exist. std::counting_semaphore is non-copyable as well.
    Semaphore(const Semaphore&) = delete;
    Semaphore& operator=(const Semaphore&) = delete;

    /// Returns one permit to the semaphore.
    void release() {
#ifdef _WIN64
        ReleaseSemaphore(sem, 1, nullptr);
#elif defined(__APPLE__)
        dispatch_semaphore_signal(sem);
#else
        sem.release();
#endif
    }

    /// Blocks until a permit has been taken.
    void acquire() {
#ifdef _WIN64
        for (;;) {
            const DWORD res = WaitForSingleObjectEx(sem, INFINITE, true);
            if (res == WAIT_OBJECT_0) {
                return;
            }
            // A failed wait cannot succeed on retry; do not spin forever on it.
            ASSERT(res != WAIT_FAILED);
        }
#elif defined(__APPLE__)
        for (;;) {
            const auto res = dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
            if (res == 0) {
                return;
            }
        }
#else
        sem.acquire();
#endif
    }

    /// Takes a permit if one is available right now; never blocks.
    bool try_acquire() {
#ifdef _WIN64
        return WaitForSingleObjectEx(sem, 0, true) == WAIT_OBJECT_0;
#elif defined(__APPLE__)
        return dispatch_semaphore_wait(sem, DISPATCH_TIME_NOW) == 0;
#else
        return sem.try_acquire();
#endif
    }

    /// Tries to take a permit, waiting at most `rel_time`.
    /// A zero or negative duration still performs one non-blocking attempt.
    /// @return true if a permit was taken, false if the time ran out first.
    template <class Rep, class Period>
    bool try_acquire_for(const std::chrono::duration<Rep, Period>& rel_time) {
#ifdef _WIN64
        // The timeout parameter is a 32-bit DWORD and 0xFFFFFFFF (INFINITE) means
        // "no timeout", so one wait may cover at most INFINITE - 1 milliseconds.
        constexpr std::chrono::milliseconds max_slice{
            static_cast<std::chrono::milliseconds::rep>(INFINITE) - 1};
        const auto start = std::chrono::steady_clock::now();
        auto remaining = std::chrono::ceil<std::chrono::milliseconds>(rel_time);
        while (remaining > std::chrono::milliseconds::zero()) {
            const auto slice = remaining > max_slice ? max_slice : remaining;
            const DWORD res =
                WaitForSingleObjectEx(sem, static_cast<DWORD>(slice.count()), true);
            if (res == WAIT_OBJECT_0) {
                return true;
            }
            if (res == WAIT_FAILED) {
                return false;
            }
            // Timed-out slice or APC wake-up: keep waiting for whatever time is left,
            // so an APC does not turn into a premature timeout.
            const auto elapsed = std::chrono::steady_clock::now() - start;
            remaining = std::chrono::ceil<std::chrono::milliseconds>(rel_time - elapsed);
        }
        return try_acquire();
#elif defined(__APPLE__)
        const auto rel_time_ns = std::chrono::ceil<std::chrono::nanoseconds>(rel_time).count();
        const auto timeout = dispatch_time(DISPATCH_TIME_NOW, rel_time_ns);
        return dispatch_semaphore_wait(sem, timeout) == 0;
#else
        return sem.try_acquire_for(rel_time);
#endif
    }

    /// Tries to take a permit, waiting until `abs_time` at the latest.
    /// A deadline that has already passed still performs one non-blocking attempt.
    /// @return true if a permit was taken, false otherwise.
    template <class Clock, class Duration>
    bool try_acquire_until(const std::chrono::time_point<Clock, Duration>& abs_time) {
#if defined(_WIN64) || defined(__APPLE__)
        // Convert the deadline to a relative wait measured on the caller's own clock.
        // Feeding the raw epoch value to a wall-clock API would be wrong for clocks
        // whose epoch is not the wall-clock epoch (e.g. a monotonic clock).
        const auto now = Clock::now();
        if (abs_time <= now) {
            return try_acquire();
        }
        return try_acquire_for(abs_time - now);
#else
        return sem.try_acquire_until(abs_time);
#endif
    }

private:
#ifdef _WIN64
    HANDLE sem;
#elif defined(__APPLE__)
    dispatch_semaphore_t sem;
#else
    std::counting_semaphore<max> sem;
#endif
};
using BinarySemaphore = Semaphore<1>;
using CountingSemaphore = Semaphore<0x7FFFFFFF /*ORBIS_KERNEL_SEM_VALUE_MAX*/>;
} // namespace Libraries::Kernel

View File

@ -191,7 +191,7 @@ int PthreadCond::Signal() {
PthreadMutex* mp = td->mutex_obj;
has_user_waiters = SleepqRemove(sq, td);
std::binary_semaphore* waddr = nullptr;
BinarySemaphore* waddr = nullptr;
if (mp->m_owner == curthread) {
if (curthread->nwaiter_defer >= Pthread::MaxDeferWaiters) {
curthread->WakeAll();
@ -211,7 +211,7 @@ int PthreadCond::Signal() {
struct BroadcastArg {
Pthread* curthread;
std::binary_semaphore* waddrs[Pthread::MaxDeferWaiters];
BinarySemaphore* waddrs[Pthread::MaxDeferWaiters];
int count;
};

View File

@ -118,7 +118,6 @@ public:
}
m_bits |= bits;
m_cond_var.notify_all();
}

View File

@ -380,6 +380,7 @@ int PS4_SYSV_ABI posix_sched_get_priority_min() {
// Renames a guest thread: logs the new name, forwards it to the host thread via
// Common::SetThreadName and stores it in the Pthread object. Always returns ORBIS_OK.
int PS4_SYSV_ABI posix_pthread_rename_np(PthreadT thread, const char* name) {
    LOG_INFO(Kernel_Pthread, "name = {}", name);
    void* const native_handle = reinterpret_cast<void*>(thread->native_thr.GetHandle());
    Common::SetThreadName(native_handle, name);
    thread->name = name;
    return ORBIS_OK;
}

View File

@ -11,6 +11,8 @@
#include <shared_mutex>
#include "common/enum.h"
#include "core/libraries/kernel/sync/mutex.h"
#include "core/libraries/kernel/sync/semaphore.h"
#include "core/libraries/kernel/time.h"
#include "core/thread.h"
#include "core/tls.h"
@ -44,7 +46,7 @@ enum class PthreadMutexProt : u32 {
};
struct PthreadMutex {
std::timed_mutex m_lock;
TimedMutex m_lock;
PthreadMutexFlags m_flags;
Pthread* m_owner;
int m_count;
@ -288,14 +290,14 @@ struct Pthread {
int report_events;
int event_mask;
std::string name;
std::binary_semaphore wake_sema{0};
BinarySemaphore wake_sema{0};
SleepQueue* sleepqueue;
void* wchan;
PthreadMutex* mutex_obj;
bool will_sleep;
bool has_user_waiters;
int nwaiter_defer;
std::binary_semaphore* defer_waiters[MaxDeferWaiters];
BinarySemaphore* defer_waiters[MaxDeferWaiters];
bool InCritical() const noexcept {
return locklevel > 0 || critical_count > 0;

View File

@ -6,6 +6,8 @@
#include <mutex>
#include <semaphore>
#include "core/libraries/kernel/sync/semaphore.h"
#include "common/logging/log.h"
#include "core/libraries/kernel/kernel.h"
#include "core/libraries/kernel/orbis_error.h"
@ -21,7 +23,7 @@ constexpr int ORBIS_KERNEL_SEM_VALUE_MAX = 0x7FFFFFFF;
struct PthreadSem {
explicit PthreadSem(s32 value_) : semaphore{value_}, value{value_} {}
std::counting_semaphore<ORBIS_KERNEL_SEM_VALUE_MAX> semaphore;
CountingSemaphore semaphore;
std::atomic<s32> value;
};
@ -75,7 +77,7 @@ public:
it = wait_list.erase(it);
token_count -= waiter->need_count;
waiter->was_signaled = true;
waiter->cv.notify_one();
waiter->sem.release();
}
return true;
@ -88,7 +90,7 @@ public:
}
for (auto* waiter : wait_list) {
waiter->was_cancled = true;
waiter->cv.notify_one();
waiter->sem.release();
}
wait_list.clear();
token_count = set_count < 0 ? init_count : set_count;
@ -99,21 +101,21 @@ public:
std::scoped_lock lk{mutex};
for (auto* waiter : wait_list) {
waiter->was_deleted = true;
waiter->cv.notify_one();
waiter->sem.release();
}
wait_list.clear();
}
public:
struct WaitingThread {
std::condition_variable cv;
BinarySemaphore sem;
u32 priority;
s32 need_count;
bool was_signaled{};
bool was_deleted{};
bool was_cancled{};
explicit WaitingThread(s32 need_count, bool is_fifo) : need_count{need_count} {
explicit WaitingThread(s32 need_count, bool is_fifo) : sem{0}, need_count{need_count} {
// Retrieve calling thread priority for sorting into waiting threads list.
if (!is_fifo) {
priority = g_curthread->attr.prio;
@ -134,24 +136,26 @@ public:
}
int Wait(std::unique_lock<std::mutex>& lk, u32* timeout) {
lk.unlock();
if (!timeout) {
// Wait indefinitely until we are woken up.
cv.wait(lk);
sem.acquire();
lk.lock();
return GetResult(false);
}
// Wait until timeout runs out, recording how much remaining time there was.
const auto start = std::chrono::high_resolution_clock::now();
const auto signaled = cv.wait_for(lk, std::chrono::microseconds(*timeout),
[this] { return was_signaled; });
sem.try_acquire_for(std::chrono::microseconds(*timeout));
const auto end = std::chrono::high_resolution_clock::now();
const auto time =
std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
if (signaled) {
lk.lock();
if (was_signaled) {
*timeout -= time;
} else {
*timeout = 0;
}
return GetResult(!signaled);
return GetResult(!was_signaled);
}
};

View File

@ -52,7 +52,22 @@ u64 PS4_SYSV_ABI sceKernelReadTsc() {
int PS4_SYSV_ABI sceKernelUsleep(u32 microseconds) {
#ifdef _WIN64
std::this_thread::sleep_for(std::chrono::microseconds(microseconds));
const auto start_time = std::chrono::high_resolution_clock::now();
auto total_wait_time = std::chrono::microseconds(microseconds);
while (total_wait_time.count() > 0) {
auto wait_time = std::chrono::ceil<std::chrono::milliseconds>(total_wait_time).count();
u64 res = SleepEx(static_cast<u64>(wait_time), true);
if (res == WAIT_IO_COMPLETION) {
auto elapsedTime = std::chrono::high_resolution_clock::now() - start_time;
auto elapsedMicroseconds =
std::chrono::duration_cast<std::chrono::microseconds>(elapsedTime).count();
total_wait_time = std::chrono::microseconds(microseconds - elapsedMicroseconds);
} else {
break;
}
}
return 0;
#else
timespec start;

View File

@ -470,8 +470,8 @@ OrbisKernelModuleInfoEx Module::GetModuleInfoEx() const {
.tls_align = tls.align,
.init_proc_addr = base_virtual_addr + dynamic_info.init_virtual_addr,
.fini_proc_addr = base_virtual_addr + dynamic_info.fini_virtual_addr,
.eh_frame_hdr_addr = eh_frame_hdr_addr,
.eh_frame_addr = eh_frame_addr,
.eh_frame_hdr_addr = base_virtual_addr + eh_frame_hdr_addr,
.eh_frame_addr = base_virtual_addr + eh_frame_addr,
.eh_frame_hdr_size = eh_frame_hdr_size,
.eh_frame_size = eh_frame_size,
.segments = info.segments,

View File

@ -266,7 +266,7 @@ void Emulator::Run(const std::filesystem::path& file) {
}
void Emulator::LoadSystemModules(const std::filesystem::path& file, std::string game_serial) {
constexpr std::array<SysModules, 10> ModulesToLoad{
constexpr std::array<SysModules, 14> ModulesToLoad{
{{"libSceNgs2.sprx", &Libraries::Ngs2::RegisterlibSceNgs2},
{"libSceFiber.sprx", &Libraries::Fiber::RegisterlibSceFiber},
{"libSceUlt.sprx", nullptr},

View File

@ -326,7 +326,9 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst*, u32 handle, Id address) {
Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
const auto& buffer = ctx.texture_buffers[handle];
const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset);
const Id coord =
ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift),
buffer.coord_offset);
Id texel = buffer.is_storage ? ctx.OpImageRead(buffer.result_type, tex_buffer, coord)
: ctx.OpImageFetch(buffer.result_type, tex_buffer, coord);
if (buffer.is_integer) {
@ -372,7 +374,9 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
const auto& buffer = ctx.texture_buffers[handle];
const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset);
const Id coord =
ctx.OpIAdd(ctx.U32[1], ctx.OpShiftLeftLogical(ctx.U32[1], address, buffer.coord_shift),
buffer.coord_offset);
if (buffer.is_integer) {
value = ctx.OpBitcast(buffer.result_type, value);
}

View File

@ -207,6 +207,8 @@ void EmitContext::DefineBufferOffsets() {
push_data_block, ConstU32(half), ConstU32(comp))};
const Id value{OpLoad(U32[1], ptr)};
tex_buffer.coord_offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(6U));
tex_buffer.coord_shift =
OpBitFieldUExtract(U32[1], value, ConstU32(offset + 6U), ConstU32(2U));
Name(tex_buffer.coord_offset, fmt::format("texbuf{}_off", binding));
}
}

View File

@ -223,6 +223,7 @@ public:
struct TextureBufferDefinition {
Id id;
Id coord_offset;
Id coord_shift;
u32 binding;
Id image_type;
Id result_type;

View File

@ -58,19 +58,6 @@ struct FetchShaderData {
}) != attributes.end();
}
[[nodiscard]] std::pair<u32, u32> GetDrawOffsets(const AmdGpu::Liverpool::Regs& regs,
const Info& info) const {
u32 vertex_offset = regs.index_offset;
u32 instance_offset = 0;
if (vertex_offset == 0 && vertex_offset_sgpr != -1) {
vertex_offset = info.user_data[vertex_offset_sgpr];
}
if (instance_offset_sgpr != -1) {
instance_offset = info.user_data[instance_offset_sgpr];
}
return {vertex_offset, instance_offset};
}
bool operator==(const FetchShaderData& other) const {
return attributes == other.attributes && vertex_offset_sgpr == other.vertex_offset_sgpr &&
instance_offset_sgpr == other.instance_offset_sgpr;

View File

@ -50,6 +50,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
return S_OR_B64(NegateMode::None, false, inst);
case Opcode::S_XOR_B32:
return S_XOR_B32(inst);
case Opcode::S_NOT_B32:
return S_NOT_B32(inst);
case Opcode::S_XOR_B64:
return S_OR_B64(NegateMode::None, true, inst);
case Opcode::S_ANDN2_B32:
@ -94,6 +96,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) {
return S_BREV_B32(inst);
case Opcode::S_BCNT1_I32_B64:
return S_BCNT1_I32_B64(inst);
case Opcode::S_FF1_I32_B64:
return S_FF1_I32_B64(inst);
case Opcode::S_AND_SAVEEXEC_B64:
return S_SAVEEXEC_B64(NegateMode::None, false, inst);
case Opcode::S_ORN2_SAVEEXEC_B64:
@ -301,6 +305,10 @@ void Translator::S_AND_B64(NegateMode negate, const GcnInst& inst) {
ASSERT_MSG(-s32(operand.code) + SignedConstIntNegMin - 1 == -1,
"SignedConstIntNeg must be -1");
return ir.Imm1(true);
case OperandField::LiteralConst:
ASSERT_MSG(operand.code == 0 || operand.code == std::numeric_limits<u32>::max(),
"Unsupported literal {:#x}", operand.code);
return ir.Imm1(operand.code & 1);
default:
UNREACHABLE();
}
@ -382,6 +390,13 @@ void Translator::S_XOR_B32(const GcnInst& inst) {
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
void Translator::S_NOT_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 result{ir.BitwiseNot(src0)};
SetDst(inst.dst[0], result);
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
void Translator::S_LSHL_B32(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 src1{GetSrc(inst.src[1])};
@ -560,6 +575,12 @@ void Translator::S_BCNT1_I32_B64(const GcnInst& inst) {
ir.SetScc(ir.INotEqual(result, ir.Imm32(0)));
}
void Translator::S_FF1_I32_B64(const GcnInst& inst) {
const IR::U32 src0{GetSrc(inst.src[0])};
const IR::U32 result{ir.Select(ir.IEqual(src0, ir.Imm32(0U)), ir.Imm32(-1), ir.FindILsb(src0))};
SetDst(inst.dst[0], result);
}
void Translator::S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst) {
// This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs)
// However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination

View File

@ -96,6 +96,7 @@ public:
void S_MUL_I32(const GcnInst& inst);
void S_BFE_U32(const GcnInst& inst);
void S_ABSDIFF_I32(const GcnInst& inst);
void S_NOT_B32(const GcnInst& inst);
// SOPK
void S_MOVK(const GcnInst& inst);
@ -109,6 +110,7 @@ public:
void S_NOT_B64(const GcnInst& inst);
void S_BREV_B32(const GcnInst& inst);
void S_BCNT1_I32_B64(const GcnInst& inst);
void S_FF1_I32_B64(const GcnInst& inst);
void S_GETPC_B64(u32 pc, const GcnInst& inst);
void S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst);

View File

@ -105,6 +105,11 @@ struct PushData {
ASSERT(offset < 256 && binding < buf_offsets.size());
buf_offsets[binding] = offset;
}
// Packs the texel-buffer addressing parameters for `binding` into buf_offsets:
// bits [0,6) hold `texel_offset`, the bits from 6 upward hold log2 of `multiplier`
// rounded down (the shift applied to the element index).
void AddTexelOffset(u32 binding, u32 multiplier, u32 texel_offset) {
    // Same bounds check on `binding` as AddOffset, so a bad index cannot write
    // outside buf_offsets.
    ASSERT(texel_offset < 64 && multiplier < 16 && binding < buf_offsets.size());
    // bit_width(0) - 1 would be -1 and shift a negative value; treat a multiplier
    // of 0 like 1 (no shift). All multipliers >= 1 encode exactly as before.
    const u32 shift = multiplier > 1 ? static_cast<u32>(std::bit_width(multiplier)) - 1 : 0;
    buf_offsets[binding] = texel_offset | (shift << 6);
}
};
static_assert(sizeof(PushData) <= 128,
"PushData size is greater than minimum size guaranteed by Vulkan spec");

View File

@ -9,7 +9,6 @@
#include "frontend/fetch_shader.h"
#include "shader_recompiler/backend/bindings.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/passes/srt.h"
namespace Shader {
@ -22,8 +21,12 @@ struct VsAttribSpecialization {
struct BufferSpecialization {
u16 stride : 14;
u16 is_storage : 1;
u32 size = 0;
auto operator<=>(const BufferSpecialization&) const = default;
// Compatibility check between two buffer specializations. Stride and storage class
// must match exactly. For non-storage buffers the size recorded in `this` must be
// at least the size in `other`; storage buffers ignore the size. The relation is
// therefore intentionally asymmetric.
// NOTE(review): assumes `this` is the already-compiled specialization and `other`
// the one being looked up - confirm against the pipeline cache lookup.
bool operator==(const BufferSpecialization& other) const {
    // Previously `size` was compared against `other.is_storage` (a 1-bit flag),
    // which made the size constraint always pass.
    return stride == other.stride && is_storage == other.is_storage &&
           (is_storage || size >= other.size);
}
};
struct TextureBufferSpecialization {
@ -57,7 +60,7 @@ struct StageSpecialization {
const Shader::Info* info;
RuntimeInfo runtime_info;
Gcn::FetchShaderData fetch_shader_data{};
std::optional<Gcn::FetchShaderData> fetch_shader_data{};
boost::container::small_vector<VsAttribSpecialization, 32> vs_attribs;
std::bitset<MaxStageResources> bitset{};
boost::container::small_vector<BufferSpecialization, 16> buffers;
@ -69,15 +72,14 @@ struct StageSpecialization {
explicit StageSpecialization(const Info& info_, RuntimeInfo runtime_info_,
const Profile& profile_, Backend::Bindings start_)
: info{&info_}, runtime_info{runtime_info_}, start{start_} {
if (const auto fetch_shader = Gcn::ParseFetchShader(info_)) {
fetch_shader_data = *fetch_shader;
if (info_.stage == Stage::Vertex && !profile_.support_legacy_vertex_attributes) {
// Specialize shader on VS input number types to follow spec.
ForEachSharp(vs_attribs, fetch_shader_data.attributes,
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
spec.num_class = AmdGpu::GetNumberClass(sharp.GetNumberFmt());
});
}
fetch_shader_data = Gcn::ParseFetchShader(info_);
if (info_.stage == Stage::Vertex && fetch_shader_data &&
!profile_.support_legacy_vertex_attributes) {
// Specialize shader on VS input number types to follow spec.
ForEachSharp(vs_attribs, fetch_shader_data->attributes,
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
spec.num_class = AmdGpu::GetNumberClass(sharp.GetNumberFmt());
});
}
u32 binding{};
if (info->has_readconst) {
@ -87,6 +89,9 @@ struct StageSpecialization {
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
spec.stride = sharp.GetStride();
spec.is_storage = desc.IsStorage(sharp);
if (!spec.is_storage) {
spec.size = sharp.GetSize();
}
});
ForEachSharp(binding, tex_buffers, info->texture_buffers,
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {

View File

@ -163,8 +163,8 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
}
case PM4ItOpcode::IndirectBufferConst: {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessCeUpdate(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
auto task =
ProcessCeUpdate({indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
while (!task.handle.done()) {
task.handle.resume();
@ -565,7 +565,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
case PM4ItOpcode::DmaData: {
const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
if (dma_data->dst_addr_lo == 0x3022C) {
if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) {
break;
}
if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) {
@ -700,7 +700,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, int vqid) {
}
case PM4ItOpcode::DmaData: {
const auto* dma_data = reinterpret_cast<const PM4DmaData*>(header);
if (dma_data->dst_addr_lo == 0x3022C) {
if (dma_data->dst_addr_lo == 0x3022C || !rasterizer) {
break;
}
if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) {

View File

@ -42,7 +42,7 @@ public:
struct Traits {
using Entry = BufferId;
static constexpr size_t AddressSpaceBits = 39;
static constexpr size_t AddressSpaceBits = 40;
static constexpr size_t FirstLevelBits = 14;
static constexpr size_t PageBits = CACHING_PAGEBITS;
};

View File

@ -14,7 +14,7 @@ namespace VideoCore {
class MemoryTracker {
public:
static constexpr size_t MAX_CPU_PAGE_BITS = 39;
static constexpr size_t MAX_CPU_PAGE_BITS = 40;
static constexpr size_t HIGHER_PAGE_BITS = 22;
static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS;
static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL;

View File

@ -29,7 +29,7 @@ namespace VideoCore {
constexpr size_t PAGESIZE = 4_KB;
constexpr size_t PAGEBITS = 12;
#if ENABLE_USERFAULTFD
#ifdef ENABLE_USERFAULTFD
struct PageManager::Impl {
Impl(Vulkan::Rasterizer* rasterizer_) : rasterizer{rasterizer_} {
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

View File

@ -279,6 +279,8 @@ bool PipelineCache::RefreshGraphicsKey() {
++remapped_cb;
}
fetch_shader = std::nullopt;
Shader::Backend::Bindings binding{};
const auto& TryBindStageRemap = [&](Shader::Stage stage_in, Shader::Stage stage_out) -> bool {
const auto stage_in_idx = static_cast<u32>(stage_in);

View File

@ -171,6 +171,22 @@ RenderState Rasterizer::PrepareRenderState(u32 mrt_mask) {
return state;
}
// Computes the {vertex offset, instance offset} pair for a draw.
// The vertex offset starts as regs.index_offset and the instance offset as 0. When a
// fetch shader is present, its SGPR indices (-1 meaning "none") select replacement
// values from info.user_data; the vertex offset is only replaced when the register
// value is 0.
[[nodiscard]] std::pair<u32, u32> GetDrawOffsets(
    const AmdGpu::Liverpool::Regs& regs, const Shader::Info& info,
    const std::optional<Shader::Gcn::FetchShaderData>& fetch_shader) {
    u32 instance_offset = 0;
    u32 vertex_offset = regs.index_offset;
    if (fetch_shader.has_value()) {
        const auto& fetch = *fetch_shader;
        const bool take_vertex_from_sgpr = vertex_offset == 0 && fetch.vertex_offset_sgpr != -1;
        if (take_vertex_from_sgpr) {
            vertex_offset = info.user_data[fetch.vertex_offset_sgpr];
        }
        const bool take_instance_from_sgpr = fetch.instance_offset_sgpr != -1;
        if (take_instance_from_sgpr) {
            instance_offset = info.user_data[fetch.instance_offset_sgpr];
        }
    }
    return {vertex_offset, instance_offset};
}
void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
RENDERER_TRACE;
@ -198,7 +214,7 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
BeginRendering(*pipeline, state);
UpdateDynamicState(*pipeline);
const auto [vertex_offset, instance_offset] = fetch_shader->GetDrawOffsets(regs, vs_info);
const auto [vertex_offset, instance_offset] = GetDrawOffsets(regs, vs_info, fetch_shader);
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline->Handle());
@ -532,12 +548,13 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(
vsharp.base_address, vsharp.GetSize(), desc.is_written, true, buffer_id);
const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3;
ASSERT_MSG(fmt_stride == vsharp.GetStride(),
const u32 buf_stride = vsharp.GetStride();
ASSERT_MSG(buf_stride % fmt_stride == 0,
"Texel buffer stride must match format stride");
const u32 offset_aligned = Common::AlignDown(offset, alignment);
const u32 adjust = offset - offset_aligned;
ASSERT(adjust % fmt_stride == 0);
push_data.AddOffset(binding.buffer, adjust / fmt_stride);
push_data.AddTexelOffset(binding.buffer, buf_stride / fmt_stride, adjust / fmt_stride);
buffer_view =
vk_buffer->View(offset_aligned, vsharp.GetSize() + adjust, desc.is_written,
vsharp.GetDataFmt(), vsharp.GetNumberFmt());

View File

@ -392,7 +392,8 @@ std::pair<vk::Buffer, u32> TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o
const auto* detiler = GetDetiler(image);
if (!detiler) {
if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled &&
image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled) {
image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled &&
image.info.tiling_mode != AmdGpu::TilingMode::Depth_MacroTiled) {
LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
}