diff --git a/CMakeLists.txt b/CMakeLists.txt index 6780db417..8daa98dea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,9 +54,9 @@ else() endif() if (ARCHITECTURE STREQUAL "x86_64") - # Target the same CPU architecture as the PS4, to maintain the same level of compatibility. - # Exclude SSE4a as it is only available on AMD CPUs. - add_compile_options(-march=btver2 -mtune=generic -mno-sse4a) + # Target x86-64-v3 CPU architecture as this is a good balance between supporting performance critical + # instructions like AVX2 and maintaining support for older CPUs. + add_compile_options(-march=x86-64-v3) endif() if (APPLE AND ARCHITECTURE STREQUAL "x86_64" AND CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64") diff --git a/documents/Quickstart/Quickstart.md b/documents/Quickstart/Quickstart.md index 62df95e71..e2145ebbd 100644 --- a/documents/Quickstart/Quickstart.md +++ b/documents/Quickstart/Quickstart.md @@ -21,9 +21,9 @@ SPDX-License-Identifier: GPL-2.0-or-later - A processor with at least 4 cores and 6 threads - Above 2.5 GHz frequency -- A CPU supporting the following instruction sets: MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, F16C, CLMUL, AES, BMI1, MOVBE, XSAVE, ABM +- A CPU supporting the x86-64-v3 baseline. - **Intel**: Haswell generation or newer - - **AMD**: Jaguar generation or newer + - **AMD**: Excavator generation or newer - **Apple**: Rosetta 2 on macOS 15.4 or newer ### GPU @@ -55,4 +55,4 @@ To configure the emulator, you can go through the interface and go to "settings" You can also configure the emulator by editing the `config.toml` file located in the `user` folder created after the application is started (Mostly useful if you are using the SDL version). Some settings may be related to more technical development and debugging.\ -For more information on this, see [**Debugging**](https://github.com/shadps4-emu/shadPS4/blob/main/documents/Debugging/Debugging.md#configuration). \ No newline at end of file +For more information on this, see [**Debugging**](https://github.com/shadps4-emu/shadPS4/blob/main/documents/Debugging/Debugging.md#configuration). diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index f2f40e0e3..9cf340050 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -179,7 +179,7 @@ s32 PS4_SYSV_ABI sceGnmComputeWaitOnAddress(u32* cmdbuf, u32 size, uintptr_t add auto* wait_reg_mem = reinterpret_cast(cmdbuf); wait_reg_mem->header = PM4Type3Header{PM4ItOpcode::WaitRegMem, 5}; wait_reg_mem->raw = (is_mem << 4u) | (cmp_func & 7u); - wait_reg_mem->poll_addr_lo = u32(addr & addr_mask); + wait_reg_mem->poll_addr_lo_raw = u32(addr & addr_mask); wait_reg_mem->poll_addr_hi = u32(addr >> 32u); wait_reg_mem->ref = ref; wait_reg_mem->mask = mask; diff --git a/src/core/libraries/kernel/equeue.cpp b/src/core/libraries/kernel/equeue.cpp index a4916208a..958019cd3 100644 --- a/src/core/libraries/kernel/equeue.cpp +++ b/src/core/libraries/kernel/equeue.cpp @@ -12,12 +12,25 @@ namespace Libraries::Kernel { +extern boost::asio::io_context io_context; +extern void KernelSignalRequest(); + +static constexpr auto HrTimerSpinlockThresholdUs = 1200u; + // Events are uniquely identified by id and filter. bool EqueueInternal::AddEvent(EqueueEvent& event) { std::scoped_lock lock{m_mutex}; event.time_added = std::chrono::steady_clock::now(); + if (event.event.filter == SceKernelEvent::Filter::Timer || + event.event.filter == SceKernelEvent::Filter::HrTimer) { + // HrTimer events are offset by the threshold of time at the end that we spinlock for + // greater accuracy. + const auto offset = + event.event.filter == SceKernelEvent::Filter::HrTimer ? HrTimerSpinlockThresholdUs : 0u; + event.timer_interval = std::chrono::microseconds(event.event.data - offset); + } const auto& it = std::ranges::find(m_events, event); if (it != m_events.cend()) { @@ -29,6 +42,47 @@ bool EqueueInternal::AddEvent(EqueueEvent& event) { return true; } +bool EqueueInternal::ScheduleEvent(u64 id, s16 filter, + void (*callback)(SceKernelEqueue, const SceKernelEvent&)) { + std::scoped_lock lock{m_mutex}; + + const auto& it = std::ranges::find_if(m_events, [id, filter](auto& ev) { + return ev.event.ident == id && ev.event.filter == filter; + }); + if (it == m_events.cend()) { + return false; + } + + const auto& event = *it; + ASSERT(event.event.filter == SceKernelEvent::Filter::Timer || + event.event.filter == SceKernelEvent::Filter::HrTimer); + + if (!it->timer) { + it->timer = std::make_unique(io_context, event.timer_interval); + } else { + // If the timer already exists we are scheduling a reoccurrence after the next period. + // Set the expiration time to the previous occurrence plus the period. + it->timer->expires_at(it->timer->expiry() + event.timer_interval); + } + + it->timer->async_wait( + [this, event_data = event.event, callback](const boost::system::error_code& ec) { + if (ec) { + if (ec != boost::system::errc::operation_canceled) { + LOG_ERROR(Kernel_Event, "Timer callback error: {}", ec.message()); + } else { + // Timer was cancelled (removed) before it triggered + LOG_DEBUG(Kernel_Event, "Timer cancelled"); + } + return; + } + callback(this, event_data); + }); + KernelSignalRequest(); + + return true; +} + bool EqueueInternal::RemoveEvent(u64 id, s16 filter) { bool has_found = false; std::scoped_lock lock{m_mutex}; @@ -152,18 +206,14 @@ int EqueueInternal::WaitForSmallTimer(SceKernelEvent* ev, int num, u32 micros) { return count; } -extern boost::asio::io_context io_context; -extern void KernelSignalRequest(); +bool EqueueInternal::EventExists(u64 id, s16 filter) { + std::scoped_lock lock{m_mutex}; -static constexpr auto HrTimerSpinlockThresholdUs = 1200u; + const auto& it = std::ranges::find_if(m_events, [id, filter](auto& ev) { + return ev.event.ident == id && ev.event.filter == filter; + }); -static void SmallTimerCallback(const boost::system::error_code& error, SceKernelEqueue eq, - SceKernelEvent kevent) { - static EqueueEvent event; - event.event = kevent; - event.event.data = HrTimerSpinlockThresholdUs; - eq->AddSmallTimer(event); - eq->TriggerEvent(kevent.ident, SceKernelEvent::Filter::HrTimer, kevent.udata); + return it != m_events.cend(); } int PS4_SYSV_ABI sceKernelCreateEqueue(SceKernelEqueue* eq, const char* name) { @@ -243,6 +293,14 @@ int PS4_SYSV_ABI sceKernelWaitEqueue(SceKernelEqueue eq, SceKernelEvent* ev, int return ORBIS_OK; } +static void HrTimerCallback(SceKernelEqueue eq, const SceKernelEvent& kevent) { + static EqueueEvent event; + event.event = kevent; + event.event.data = HrTimerSpinlockThresholdUs; + eq->AddSmallTimer(event); + eq->TriggerEvent(kevent.ident, SceKernelEvent::Filter::HrTimer, kevent.udata); +} + s32 PS4_SYSV_ABI sceKernelAddHRTimerEvent(SceKernelEqueue eq, int id, timespec* ts, void* udata) { if (eq == nullptr) { return ORBIS_KERNEL_ERROR_EBADF; @@ -273,17 +331,10 @@ s32 PS4_SYSV_ABI sceKernelAddHRTimerEvent(SceKernelEqueue eq, int id, timespec* return eq->AddSmallTimer(event) ? ORBIS_OK : ORBIS_KERNEL_ERROR_ENOMEM; } - event.timer = std::make_unique( - io_context, std::chrono::microseconds(total_us - HrTimerSpinlockThresholdUs)); - - event.timer->async_wait(std::bind(SmallTimerCallback, std::placeholders::_1, eq, event.event)); - - if (!eq->AddEvent(event)) { + if (!eq->AddEvent(event) || + !eq->ScheduleEvent(id, SceKernelEvent::Filter::HrTimer, HrTimerCallback)) { return ORBIS_KERNEL_ERROR_ENOMEM; } - - KernelSignalRequest(); - return ORBIS_OK; } @@ -300,6 +351,57 @@ int PS4_SYSV_ABI sceKernelDeleteHRTimerEvent(SceKernelEqueue eq, int id) { } } +static void TimerCallback(SceKernelEqueue eq, const SceKernelEvent& kevent) { + if (eq->EventExists(kevent.ident, kevent.filter)) { + eq->TriggerEvent(kevent.ident, SceKernelEvent::Filter::Timer, kevent.udata); + + if (!(kevent.flags & SceKernelEvent::Flags::OneShot)) { + // Reschedule the event for its next period. + eq->ScheduleEvent(kevent.ident, kevent.filter, TimerCallback); + } + } +} + +int PS4_SYSV_ABI sceKernelAddTimerEvent(SceKernelEqueue eq, int id, SceKernelUseconds usec, + void* udata) { + if (eq == nullptr) { + return ORBIS_KERNEL_ERROR_EBADF; + } + + EqueueEvent event{}; + event.event.ident = static_cast(id); + event.event.filter = SceKernelEvent::Filter::Timer; + event.event.flags = SceKernelEvent::Flags::Add; + event.event.fflags = 0; + event.event.data = usec; + event.event.udata = udata; + + if (eq->EventExists(event.event.ident, event.event.filter)) { + eq->RemoveEvent(id, SceKernelEvent::Filter::Timer); + LOG_DEBUG(Kernel_Event, + "Timer event already exists, removing it: queue name={}, queue id={}", + eq->GetName(), event.event.ident); + } + + LOG_DEBUG(Kernel_Event, "Added timing event: queue name={}, queue id={}, usec={}, pointer={:x}", + eq->GetName(), event.event.ident, usec, reinterpret_cast(udata)); + + if (!eq->AddEvent(event) || + !eq->ScheduleEvent(id, SceKernelEvent::Filter::Timer, TimerCallback)) { + return ORBIS_KERNEL_ERROR_ENOMEM; + } + return ORBIS_OK; +} + +int PS4_SYSV_ABI sceKernelDeleteTimerEvent(SceKernelEqueue eq, int id) { + if (eq == nullptr) { + return ORBIS_KERNEL_ERROR_EBADF; + } + + return eq->RemoveEvent(id, SceKernelEvent::Filter::Timer) ? ORBIS_OK + : ORBIS_KERNEL_ERROR_ENOENT; +} + int PS4_SYSV_ABI sceKernelAddUserEvent(SceKernelEqueue eq, int id) { if (eq == nullptr) { return ORBIS_KERNEL_ERROR_EBADF; @@ -380,6 +482,8 @@ void RegisterEventQueue(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("WDszmSbWuDk", "libkernel", 1, "libkernel", 1, 1, sceKernelAddUserEventEdge); LIB_FUNCTION("R74tt43xP6k", "libkernel", 1, "libkernel", 1, 1, sceKernelAddHRTimerEvent); LIB_FUNCTION("J+LF6LwObXU", "libkernel", 1, "libkernel", 1, 1, sceKernelDeleteHRTimerEvent); + LIB_FUNCTION("57ZK+ODEXWY", "libkernel", 1, "libkernel", 1, 1, sceKernelAddTimerEvent); + LIB_FUNCTION("YWQFUyXIVdU", "libkernel", 1, "libkernel", 1, 1, sceKernelDeleteTimerEvent); LIB_FUNCTION("F6e0kwo4cnk", "libkernel", 1, "libkernel", 1, 1, sceKernelTriggerUserEvent); LIB_FUNCTION("LJDwdSNTnDg", "libkernel", 1, "libkernel", 1, 1, sceKernelDeleteUserEvent); LIB_FUNCTION("mJ7aghmgvfc", "libkernel", 1, "libkernel", 1, 1, sceKernelGetEventId); diff --git a/src/core/libraries/kernel/equeue.h b/src/core/libraries/kernel/equeue.h index 2bd7ef510..a0367c66a 100644 --- a/src/core/libraries/kernel/equeue.h +++ b/src/core/libraries/kernel/equeue.h @@ -21,6 +21,9 @@ namespace Libraries::Kernel { class EqueueInternal; struct EqueueEvent; +using SceKernelUseconds = u32; +using SceKernelEqueue = EqueueInternal*; + struct SceKernelEvent { enum Filter : s16 { None = 0, @@ -77,6 +80,7 @@ struct EqueueEvent { SceKernelEvent event; void* data = nullptr; std::chrono::steady_clock::time_point time_added; + std::chrono::microseconds timer_interval; std::unique_ptr timer; void ResetTriggerState() { @@ -133,6 +137,8 @@ public: } bool AddEvent(EqueueEvent& event); + bool ScheduleEvent(u64 id, s16 filter, + void (*callback)(SceKernelEqueue, const SceKernelEvent&)); bool RemoveEvent(u64 id, s16 filter); int WaitForEvents(SceKernelEvent* ev, int num, u32 micros); bool TriggerEvent(u64 ident, s16 filter, void* trigger_data); @@ -152,6 +158,8 @@ public: int WaitForSmallTimer(SceKernelEvent* ev, int num, u32 micros); + bool EventExists(u64 id, s16 filter); + private: std::string m_name; std::mutex m_mutex; @@ -160,9 +168,6 @@ private: std::condition_variable m_cond; }; -using SceKernelUseconds = u32; -using SceKernelEqueue = EqueueInternal*; - u64 PS4_SYSV_ABI sceKernelGetEventData(const SceKernelEvent* ev); void RegisterEventQueue(Core::Loader::SymbolsResolver* sym); diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp index dd0e07302..bddfcfc84 100644 --- a/src/core/libraries/kernel/memory.cpp +++ b/src/core/libraries/kernel/memory.cpp @@ -290,6 +290,12 @@ int PS4_SYSV_ABI sceKernelGetDirectMemoryType(u64 addr, int* directMemoryTypeOut directMemoryEndOut); } +int PS4_SYSV_ABI sceKernelIsStack(void* addr, void** start, void** end) { + LOG_DEBUG(Kernel_Vmm, "called, addr = {}", fmt::ptr(addr)); + auto* memory = Core::Memory::Instance(); + return memory->IsStack(std::bit_cast(addr), start, end); +} + s32 PS4_SYSV_ABI sceKernelBatchMap(OrbisKernelBatchMapEntry* entries, int numEntries, int* numEntriesOut) { return sceKernelBatchMap2(entries, numEntries, numEntriesOut, @@ -636,6 +642,7 @@ void RegisterMemory(Core::Loader::SymbolsResolver* sym) { LIB_FUNCTION("7oxv3PPCumo", "libkernel", 1, "libkernel", 1, 1, sceKernelReserveVirtualRange); LIB_FUNCTION("BC+OG5m9+bw", "libkernel", 1, "libkernel", 1, 1, sceKernelGetDirectMemoryType); LIB_FUNCTION("pO96TwzOm5E", "libkernel", 1, "libkernel", 1, 1, sceKernelGetDirectMemorySize); + LIB_FUNCTION("yDBwVAolDgg", "libkernel", 1, "libkernel", 1, 1, sceKernelIsStack); LIB_FUNCTION("NcaWUxfMNIQ", "libkernel", 1, "libkernel", 1, 1, sceKernelMapNamedDirectMemory); LIB_FUNCTION("L-Q3LEjIbgA", "libkernel", 1, "libkernel", 1, 1, sceKernelMapDirectMemory); LIB_FUNCTION("WFcfL2lzido", "libkernel", 1, "libkernel", 1, 1, sceKernelQueryMemoryProtection); diff --git a/src/core/libraries/kernel/memory.h b/src/core/libraries/kernel/memory.h index 3e2bf8de5..92e158a00 100644 --- a/src/core/libraries/kernel/memory.h +++ b/src/core/libraries/kernel/memory.h @@ -158,6 +158,7 @@ void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func[]); int PS4_SYSV_ABI sceKernelGetDirectMemoryType(u64 addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut); +int PS4_SYSV_ABI sceKernelIsStack(void* addr, void** start, void** end); s32 PS4_SYSV_ABI sceKernelBatchMap(OrbisKernelBatchMapEntry* entries, int numEntries, int* numEntriesOut); diff --git a/src/core/libraries/kernel/threads/event_flag.cpp b/src/core/libraries/kernel/threads/event_flag.cpp index 24ddcb927..91b17bd49 100644 --- a/src/core/libraries/kernel/threads/event_flag.cpp +++ b/src/core/libraries/kernel/threads/event_flag.cpp @@ -315,7 +315,7 @@ int PS4_SYSV_ABI sceKernelPollEventFlag(OrbisKernelEventFlag ef, u64 bitPattern, auto result = ef->Poll(bitPattern, wait, clear, pResultPat); if (result != ORBIS_OK && result != ORBIS_KERNEL_ERROR_EBUSY) { - LOG_ERROR(Kernel_Event, "returned {}", result); + LOG_DEBUG(Kernel_Event, "returned {:#x}", result); } return result; @@ -361,7 +361,7 @@ int PS4_SYSV_ABI sceKernelWaitEventFlag(OrbisKernelEventFlag ef, u64 bitPattern, u32 result = ef->Wait(bitPattern, wait, clear, pResultPat, pTimeout); if (result != ORBIS_OK && result != ORBIS_KERNEL_ERROR_ETIMEDOUT) { - LOG_ERROR(Kernel_Event, "returned {:#x}", result); + LOG_DEBUG(Kernel_Event, "returned {:#x}", result); } return result; diff --git a/src/core/libraries/ngs2/ngs2.cpp b/src/core/libraries/ngs2/ngs2.cpp index 743be5fd6..9bb73536c 100644 --- a/src/core/libraries/ngs2/ngs2.cpp +++ b/src/core/libraries/ngs2/ngs2.cpp @@ -380,8 +380,7 @@ s32 PS4_SYSV_ABI sceNgs2GeomApply(const OrbisNgs2GeomListenerWork* listener, s32 PS4_SYSV_ABI sceNgs2PanInit(OrbisNgs2PanWork* work, const float* aSpeakerAngle, float unitAngle, u32 numSpeakers) { - LOG_ERROR(Lib_Ngs2, "aSpeakerAngle = {}, unitAngle = {}, numSpeakers = {}", *aSpeakerAngle, - unitAngle, numSpeakers); + LOG_ERROR(Lib_Ngs2, "unitAngle = {}, numSpeakers = {}", unitAngle, numSpeakers); return ORBIS_OK; } diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 3217dcab7..df72bc538 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -950,4 +950,33 @@ int MemoryManager::GetDirectMemoryType(PAddr addr, int* directMemoryTypeOut, return ORBIS_OK; } +int MemoryManager::IsStack(VAddr addr, void** start, void** end) { + auto vma_handle = FindVMA(addr); + if (vma_handle == vma_map.end()) { + return ORBIS_KERNEL_ERROR_EINVAL; + } + + const VirtualMemoryArea& vma = vma_handle->second; + if (!vma.Contains(addr, 0) || vma.IsFree()) { + return ORBIS_KERNEL_ERROR_EACCES; + } + + auto stack_start = 0ul; + auto stack_end = 0ul; + if (vma.type == VMAType::Stack) { + stack_start = vma.base; + stack_end = vma.base + vma.size; + } + + if (start != nullptr) { + *start = reinterpret_cast(stack_start); + } + + if (end != nullptr) { + *end = reinterpret_cast(stack_end); + } + + return ORBIS_OK; +} + } // namespace Core diff --git a/src/core/memory.h b/src/core/memory.h index 4920aa397..4c143ff6f 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -223,6 +223,8 @@ public: void InvalidateMemory(VAddr addr, u64 size) const; + int IsStack(VAddr addr, void** start, void** end); + private: VMAHandle FindVMA(VAddr target) { return std::prev(vma_map.upper_bound(target)); diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index 89426e080..376cc304e 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -53,7 +53,7 @@ void Translator::S_LOAD_DWORD(int num_dwords, const GcnInst& inst) { ir.CompositeConstruct(ir.GetScalarReg(sbase), ir.GetScalarReg(sbase + 1)); IR::ScalarReg dst_reg{inst.dst[0].code}; for (u32 i = 0; i < num_dwords; i++) { - ir.SetScalarReg(dst_reg++, ir.ReadConst(base, ir.Imm32(dword_offset + i))); + ir.SetScalarReg(dst_reg + i, ir.ReadConst(base, ir.Imm32(dword_offset + i))); } } @@ -75,7 +75,7 @@ void Translator::S_BUFFER_LOAD_DWORD(int num_dwords, const GcnInst& inst) { IR::ScalarReg dst_reg{inst.dst[0].code}; for (u32 i = 0; i < num_dwords; i++) { const IR::U32 index = ir.IAdd(dword_offset, ir.Imm32(i)); - ir.SetScalarReg(dst_reg++, ir.ReadConstBuffer(vsharp, index)); + ir.SetScalarReg(dst_reg + i, ir.ReadConstBuffer(vsharp, index)); } } diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 3ce86c131..6171cca07 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -989,13 +989,22 @@ void Translator::V_CMP_NE_U64(const GcnInst& inst) { } }; const IR::U1 src0{get_src(inst.src[0])}; - ASSERT(inst.src[1].field == OperandField::ConstZero); // src0 != 0 + auto op = [&inst, this](auto x) { + switch (inst.src[1].field) { + case OperandField::ConstZero: + return x; + case OperandField::SignedConstIntNeg: + return ir.LogicalNot(x); + default: + UNREACHABLE_MSG("unhandled V_CMP_NE_U64 source argument {}", u32(inst.src[1].field)); + } + }; switch (inst.dst[1].field) { case OperandField::VccLo: - ir.SetVcc(src0); + ir.SetVcc(op(src0)); break; case OperandField::ScalarGPR: - ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), src0); + ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), op(src0)); break; default: UNREACHABLE(); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 598288085..d1cd98634 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -584,7 +584,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + const auto* event = reinterpret_cast(header); + LOG_DEBUG(Render_Vulkan, + "Encountered EventWrite: event_type = {}, event_index = {}", + magic_enum::enum_name(event->event_type.Value()), + magic_enum::enum_name(event->event_index.Value())); + if (event->event_type.Value() == EventType::SoVgtStreamoutFlush) { + // TODO: handle proper synchronization, for now signal that update is done + // immediately + regs.cp_strmout_cntl.offset_update_done = 1; + } break; } case PM4ItOpcode::EventWriteEos: { @@ -696,10 +705,10 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanAddress(); if (vo_port->IsVoLabel(wait_addr) && num_submits == mapped_queues[GfxQueueId].submits.size()) { - vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); }); + vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(regs.reg_array); }); break; } - while (!wait_reg_mem->Test()) { + while (!wait_reg_mem->Test(regs.reg_array)) { YIELD_GFX(); } break; @@ -732,6 +741,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + LOG_WARNING(Render_Vulkan, + "Unimplemented IT_STRMOUT_BUFFER_UPDATE, update_memory = {}, " + "source_select = {}, buffer_select = {}", + strmout->update_memory.Value(), + magic_enum::enum_name(strmout->source_select.Value()), + strmout->buffer_select.Value()); + break; + } default: UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}", static_cast(opcode), count); @@ -866,8 +885,9 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq } case PM4ItOpcode::SetQueueReg: { const auto* set_data = reinterpret_cast(header); - UNREACHABLE_MSG("Encountered compute SetQueueReg: vqid = {}, reg_offset = {:#x}", - set_data->vqid.Value(), set_data->reg_offset.Value()); + LOG_WARNING(Render, "Encountered compute SetQueueReg: vqid = {}, reg_offset = {:#x}", + set_data->vqid.Value(), set_data->reg_offset.Value()); + break; } case PM4ItOpcode::DispatchDirect: { const auto* dispatch_direct = reinterpret_cast(header); @@ -934,7 +954,7 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq case PM4ItOpcode::WaitRegMem: { const auto* wait_reg_mem = reinterpret_cast(header); ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); - while (!wait_reg_mem->Test()) { + while (!wait_reg_mem->Test(regs.reg_array)) { YIELD_ASC(vqid); } break; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index c4bebd05f..a62141099 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -1175,6 +1175,14 @@ struct Liverpool { BitField<22, 2, u32> onchip; }; + union StreamOutControl { + u32 raw; + struct { + u32 offset_update_done : 1; + u32 : 31; + }; + }; + union StreamOutConfig { u32 raw; struct { @@ -1378,7 +1386,9 @@ struct Liverpool { AaConfig aa_config; INSERT_PADDING_WORDS(0xA318 - 0xA2F8 - 1); ColorBuffer color_buffers[NumColorBuffers]; - INSERT_PADDING_WORDS(0xC242 - 0xA390); + INSERT_PADDING_WORDS(0xC03F - 0xA390); + StreamOutControl cp_strmout_cntl; + INSERT_PADDING_WORDS(0xC242 - 0xC040); PrimitiveType primitive_type; INSERT_PADDING_WORDS(0xC24C - 0xC243); u32 num_indices; @@ -1668,6 +1678,7 @@ static_assert(GFX6_3D_REG_INDEX(color_buffers[0].base_address) == 0xA318); static_assert(GFX6_3D_REG_INDEX(color_buffers[0].pitch) == 0xA319); static_assert(GFX6_3D_REG_INDEX(color_buffers[0].slice) == 0xA31A); static_assert(GFX6_3D_REG_INDEX(color_buffers[7].base_address) == 0xA381); +static_assert(GFX6_3D_REG_INDEX(cp_strmout_cntl) == 0xC03F); static_assert(GFX6_3D_REG_INDEX(primitive_type) == 0xC242); static_assert(GFX6_3D_REG_INDEX(num_instances) == 0xC24D); static_assert(GFX6_3D_REG_INDEX(vgt_tf_memory_base) == 0xc250); diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index cd175f6c9..58ecda93e 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -246,6 +246,46 @@ struct PM4CmdNop { }; }; +enum class SourceSelect : u32 { + BufferOffset = 0, + VgtStrmoutBufferFilledSize = 1, + SrcAddress = 2, + None = 3, +}; + +struct PM4CmdStrmoutBufferUpdate { + PM4Type3Header header; + union { + BitField<0, 1, u32> update_memory; + BitField<1, 2, SourceSelect> source_select; + BitField<8, 2, u32> buffer_select; + u32 control; + }; + union { + BitField<2, 30, u32> dst_address_lo; + BitField<0, 2, u32> swap_dst; + }; + u32 dst_address_hi; + union { + u32 buffer_offset; + BitField<2, 30, u32> src_address_lo; + BitField<0, 2, u32> swap_src; + }; + u32 src_address_hi; + + template + T DstAddress() const { + ASSERT(update_memory.Value() == 1); + return reinterpret_cast(dst_address_lo.Value() | u64(dst_address_hi & 0xFFFF) << 32); + } + + template + T SrcAddress() const { + ASSERT(source_select.Value() == SourceSelect::SrcAddress); + return reinterpret_cast(src_address_lo.Value() | u64(src_address_hi & 0xFFFF) << 32); + } +}; + struct PM4CmdDrawIndexOffset2 { PM4Type3Header header; u32 max_size; ///< Maximum number of indices @@ -303,6 +343,80 @@ static u64 GetGpuClock64() { return static_cast(ticks); } +// VGT_EVENT_INITIATOR.EVENT_TYPE +enum class EventType : u32 { + SampleStreamoutStats1 = 1, + SampleStreamoutStats2 = 2, + SampleStreamoutStats3 = 3, + CacheFlushTs = 4, + ContextDone = 5, + CacheFlush = 6, + CsPartialFlush = 7, + VgtStreamoutSync = 8, + VgtStreamoutReset = 10, + EndOfPipeIncrDe = 11, + EndOfPipeIbEnd = 12, + RstPixCnt = 13, + VsPartialFlush = 15, + PsPartialFlush = 16, + FlushHsOutput = 17, + FlushLsOutput = 18, + CacheFlushAndInvTsEvent = 20, + ZpassDone = 21, + CacheFlushAndInvEvent = 22, + PerfcounterStart = 23, + PerfcounterStop = 24, + PipelineStatStart = 25, + PipelineStatStop = 26, + PerfcounterSample = 27, + FlushEsOutput = 28, + FlushGsOutput = 29, + SamplePipelineStat = 30, + SoVgtStreamoutFlush = 31, + SampleStreamoutStats = 32, + ResetVtxCnt = 33, + VgtFlush = 36, + ScSendDbVpz = 39, + BottomOfPipeTs = 40, + DbCacheFlushAndInv = 42, + FlushAndInvDbDataTs = 43, + FlushAndInvDbMeta = 44, + FlushAndInvCbDataTs = 45, + FlushAndInvCbMeta = 46, + CsDone = 47, + PsDone = 48, + FlushAndInvCbPixelData = 49, + ThreadTraceStart = 51, + ThreadTraceStop = 52, + ThreadTraceFlush = 54, + ThreadTraceFinish = 55, + PixelPipeStatControl = 56, + PixelPipeStatDump = 57, + PixelPipeStatReset = 58, +}; + +enum class EventIndex : u32 { + Other = 0, + ZpassDone = 1, + SamplePipelineStat = 2, + SampleStreamoutStatSx = 3, + CsVsPsPartialFlush = 4, + EopReserved = 5, + EosReserved = 6, + CacheFlush = 7, +}; + +struct PM4CmdEventWrite { + PM4Type3Header header; + union { + u32 event_control; + BitField<0, 6, EventType> event_type; ///< Event type written to VGT_EVENT_INITIATOR + BitField<8, 4, EventIndex> event_index; ///< Event index + BitField<20, 1, u32> inv_l2; ///< Send WBINVL2 op to the TC L2 cache when EVENT_INDEX = 0111 + }; + u32 address[]; +}; + struct PM4CmdEventWriteEop { PM4Type3Header header; union { @@ -474,7 +588,12 @@ struct PM4CmdWaitRegMem { BitField<8, 1, Engine> engine; u32 raw; }; - u32 poll_addr_lo; + union { + BitField<0, 16, u32> reg; + BitField<2, 30, u32> poll_addr_lo; + BitField<0, 2, u32> swap; + u32 poll_addr_lo_raw; + }; u32 poll_addr_hi; u32 ref; u32 mask; @@ -482,31 +601,36 @@ struct PM4CmdWaitRegMem { template T Address() const { - return std::bit_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); + return std::bit_cast((uintptr_t(poll_addr_hi) << 32) | (poll_addr_lo << 2)); } - bool Test() const { + u32 Reg() const { + return reg.Value(); + } + + bool Test(const std::array& regs) const { + u32 value = mem_space.Value() == MemSpace::Memory ? *Address() : regs[Reg()]; switch (function.Value()) { case Function::Always: { return true; } case Function::LessThan: { - return (*Address() & mask) < ref; + return (value & mask) < ref; } case Function::LessThanEqual: { - return (*Address() & mask) <= ref; + return (value & mask) <= ref; } case Function::Equal: { - return (*Address() & mask) == ref; + return (value & mask) == ref; } case Function::NotEqual: { - return (*Address() & mask) != ref; + return (value & mask) != ref; } case Function::GreaterThanEqual: { - return (*Address() & mask) >= ref; + return (value & mask) >= ref; } case Function::GreaterThan: { - return (*Address() & mask) > ref; + return (value & mask) > ref; } case Function::Reserved: [[fallthrough]];