diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index e85b8b890..583339dd9 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -296,17 +296,12 @@ static_assert(CtxInitSequence400.size() == 0x61); // In case if `submitDone` is issued we need to block submissions until GPU idle static u32 submission_lock{}; std::condition_variable cv_lock{}; -static std::mutex m_submission{}; +std::mutex m_submission{}; static u64 frames_submitted{}; // frame counter static bool send_init_packet{true}; // initialize HW state before first game's submit in a frame static int sdk_version{0}; -struct AscQueueInfo { - VAddr map_addr; - u32* read_addr; - u32 ring_size_dw; -}; -static Common::SlotVector asc_queues{}; +static u32 asc_next_offs_dw[Liverpool::NumComputeRings]; static constexpr VAddr tessellation_factors_ring_addr = Core::SYSTEM_RESERVED_MAX - 0xFFFFFFF; static constexpr u32 tessellation_offchip_buffer_size = 0x800000u; @@ -506,11 +501,19 @@ void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) { } auto vqid = gnm_vqid - 1; - auto& asc_queue = asc_queues[{vqid}]; - const auto* acb_ptr = reinterpret_cast(asc_queue.map_addr + *asc_queue.read_addr); - const auto acb_size = next_offs_dw ? (next_offs_dw << 2u) - *asc_queue.read_addr - : (asc_queue.ring_size_dw << 2u) - *asc_queue.read_addr; - const std::span acb_span{acb_ptr, acb_size >> 2u}; + auto& asc_queue = liverpool->asc_queues[{vqid}]; + + const auto& offs_dw = asc_next_offs_dw[vqid]; + + if (next_offs_dw < offs_dw) { + ASSERT_MSG(next_offs_dw == 0, "ACB submission is split at the end of ring buffer"); + } + + const auto* acb_ptr = reinterpret_cast(asc_queue.map_addr) + offs_dw; + const auto acb_size_dw = (next_offs_dw ? next_offs_dw : asc_queue.ring_size_dw) - offs_dw; + const std::span acb_span{acb_ptr, acb_size_dw}; + + asc_next_offs_dw[vqid] = next_offs_dw; if (DebugState.DumpingCurrentFrame()) { static auto last_frame_num = -1LL; @@ -545,9 +548,6 @@ void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) { }); } liverpool->SubmitAsc(gnm_vqid, acb_span); - - *asc_queue.read_addr += acb_size; - *asc_queue.read_addr %= asc_queue.ring_size_dw * 4; } void PS4_SYSV_ABI sceGnmDingDongForWorkload(u32 gnm_vqid, u32 next_offs_dw, u64 workload_id) { @@ -1266,12 +1266,16 @@ int PS4_SYSV_ABI sceGnmMapComputeQueue(u32 pipe_id, u32 queue_id, VAddr ring_bas return ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_READ_PTR_ADDR; } - auto vqid = asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw); + const auto vqid = + liverpool->asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw, pipe_id); // We need to offset index as `dingDong` assumes it to be from the range [1..64] const auto gnm_vqid = vqid.index + 1; LOG_INFO(Lib_GnmDriver, "ASC pipe {} queue {} mapped to vqueue {}", pipe_id, queue_id, gnm_vqid); + const auto& queue = liverpool->asc_queues[vqid]; + *queue.read_addr = 0u; + return gnm_vqid; } diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index b4e2f36a8..2d7aa4f3f 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -29,11 +29,11 @@ static_assert(Liverpool::NumComputeRings <= MAX_NAMES); static const char* acb_task_name[] = NAME_ARRAY(ACB_TASK, MAX_NAMES); -#define YIELD_CE(name) \ +#define YIELD_CE() \ mapped_queues[GfxQueueId].cs_state = regs.cs_program; \ FIBER_EXIT; \ co_yield {}; \ - FIBER_ENTER(name); \ + FIBER_ENTER(ccb_task_name); \ regs.cs_program = mapped_queues[GfxQueueId].cs_state #define YIELD_GFX \ @@ -44,11 +44,11 @@ static const char* acb_task_name[] = NAME_ARRAY(ACB_TASK, MAX_NAMES); regs.cs_program = mapped_queues[GfxQueueId].cs_state; #define YIELD_ASC(id) \ - mapped_queues[id].cs_state = regs.cs_program; \ + mapped_queues[id + 1].cs_state = regs.cs_program; \ FIBER_EXIT; \ co_yield {}; \ FIBER_ENTER(acb_task_name[id]); \ - regs.cs_program = mapped_queues[id].cs_state; + regs.cs_program = mapped_queues[id + 1].cs_state; #define RESUME(task, name) \ FIBER_EXIT; \ @@ -114,7 +114,7 @@ void Liverpool::Process(std::stop_token stoken) { --num_commands; } - qid = (qid + 1) % NumTotalQueues; + qid = (qid + 1) % num_mapped_queues; auto& queue = mapped_queues[qid]; @@ -190,7 +190,7 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { case PM4ItOpcode::WaitOnDeCounterDiff: { const auto diff = it_body[0]; while ((cblock.de_count - cblock.ce_count) >= diff) { - YIELD_CE(ccb_task_name); + YIELD_CE(); } break; } @@ -198,10 +198,9 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { const auto* indirect_buffer = reinterpret_cast(header); auto task = ProcessCeUpdate({indirect_buffer->Address(), indirect_buffer->ib_size}); - RESUME(task, ccb_task_name); while (!task.handle.done()) { - YIELD_CE(ccb_task_name); + YIELD_CE(); RESUME(task, ccb_task_name); } break; @@ -229,7 +228,6 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(dcb.data()); @@ -708,8 +706,10 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span acb, int vqid) { +template +Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { FIBER_ENTER(acb_task_name[vqid]); + const auto& queue = asc_queues[{vqid}]; auto base_addr = reinterpret_cast(acb.data()); while (!acb.empty()) { @@ -730,7 +730,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { } case PM4ItOpcode::IndirectBuffer: { const auto* indirect_buffer = reinterpret_cast(header); - auto task = ProcessCompute( + auto task = ProcessCompute( {indirect_buffer->Address(), indirect_buffer->ib_size}, vqid); RESUME(task, acb_task_name[vqid]); @@ -823,7 +823,7 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { } case PM4ItOpcode::ReleaseMem: { const auto* release_mem = reinterpret_cast(header); - release_mem->SignalFence(Platform::InterruptId::Compute0RelMem); // <--- + release_mem->SignalFence(static_cast(queue.pipe_id)); break; } default: @@ -831,7 +831,13 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { static_cast(opcode), count); } - acb = NextPacket(acb, header->type3.NumWords() + 1); + const auto packet_size_dw = header->type3.NumWords() + 1; + acb = NextPacket(acb, packet_size_dw); + + if constexpr (!is_indirect) { + *queue.read_addr += packet_size_dw; + *queue.read_addr %= queue.ring_size_dw; + } } FIBER_EXIT; @@ -895,13 +901,15 @@ void Liverpool::SubmitAsc(u32 gnm_vqid, std::span acb) { ASSERT_MSG(gnm_vqid > 0 && gnm_vqid < NumTotalQueues, "Invalid virtual ASC queue index"); auto& queue = mapped_queues[gnm_vqid]; - const auto& task = ProcessCompute(acb, gnm_vqid); + const auto vqid = gnm_vqid - 1; + const auto& task = ProcessCompute(acb, vqid); { std::scoped_lock lock{queue.m_access}; queue.submits.emplace(task.handle); } std::scoped_lock lk{submit_mutex}; + num_mapped_queues = std::max(num_mapped_queues, gnm_vqid + 1); ++num_submits; submit_cv.notify_one(); } diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index f7eba2ba7..63849c986 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -16,6 +16,7 @@ #include "common/assert.h" #include "common/bit_field.h" #include "common/polyfill_thread.h" +#include "common/slot_vector.h" #include "common/types.h" #include "common/unique_function.h" #include "shader_recompiler/params.h" @@ -1342,6 +1343,14 @@ public: gfx_queue.dcb_buffer.reserve(GfxReservedSize); } + struct AscQueueInfo { + VAddr map_addr; + u32* read_addr; + u32 ring_size_dw; + u32 pipe_id; + }; + Common::SlotVector asc_queues{}; + private: struct Task { struct promise_type { @@ -1379,7 +1388,8 @@ private: std::span ccb); Task ProcessGraphics(std::span dcb, std::span ccb); Task ProcessCeUpdate(std::span ccb); - Task ProcessCompute(std::span acb, int vqid); + template + Task ProcessCompute(std::span acb, u32 vqid); void Process(std::stop_token stoken); @@ -1394,6 +1404,7 @@ private: VAddr indirect_args_addr{}; }; std::array mapped_queues{}; + u32 num_mapped_queues{1u}; // GFX is always available struct ConstantEngine { void Reset() {