From 2e4f32a8407e8062bed841dd6ce769d5fd087d6d Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Sat, 13 Jul 2024 17:46:56 +0300 Subject: [PATCH] liverpool: Better sync for CPU flips --- src/core/libraries/gnmdriver/gnmdriver.cpp | 3 +- src/core/libraries/libs.cpp | 2 +- src/core/libraries/videoout/driver.cpp | 20 ++-- src/core/libraries/videoout/driver.h | 18 ++++ src/core/libraries/videoout/video_out.cpp | 10 +- src/video_core/amdgpu/liverpool.cpp | 22 ++++- src/video_core/amdgpu/liverpool.h | 10 ++ src/video_core/amdgpu/pm4_cmds.h | 14 +-- .../renderer_vulkan/renderer_vulkan.cpp | 98 +++++++------------ .../renderer_vulkan/renderer_vulkan.h | 25 +++-- .../renderer_vulkan/vk_master_semaphore.cpp | 44 --------- .../renderer_vulkan/vk_master_semaphore.h | 4 - .../renderer_vulkan/vk_rasterizer.cpp | 7 +- .../renderer_vulkan/vk_rasterizer.h | 2 +- .../renderer_vulkan/vk_scheduler.cpp | 54 ++++++++-- src/video_core/renderer_vulkan/vk_scheduler.h | 35 ++++++- 16 files changed, 206 insertions(+), 162 deletions(-) diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 39fcc8125..dba69d6eb 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -20,13 +20,12 @@ extern Frontend::WindowSDL* g_window; std::unique_ptr renderer; +std::unique_ptr liverpool; namespace Libraries::GnmDriver { using namespace AmdGpu; -static std::unique_ptr liverpool; - enum GnmEventIdents : u64 { Compute0RelMem = 0x00, Compute1RelMem = 0x01, diff --git a/src/core/libraries/libs.cpp b/src/core/libraries/libs.cpp index f9325297c..47073b2c6 100644 --- a/src/core/libraries/libs.cpp +++ b/src/core/libraries/libs.cpp @@ -43,8 +43,8 @@ namespace Libraries { void InitHLELibs(Core::Loader::SymbolsResolver* sym) { LOG_INFO(Lib_Kernel, "Initializing HLE libraries"); Libraries::Kernel::LibKernel_Register(sym); - Libraries::VideoOut::RegisterLib(sym); Libraries::GnmDriver::RegisterlibSceGnmDriver(sym); + Libraries::VideoOut::RegisterLib(sym); if (!Config::isLleLibc()) { Libraries::LibC::libcSymbolsRegister(sym); } diff --git a/src/core/libraries/videoout/driver.cpp b/src/core/libraries/videoout/driver.cpp index bc4031036..1c2b747a0 100644 --- a/src/core/libraries/videoout/driver.cpp +++ b/src/core/libraries/videoout/driver.cpp @@ -7,11 +7,10 @@ #include "core/libraries/error_codes.h" #include "core/libraries/kernel/time_management.h" #include "core/libraries/videoout/driver.h" -#include "core/platform.h" - #include "video_core/renderer_vulkan/renderer_vulkan.h" extern std::unique_ptr renderer; +extern std::unique_ptr liverpool; namespace Libraries::VideoOut { @@ -48,15 +47,12 @@ VideoOutDriver::VideoOutDriver(u32 width, u32 height) { VideoOutDriver::~VideoOutDriver() = default; int VideoOutDriver::Open(const ServiceThreadParams* params) { - std::scoped_lock lock{mutex}; - if (main_port.is_open) { return ORBIS_VIDEO_OUT_ERROR_RESOURCE_BUSY; } - - int handle = 1; main_port.is_open = true; - return handle; + liverpool->SetVoPort(&main_port); + return 1; } void VideoOutDriver::Close(s32 handle) { @@ -164,7 +160,8 @@ void VideoOutDriver::Flip(const Request& req) { std::scoped_lock lock{mutex}; // Update flip status. - auto& flip_status = req.port->flip_status; + auto* port = req.port; + auto& flip_status = port->flip_status; flip_status.count++; flip_status.processTime = Libraries::Kernel::sceKernelGetProcessTime(); flip_status.tsc = Libraries::Kernel::sceKernelReadTsc(); @@ -174,7 +171,7 @@ void VideoOutDriver::Flip(const Request& req) { flip_status.flipPendingNum = static_cast(requests.size()); // Trigger flip events for the port. - for (auto& event : req.port->flip_events) { + for (auto& event : port->flip_events) { if (event != nullptr) { event->TriggerEvent(SCE_VIDEO_OUT_EVENT_FLIP, Kernel::SceKernelEvent::Filter::VideoOut, reinterpret_cast(req.flip_arg)); @@ -183,7 +180,8 @@ void VideoOutDriver::Flip(const Request& req) { // Reset flip label if (req.index != -1) { - req.port->buffer_labels[req.index] = 0; + port->buffer_labels[req.index] = 0; + port->SignalVoLabel(); } } @@ -197,7 +195,7 @@ bool VideoOutDriver::SubmitFlip(VideoOutPort* port, s32 index, s64 flip_arg, } else { const auto& buffer = port->buffer_slots[index]; const auto& group = port->groups[buffer.group_index]; - frame = renderer->PrepareFrame(group, buffer.address_left); + frame = renderer->PrepareFrame(group, buffer.address_left, is_eop); } if (index != -1 && requests.size() >= port->NumRegisteredBuffers()) { diff --git a/src/core/libraries/videoout/driver.h b/src/core/libraries/videoout/driver.h index 72f2be153..07bed31f6 100644 --- a/src/core/libraries/videoout/driver.h +++ b/src/core/libraries/videoout/driver.h @@ -26,6 +26,8 @@ struct VideoOutPort { SceVideoOutVblankStatus vblank_status; std::vector flip_events; std::vector vblank_events; + std::mutex vo_mutex; + std::condition_variable vo_cv; int flip_rate = 0; s32 FindFreeGroup() const { @@ -36,6 +38,22 @@ struct VideoOutPort { return index; } + bool IsVoLabel(const u64* address) const { + const u64* start = &buffer_labels[0]; + const u64* end = &buffer_labels[MaxDisplayBuffers - 1]; + return address >= start && address <= end; + } + + void WaitVoLabel(auto&& pred) { + std::unique_lock lk{vo_mutex}; + vo_cv.wait(lk, pred); + } + + void SignalVoLabel() { + std::scoped_lock lk{vo_mutex}; + vo_cv.notify_one(); + } + [[nodiscard]] int NumRegisteredBuffers() const { return std::count_if(buffer_slots.cbegin(), buffer_slots.cend(), [](auto& buffer) { return buffer.group_index != -1; }); diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index 2f552367d..f109c9b2a 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -141,12 +141,10 @@ s32 PS4_SYSV_ABI sceVideoOutSubmitFlip(s32 handle, s32 bufferIndex, s32 flipMode LOG_INFO(Lib_VideoOut, "bufferIndex = {}, flipMode = {}, flipArg = {}", bufferIndex, flipMode, flipArg); - // Next time the gpu enters idle state, submit the flip - Platform::IrqC::Instance()->RegisterOnce( - Platform::InterruptId::GpuIdle, [=](Platform::InterruptId irq) { - const auto result = driver->SubmitFlip(port, bufferIndex, flipArg); - ASSERT_MSG(result, "Flip submission failed"); - }); + if (!driver->SubmitFlip(port, bufferIndex, flipArg)) { + LOG_ERROR(Lib_VideoOut, "Flip queue is full"); + return ORBIS_VIDEO_OUT_ERROR_FLIP_QUEUE_FULL; + } return ORBIS_OK; } diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 1801b7d3d..56a2c63e7 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -5,6 +5,7 @@ #include "common/debug.h" #include "common/polyfill_thread.h" #include "common/thread.h" +#include "core/libraries/videoout/driver.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/amdgpu/pm4_cmds.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" @@ -361,6 +362,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + // Guest can wait for GfxEop event to submit CPU flips. + // Flush command list to ensure order. + if (rasterizer && event_eop->int_sel == InterruptSelect::IrqWhenWriteConfirm) { + rasterizer->Flush(); + } event_eop->SignalFence(); break; } @@ -372,8 +378,14 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5); const u32 data_size = (header->type3.count.Value() - 2) * 4; + u64* address = write_data->Address(); if (!write_data->wr_one_addr.Value()) { - std::memcpy(write_data->Address(), write_data->data, data_size); + // Guest can poll VO label before submitting CPU flips. + // Flush command list before signalling to ensure order. + if (rasterizer && vo_port->IsVoLabel(address)) { + rasterizer->Flush(); + } + std::memcpy(address, write_data->data, data_size); } else { UNREACHABLE(); } @@ -386,6 +398,14 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); + // Optimization: VO label waits are special because the emulator + // will write to the label when presentation is finished. So if + // there are no other submits to yield to we can sleep the thread + // instead and allow other tasks to run. + const u64* wait_addr = wait_reg_mem->Address(); + if (vo_port->IsVoLabel(wait_addr) && num_submits == 1) { + vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); }); + } while (!wait_reg_mem->Test()) { TracyFiberLeave; co_yield {}; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index d834a6f43..6923562ec 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -10,6 +10,7 @@ #include #include #include +#include #include "common/assert.h" #include "common/bit_field.h" #include "common/polyfill_thread.h" @@ -21,6 +22,10 @@ namespace Vulkan { class Rasterizer; } +namespace Libraries::VideoOut { +struct VideoOutPort; +} + namespace AmdGpu { #define GFX6_3D_REG_INDEX(field_name) (offsetof(AmdGpu::Liverpool::Regs, field_name) / sizeof(u32)) @@ -1001,6 +1006,10 @@ public: return num_submits == 0; } + void SetVoPort(Libraries::VideoOut::VideoOutPort* port) { + vo_port = port; + } + void BindRasterizer(Vulkan::Rasterizer* rasterizer_) { rasterizer = rasterizer_; } @@ -1065,6 +1074,7 @@ private: } cblock{}; Vulkan::Rasterizer* rasterizer{}; + Libraries::VideoOut::VideoOutPort* vo_port{}; std::jthread process_thread{}; std::atomic num_submits{}; std::atomic submit_done{}; diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index eded2de37..9b44da024 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -404,8 +404,9 @@ struct PM4CmdWaitRegMem { u32 mask; u32 poll_interval; - u32* Address() const { - return reinterpret_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); + template + T Address() const { + return reinterpret_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); } bool Test() const { @@ -464,8 +465,8 @@ struct PM4CmdWriteData { } template - T* Address() const { - return reinterpret_cast(addr64); + T Address() const { + return reinterpret_cast(addr64); } }; @@ -494,8 +495,9 @@ struct PM4CmdEventWriteEos { BitField<16, 16, u32> size; ///< Number of DWs to read from the GDS }; - u32* Address() const { - return reinterpret_cast(address_lo | u64(address_hi) << 32); + template + T Address() const { + return reinterpret_cast(address_lo | u64(address_hi) << 32); } u32 DataDWord() const { diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 098f14d9b..f28bfffc4 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -63,44 +63,30 @@ bool CanBlitToSwapchain(const vk::PhysicalDevice physical_device, vk::Format for }; } -RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool) - : window{window_}, instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, - scheduler{instance}, swapchain{instance, window}, texture_cache{instance, scheduler} { - rasterizer = std::make_unique(instance, scheduler, texture_cache, liverpool); +RendererVulkan::RendererVulkan(Frontend::WindowSDL& window_, AmdGpu::Liverpool* liverpool_) + : window{window_}, liverpool{liverpool_}, instance{window, Config::getGpuId(), Config::vkValidationEnabled()}, + schedulers{Scheduler{instance}, Scheduler{instance}, Scheduler{instance}}, + swapchain{instance, window}, texture_cache{instance, schedulers[SchedulerType::Draw]} { + rasterizer = std::make_unique(instance, schedulers[SchedulerType::Draw], + texture_cache, liverpool); const u32 num_images = swapchain.GetImageCount(); const vk::Device device = instance.GetDevice(); - const vk::CommandPoolCreateInfo pool_info = { - .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer | - vk::CommandPoolCreateFlagBits::eTransient, - .queueFamilyIndex = instance.GetGraphicsQueueFamilyIndex(), - }; - command_pool = device.createCommandPoolUnique(pool_info); - - const vk::CommandBufferAllocateInfo alloc_info = { - .commandPool = *command_pool, - .level = vk::CommandBufferLevel::ePrimary, - .commandBufferCount = num_images, - }; - - const auto cmdbuffers = device.allocateCommandBuffers(alloc_info); + // Create presentation frames. present_frames.resize(num_images); for (u32 i = 0; i < num_images; i++) { Frame& frame = present_frames[i]; - frame.cmdbuf = cmdbuffers[i]; - frame.render_ready = device.createSemaphore({}); frame.present_done = device.createFence({.flags = vk::FenceCreateFlagBits::eSignaled}); free_queue.push(&frame); } } RendererVulkan::~RendererVulkan() { - scheduler.Finish(); + schedulers[SchedulerType::Draw].Finish(); const vk::Device device = instance.GetDevice(); for (auto& frame : present_frames) { vmaDestroyImage(instance.GetAllocator(), frame.image, frame.allocation); device.destroyImageView(frame.image_view); - device.destroySemaphore(frame.render_ready); device.destroyFence(frame.present_done); } } @@ -184,7 +170,7 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) { info.pitch = splash->GetImageInfo().width; info.guest_address = VAddr(splash->GetImageData().data()); info.guest_size_bytes = splash->GetImageData().size(); - splash_img.emplace(instance, scheduler, info); + splash_img.emplace(instance, schedulers[SchedulerType::Present], info); texture_cache.RefreshImage(*splash_img); } frame = PrepareFrameInternal(*splash_img); @@ -193,12 +179,18 @@ bool RendererVulkan::ShowSplash(Frame* frame /*= nullptr*/) { return true; } -Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) { +Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image, bool is_eop) { // Request a free presentation frame. Frame* frame = GetRenderFrame(); - // Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image. - image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead); + // EOP flips are triggered from GPU thread to use the drawing scheduler to record + // commands. Otherwise we are dealing with a CPU flip which could have arrived + // from any guest thread. Use a separate scheduler for that. + auto& scheduler = schedulers[is_eop ? SchedulerType::Draw : SchedulerType::CpuFlip]; + const auto cmdbuf = scheduler.CommandBuffer(); + + image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead, + cmdbuf); const std::array pre_barrier{ vk::ImageMemoryBarrier{ @@ -218,12 +210,11 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) { }, }, }; - - const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion, {}, {}, pre_barrier); + // Post-processing (Anti-aliasing, FSR etc) goes here. For now just blit to the frame image. cmdbuf.blitImage( image.image, image.layout, frame->image, vk::ImageLayout::eTransferDstOptimal, MakeImageBlit(image.info.size.width, image.info.size.height, frame->width, frame->height), @@ -245,13 +236,15 @@ Frame* RendererVulkan::PrepareFrameInternal(VideoCore::Image& image) { .layerCount = VK_REMAINING_ARRAY_LAYERS, }, }; - cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, vk::DependencyFlagBits::eByRegion, {}, {}, post_barrier); - // Flush pending vulkan operations. - scheduler.Flush(frame->render_ready); + // Flush frame creation commands. + frame->ready_semaphore = scheduler.GetMasterSemaphore()->Handle(); + frame->ready_tick = scheduler.CurrentTick(); + SubmitInfo info{}; + scheduler.Flush(info); return frame; } @@ -260,11 +253,8 @@ void RendererVulkan::Present(Frame* frame) { const vk::Image swapchain_image = swapchain.Image(); - const vk::CommandBufferBeginInfo begin_info = { - .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, - }; - const vk::CommandBuffer cmdbuf = frame->cmdbuf; - cmdbuf.begin(begin_info); + auto& scheduler = schedulers[SchedulerType::Present]; + const auto cmdbuf = scheduler.CommandBuffer(); { auto* profiler_ctx = instance.GetProfilerContext(); TracyVkNamedZoneC(profiler_ctx, renderer_gpu_zone, cmdbuf, "Host frame", @@ -339,35 +329,17 @@ void RendererVulkan::Present(Frame* frame) { TracyVkCollect(profiler_ctx, cmdbuf); } } - cmdbuf.end(); - static constexpr std::array wait_stage_masks = { - vk::PipelineStageFlagBits::eColorAttachmentOutput, - vk::PipelineStageFlagBits::eAllGraphics, - }; - - const vk::Semaphore present_ready = swapchain.GetPresentReadySemaphore(); - const vk::Semaphore image_acquired = swapchain.GetImageAcquiredSemaphore(); - const std::array wait_semaphores = {image_acquired, frame->render_ready}; - - vk::SubmitInfo submit_info = { - .waitSemaphoreCount = static_cast(wait_semaphores.size()), - .pWaitSemaphores = wait_semaphores.data(), - .pWaitDstStageMask = wait_stage_masks.data(), - .commandBufferCount = 1u, - .pCommandBuffers = &cmdbuf, - .signalSemaphoreCount = 1, - .pSignalSemaphores = &present_ready, - }; - - std::scoped_lock submit_lock{scheduler.submit_mutex}; - try { - instance.GetGraphicsQueue().submit(submit_info, frame->present_done); - } catch (vk::DeviceLostError& err) { - LOG_CRITICAL(Render_Vulkan, "Device lost during present submit: {}", err.what()); - UNREACHABLE(); - } + // Flush vulkan commands. + SubmitInfo info{}; + info.AddWait(swapchain.GetImageAcquiredSemaphore()); + info.AddWait(frame->ready_semaphore, frame->ready_tick); + info.AddSignal(swapchain.GetPresentReadySemaphore()); + info.AddSignal(frame->present_done); + scheduler.Flush(info); + // Present to swapchain. + std::scoped_lock submit_lock{Scheduler::submit_mutex}; swapchain.Present(); // Free the frame for reuse diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 701d3d14b..0bff8e56e 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -26,9 +26,15 @@ struct Frame { VmaAllocation allocation; vk::Image image; vk::ImageView image_view; - vk::Semaphore render_ready; vk::Fence present_done; - vk::CommandBuffer cmdbuf; + vk::Semaphore ready_semaphore; + u64 ready_tick; +}; + +enum SchedulerType { + Draw, + Present, + CpuFlip, }; class Rasterizer; @@ -39,16 +45,16 @@ public: ~RendererVulkan(); Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute, - VAddr cpu_address) { + VAddr cpu_address, bool is_eop) { const auto info = VideoCore::ImageInfo{attribute, cpu_address}; const auto image_id = texture_cache.FindImage(info, cpu_address); auto& image = texture_cache.GetImage(image_id); - return PrepareFrameInternal(image); + return PrepareFrameInternal(image, is_eop); } Frame* PrepareBlankFrame() { auto& image = texture_cache.GetImage(VideoCore::NULL_IMAGE_ID); - return PrepareFrameInternal(image); + return PrepareFrameInternal(image, true); } VideoCore::Image& RegisterVideoOutSurface( @@ -60,9 +66,9 @@ public: } bool IsVideoOutSurface(const AmdGpu::Liverpool::ColorBuffer& color_buffer) { - return std::find_if(vo_buffers_addr.cbegin(), vo_buffers_addr.cend(), [&](VAddr vo_buffer) { + return std::ranges::find_if(vo_buffers_addr, [&](VAddr vo_buffer) { return vo_buffer == color_buffer.Address(); - }) != vo_buffers_addr.cend(); + }) != vo_buffers_addr.end(); } bool ShowSplash(Frame* frame = nullptr); @@ -70,13 +76,14 @@ public: void RecreateFrame(Frame* frame, u32 width, u32 height); private: - Frame* PrepareFrameInternal(VideoCore::Image& image); + Frame* PrepareFrameInternal(VideoCore::Image& image, bool is_eop = true); Frame* GetRenderFrame(); private: Frontend::WindowSDL& window; + AmdGpu::Liverpool* liverpool; Instance instance; - Scheduler scheduler; + std::array schedulers; Swapchain swapchain; std::unique_ptr rasterizer; VideoCore::TextureCache texture_cache; diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index 037510d41..753f2bbdf 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -2,8 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include -#include "common/assert.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" @@ -60,46 +58,4 @@ void MasterSemaphore::Wait(u64 tick) { Refresh(); } -void MasterSemaphore::SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal, - u64 signal_value) { - cmdbuf.end(); - - const u32 num_signal_semaphores = signal ? 2U : 1U; - const std::array signal_values{signal_value, u64(0)}; - const std::array signal_semaphores{Handle(), signal}; - - const u32 num_wait_semaphores = wait ? 2U : 1U; - const std::array wait_values{signal_value - 1, u64(1)}; - const std::array wait_semaphores{Handle(), wait}; - - static constexpr std::array wait_stage_masks = { - vk::PipelineStageFlagBits::eAllCommands, - vk::PipelineStageFlagBits::eColorAttachmentOutput, - }; - - const vk::TimelineSemaphoreSubmitInfo timeline_si = { - .waitSemaphoreValueCount = num_wait_semaphores, - .pWaitSemaphoreValues = wait_values.data(), - .signalSemaphoreValueCount = num_signal_semaphores, - .pSignalSemaphoreValues = signal_values.data(), - }; - - const vk::SubmitInfo submit_info = { - .pNext = &timeline_si, - .waitSemaphoreCount = num_wait_semaphores, - .pWaitSemaphores = wait_semaphores.data(), - .pWaitDstStageMask = wait_stage_masks.data(), - .commandBufferCount = 1u, - .pCommandBuffers = &cmdbuf, - .signalSemaphoreCount = num_signal_semaphores, - .pSignalSemaphores = signal_semaphores.data(), - }; - - try { - instance.GetGraphicsQueue().submit(submit_info); - } catch (vk::DeviceLostError& err) { - UNREACHABLE_MSG("Device lost during submit: {}", err.what()); - } -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 963676b1a..ebc7a60a1 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -46,10 +46,6 @@ public: /// Waits for a tick to be hit on the GPU void Wait(u64 tick); - /// Submits the provided command buffer for execution - void SubmitWork(vk::CommandBuffer cmdbuf, vk::Semaphore wait, vk::Semaphore signal, - u64 signal_value); - protected: const Instance& instance; vk::UniqueSemaphore semaphore; ///< Timeline semaphore. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ff800facc..67a88c473 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -96,8 +96,11 @@ void Rasterizer::DispatchDirect() { cmdbuf.dispatch(cs_program.dim_x, cs_program.dim_y, cs_program.dim_z); } -void Rasterizer::Flush() { - scheduler.Flush(); +u64 Rasterizer::Flush() { + const u64 current_tick = scheduler.CurrentTick(); + SubmitInfo info{}; + scheduler.Flush(info); + return current_tick; } void Rasterizer::BeginRendering() { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index c5c6e3a7f..da3eb68aa 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -36,7 +36,7 @@ public: void ScopeMarkerBegin(const std::string& str); void ScopeMarkerEnd(); - void Flush(); + u64 Flush(); private: u32 SetupIndexBuffer(bool& is_indexed, u32 index_offset); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 39dc2847d..e7b12d49a 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -2,12 +2,15 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include "common/assert.h" #include "common/debug.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" namespace Vulkan { +std::mutex Scheduler::submit_mutex; + Scheduler::Scheduler(const Instance& instance) : instance{instance}, master_semaphore{instance}, command_pool{instance, &master_semaphore} { profiler_scope = reinterpret_cast(std::malloc(sizeof(tracy::VkCtxScope))); @@ -50,22 +53,24 @@ void Scheduler::EndRendering() { current_cmdbuf.endRendering(); } -void Scheduler::Flush(vk::Semaphore signal, vk::Semaphore wait) { - // When flushing, we only send data to the worker thread; no waiting is necessary. - SubmitExecution(signal, wait); +void Scheduler::Flush(SubmitInfo& info) { + // When flushing, we only send data to the driver; no waiting is necessary. + SubmitExecution(info); } -void Scheduler::Finish(vk::Semaphore signal, vk::Semaphore wait) { +void Scheduler::Finish() { // When finishing, we need to wait for the submission to have executed on the device. const u64 presubmit_tick = CurrentTick(); - SubmitExecution(signal, wait); + SubmitInfo info{}; + SubmitExecution(info); Wait(presubmit_tick); } void Scheduler::Wait(u64 tick) { if (tick >= master_semaphore.CurrentTick()) { // Make sure we are not waiting for the current tick without signalling - Flush(); + SubmitInfo info{}; + Flush(info); } master_semaphore.Wait(tick); } @@ -86,7 +91,7 @@ void Scheduler::AllocateWorkerCommandBuffers() { } } -void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore) { +void Scheduler::SubmitExecution(SubmitInfo& info) { std::scoped_lock lk{submit_mutex}; const u64 signal_value = master_semaphore.NextTick(); @@ -97,7 +102,40 @@ void Scheduler::SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wa } EndRendering(); - master_semaphore.SubmitWork(current_cmdbuf, wait_semaphore, signal_semaphore, signal_value); + current_cmdbuf.end(); + + const vk::Semaphore timeline = master_semaphore.Handle(); + info.AddSignal(timeline, signal_value); + + static constexpr std::array wait_stage_masks = { + vk::PipelineStageFlagBits::eAllCommands, + vk::PipelineStageFlagBits::eColorAttachmentOutput, + }; + + const vk::TimelineSemaphoreSubmitInfo timeline_si = { + .waitSemaphoreValueCount = static_cast(info.wait_ticks.size()), + .pWaitSemaphoreValues = info.wait_ticks.data(), + .signalSemaphoreValueCount = static_cast(info.signal_ticks.size()), + .pSignalSemaphoreValues = info.signal_ticks.data(), + }; + + const vk::SubmitInfo submit_info = { + .pNext = &timeline_si, + .waitSemaphoreCount = static_cast(info.wait_semas.size()), + .pWaitSemaphores = info.wait_semas.data(), + .pWaitDstStageMask = wait_stage_masks.data(), + .commandBufferCount = 1U, + .pCommandBuffers = ¤t_cmdbuf, + .signalSemaphoreCount = static_cast(info.signal_semas.size()), + .pSignalSemaphores = info.signal_semas.data(), + }; + + try { + instance.GetGraphicsQueue().submit(submit_info, info.fence); + } catch (vk::DeviceLostError& err) { + UNREACHABLE_MSG("Device lost during submit: {}", err.what()); + } + master_semaphore.Refresh(); AllocateWorkerCommandBuffers(); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index b45042743..c5fd1e2b3 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -26,16 +26,39 @@ struct RenderState { } }; +struct SubmitInfo { + boost::container::static_vector wait_semas; + boost::container::static_vector wait_ticks; + boost::container::static_vector signal_semas; + boost::container::static_vector signal_ticks; + vk::Fence fence; + + void AddWait(vk::Semaphore semaphore, u64 tick = 1) { + wait_semas.emplace_back(semaphore); + wait_ticks.emplace_back(tick); + } + + void AddSignal(vk::Semaphore semaphore, u64 tick = 1) { + signal_semas.emplace_back(semaphore); + signal_ticks.emplace_back(tick); + } + + void AddSignal(vk::Fence fence) { + this->fence = fence; + } +}; + class Scheduler { public: explicit Scheduler(const Instance& instance); ~Scheduler(); - /// Sends the current execution context to the GPU. - void Flush(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr); + /// Sends the current execution context to the GPU + /// and increments the scheduler timeline semaphore. + void Flush(SubmitInfo& info); /// Sends the current execution context to the GPU and waits for it to complete. - void Finish(vk::Semaphore signal = nullptr, vk::Semaphore wait = nullptr); + void Finish(); /// Waits for the given tick to trigger on the GPU. void Wait(u64 tick); @@ -71,17 +94,21 @@ public: return &master_semaphore; } +<<<<<<< HEAD /// Defers an operation until the gpu has reached the current cpu tick. void DeferOperation(auto&& func) { pending_ops.emplace(func, CurrentTick()); } std::mutex submit_mutex; +======= + static std::mutex submit_mutex; +>>>>>>> 06b0f04 (liverpool: Better sync for CPU flips) private: void AllocateWorkerCommandBuffers(); - void SubmitExecution(vk::Semaphore signal_semaphore, vk::Semaphore wait_semaphore); + void SubmitExecution(SubmitInfo& info); private: const Instance& instance;