From 7fedbd52e0629c037be631068f609e77a2b27615 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 7 Jul 2025 16:23:20 +0300 Subject: [PATCH 01/14] texture_cache: Async download of GPU modified linear images (#3204) * texture_cache: Async download of GPU modified linear images * liverpool: Back to less submits * texture_cache: Don't download depth images * config: Add option for linear image readback --- src/common/config.cpp | 11 +++- src/common/config.h | 1 + src/video_core/amdgpu/liverpool.cpp | 3 +- src/video_core/buffer_cache/buffer_cache.h | 2 +- .../renderer_vulkan/vk_rasterizer.cpp | 16 +++-- .../renderer_vulkan/vk_rasterizer.h | 2 +- .../renderer_vulkan/vk_scheduler.cpp | 13 ++-- src/video_core/renderer_vulkan/vk_scheduler.h | 3 + .../texture_cache/texture_cache.cpp | 61 ++++++++++++++++++- src/video_core/texture_cache/texture_cache.h | 10 ++- 10 files changed, 106 insertions(+), 16 deletions(-) diff --git a/src/common/config.cpp b/src/common/config.cpp index d3a5fa6a1..010fecf95 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -65,6 +65,7 @@ static u32 screenHeight = 720; static bool isNullGpu = false; static bool shouldCopyGPUBuffers = false; static bool readbacksEnabled = false; +static bool readbackLinearImagesEnabled = false; static bool directMemoryAccessEnabled = false; static bool shouldDumpShaders = false; static bool shouldPatchShaders = false; @@ -103,7 +104,7 @@ u32 m_language = 1; // english static std::string trophyKey = ""; // Expected number of items in the config file -static constexpr u64 total_entries = 51; +static constexpr u64 total_entries = 52; bool allowHDR() { return isHDRAllowed; @@ -262,6 +263,10 @@ bool readbacks() { return readbacksEnabled; } +bool readbackLinearImages() { + return readbackLinearImagesEnabled; +} + bool directMemoryAccess() { return directMemoryAccessEnabled; } @@ -631,6 +636,8 @@ void load(const std::filesystem::path& path) { isNullGpu = toml::find_or(gpu, "nullGpu", isNullGpu); shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", shouldCopyGPUBuffers); readbacksEnabled = toml::find_or(gpu, "readbacks", readbacksEnabled); + readbackLinearImagesEnabled = + toml::find_or(gpu, "readbackLinearImages", readbackLinearImagesEnabled); directMemoryAccessEnabled = toml::find_or(gpu, "directMemoryAccess", directMemoryAccessEnabled); shouldDumpShaders = toml::find_or(gpu, "dumpShaders", shouldDumpShaders); @@ -802,6 +809,7 @@ void save(const std::filesystem::path& path) { data["GPU"]["nullGpu"] = isNullGpu; data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers; data["GPU"]["readbacks"] = readbacksEnabled; + data["GPU"]["readbackLinearImages"] = readbackLinearImagesEnabled; data["GPU"]["directMemoryAccess"] = directMemoryAccessEnabled; data["GPU"]["dumpShaders"] = shouldDumpShaders; data["GPU"]["patchShaders"] = shouldPatchShaders; @@ -902,6 +910,7 @@ void setDefaultValues() { isNullGpu = false; shouldCopyGPUBuffers = false; readbacksEnabled = false; + readbackLinearImagesEnabled = false; directMemoryAccessEnabled = false; shouldDumpShaders = false; shouldPatchShaders = false; diff --git a/src/common/config.h b/src/common/config.h index 931fa68e2..2ed08198a 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -47,6 +47,7 @@ bool copyGPUCmdBuffers(); void setCopyGPUCmdBuffers(bool enable); bool readbacks(); void setReadbacks(bool enable); +bool readbackLinearImages(); bool directMemoryAccess(); void setDirectMemoryAccess(bool enable); bool dumpShaders(); diff --git a/src/video_core/amdgpu/liverpool.cpp 
b/src/video_core/amdgpu/liverpool.cpp index 9b8c28b66..e264de74a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -135,9 +135,8 @@ void Liverpool::Process(std::stop_token stoken) { if (submit_done) { VideoCore::EndCapture(); - if (rasterizer) { - rasterizer->ProcessFaults(); + rasterizer->EndCommandList(); rasterizer->Flush(); } submit_done = false; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 900a27aee..354d01431 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -112,7 +112,7 @@ public: /// Invalidates any buffer in the logical page range. void InvalidateMemory(VAddr device_addr, u64 size); - /// Waits on pending downloads in the logical page range. + /// Flushes any GPU modified buffer in the logical page range back to CPU memory. void ReadMemory(VAddr device_addr, u64 size, bool is_write = false); /// Binds host vertex buffers for the current draw. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index e4e026485..cca193831 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -272,6 +272,8 @@ void Rasterizer::EliminateFastClear() { void Rasterizer::Draw(bool is_indexed, u32 index_offset) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + if (!FilterDraw()) { return; } @@ -317,6 +319,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 u32 max_count, VAddr count_address) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + if (!FilterDraw()) { return; } @@ -380,6 +384,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 void Rasterizer::DispatchDirect() { RENDERER_TRACE; + scheduler.PopPendingOperations(); + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { @@ -407,6 +413,8 @@ void Rasterizer::DispatchDirect() { void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { @@ -439,11 +447,12 @@ void Rasterizer::Finish() { scheduler.Finish(); } -void Rasterizer::ProcessFaults() { +void Rasterizer::EndCommandList() { if (fault_process_pending) { fault_process_pending = false; buffer_cache.ProcessFaultBuffer(); } + texture_cache.ProcessDownloadImages(); } bool Rasterizer::BindResources(const Pipeline* pipeline) { @@ -649,8 +658,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin if (instance.IsNullDescriptorSupported()) { image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); } else { - auto& null_image_view = - texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc.view_info); + auto& null_image_view = texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc); image_infos.emplace_back(VK_NULL_HANDLE, *null_image_view.image_view, vk::ImageLayout::eGeneral); } @@ -664,7 +672,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin bound_images.emplace_back(image_id); auto& image = texture_cache.GetImage(image_id); - auto& image_view = texture_cache.FindTexture(image_id, desc.view_info); + auto& image_view = texture_cache.FindTexture(image_id, desc); if 
(image.binding.force_general || image.binding.is_target) { image.Transit(vk::ImageLayout::eGeneral, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 4a978746c..1e1680258 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -68,7 +68,7 @@ public: void CpSync(); u64 Flush(); void Finish(); - void ProcessFaults(); + void EndCommandList(); PipelineCache& GetPipelineCache() { return pipeline_cache; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index e75a69924..4c4e17fe4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -101,6 +101,14 @@ void Scheduler::Wait(u64 tick) { } } +void Scheduler::PopPendingOperations() { + master_semaphore.Refresh(); + while (!pending_ops.empty() && master_semaphore.IsFree(pending_ops.front().gpu_tick)) { + pending_ops.front().callback(); + pending_ops.pop(); + } +} + void Scheduler::AllocateWorkerCommandBuffers() { const vk::CommandBufferBeginInfo begin_info = { .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, @@ -175,10 +183,7 @@ void Scheduler::SubmitExecution(SubmitInfo& info) { AllocateWorkerCommandBuffers(); // Apply pending operations - while (!pending_ops.empty() && IsFree(pending_ops.front().gpu_tick)) { - pending_ops.front().callback(); - pending_ops.pop(); - } + PopPendingOperations(); } void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 8ddf00f6a..36fd9c055 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -317,6 +317,9 @@ public: /// Waits for the given tick to trigger on the GPU. void Wait(u64 tick); + /// Attempts to execute operations whose tick the GPU has caught up with. + void PopPendingOperations(); + /// Starts a new rendering scope with provided state. 
void BeginRendering(const RenderState& new_state); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index aa6563a84..723b95892 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -5,7 +5,9 @@ #include #include "common/assert.h" +#include "common/config.h" #include "common/debug.h" +#include "core/memory.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_instance.h" @@ -58,6 +60,50 @@ ImageId TextureCache::GetNullImage(const vk::Format format) { return null_id; } +void TextureCache::ProcessDownloadImages() { + for (const ImageId image_id : download_images) { + DownloadImageMemory(image_id); + } + download_images.clear(); +} + +void TextureCache::DownloadImageMemory(ImageId image_id) { + Image& image = slot_images[image_id]; + if (False(image.flags & ImageFlagBits::GpuModified)) { + return; + } + auto& download_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::Download); + const u32 download_size = image.info.pitch * image.info.size.height * + image.info.resources.layers * (image.info.num_bits / 8); + ASSERT(download_size <= image.info.guest_size); + const auto [download, offset] = download_buffer.Map(download_size); + download_buffer.Commit(); + const vk::BufferImageCopy image_download = { + .bufferOffset = offset, + .bufferRowLength = image.info.pitch, + .bufferImageHeight = image.info.size.height, + .imageSubresource = + { + .aspectMask = image.info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = image.info.resources.layers, + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {image.info.size.width, image.info.size.height, 1}, + }; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, + download_buffer.Handle(), image_download); + scheduler.DeferOperation([device_addr = image.info.guest_address, download, download_size] { + auto* memory = Core::Memory::Instance(); + memory->TryWriteBacking(std::bit_cast(device_addr), download, download_size); + }); +} + void TextureCache::MarkAsMaybeDirty(ImageId image_id, Image& image) { if (image.hash == 0) { // Initialize hash @@ -437,16 +483,27 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo return slot_image_views[view_id]; } -ImageView& TextureCache::FindTexture(ImageId image_id, const ImageViewInfo& view_info) { +ImageView& TextureCache::FindTexture(ImageId image_id, const BaseDesc& desc) { Image& image = slot_images[image_id]; + if (desc.type == BindingType::Storage) { + image.flags |= ImageFlagBits::GpuModified; + if (Config::readbackLinearImages() && + image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + download_images.emplace(image_id); + } + } UpdateImage(image_id); - return RegisterImageView(image_id, view_info); + return RegisterImageView(image_id, desc.view_info); } ImageView& TextureCache::FindRenderTarget(BaseDesc& desc) { const ImageId image_id = FindImage(desc); Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; + if (Config::readbackLinearImages() && + image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + download_images.emplace(image_id); + } 
image.usage.render_target = 1u; UpdateImage(image_id); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 87228b84f..ff8ffb61c 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -105,11 +106,14 @@ public: /// Evicts any images that overlap the unmapped range. void UnmapMemory(VAddr cpu_addr, size_t size); + /// Schedules a copy of pending images for download back to CPU memory. + void ProcessDownloadImages(); + /// Retrieves the image handle of the image with the provided attributes. [[nodiscard]] ImageId FindImage(BaseDesc& desc, FindFlags flags = {}); /// Retrieves an image view with the properties of the specified image id. - [[nodiscard]] ImageView& FindTexture(ImageId image_id, const ImageViewInfo& view_info); + [[nodiscard]] ImageView& FindTexture(ImageId image_id, const BaseDesc& desc); /// Retrieves the render target with specified properties [[nodiscard]] ImageView& FindRenderTarget(BaseDesc& desc); @@ -252,6 +256,9 @@ private: /// Gets or creates a null image for a particular format. ImageId GetNullImage(vk::Format format); + /// Copies image memory back to CPU. + void DownloadImageMemory(ImageId image_id); + /// Create an image from the given parameters [[nodiscard]] ImageId InsertImage(const ImageInfo& info, VAddr cpu_addr); @@ -293,6 +300,7 @@ private: Common::SlotVector slot_image_views; tsl::robin_map samplers; tsl::robin_map null_images; + std::unordered_set download_images; PageTable page_table; std::mutex mutex; From 80f7ec26816705f8080187c284c828a7a6382b8a Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Mon, 7 Jul 2025 18:17:56 +0200 Subject: [PATCH 02/14] video_out: Internal Resolution Support (#3194) * impl * clang * clang+ * update total_entries too --- src/common/config.cpp | 60 ++++++++++++++++------- src/common/config.h | 12 +++-- src/core/libraries/videoout/video_out.cpp | 3 +- src/emulator.cpp | 2 +- src/qt_gui/settings_dialog.cpp | 4 +- 5 files changed, 56 insertions(+), 25 deletions(-) diff --git a/src/common/config.cpp b/src/common/config.cpp index 010fecf95..6f8563377 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -60,8 +60,10 @@ static bool overrideControllerColor = false; static int controllerCustomColorRGB[3] = {0, 0, 255}; // GPU -static u32 screenWidth = 1280; -static u32 screenHeight = 720; +static u32 windowWidth = 1280; +static u32 windowHeight = 720; +static u32 internalScreenWidth = 1280; +static u32 internalScreenHeight = 720; static bool isNullGpu = false; static bool shouldCopyGPUBuffers = false; static bool readbacksEnabled = false; @@ -104,7 +106,7 @@ u32 m_language = 1; // english static std::string trophyKey = ""; // Expected number of items in the config file -static constexpr u64 total_entries = 52; +static constexpr u64 total_entries = 54; bool allowHDR() { return isHDRAllowed; @@ -195,12 +197,20 @@ double getTrophyNotificationDuration() { return trophyNotificationDuration; } -u32 getScreenWidth() { - return screenWidth; +u32 getWindowWidth() { + return windowWidth; } -u32 getScreenHeight() { - return screenHeight; +u32 getWindowHeight() { + return windowHeight; +} + +u32 getInternalScreenWidth() { + return internalScreenHeight; +} + +u32 getInternalScreenHeight() { + return internalScreenHeight; } s32 getGpuId() { @@ -339,12 +349,20 @@ void setGpuId(s32 selectedGpuId) { gpuId = selectedGpuId; } -void setScreenWidth(u32 width) { - screenWidth = 
width; +void setWindowWidth(u32 width) { + windowWidth = width; } -void setScreenHeight(u32 height) { - screenHeight = height; +void setWindowHeight(u32 height) { + windowHeight = height; +} + +void setInternalScreenWidth(u32 width) { + internalScreenWidth = width; +} + +void setInternalScreenHeight(u32 height) { + internalScreenHeight = height; } void setDebugDump(bool enable) { @@ -426,6 +444,7 @@ void setCursorState(s16 newCursorState) { void setCursorHideTimeout(int newcursorHideTimeout) { cursorHideTimeout = newcursorHideTimeout; } + void setTrophyNotificationDuration(double newTrophyNotificationDuration) { trophyNotificationDuration = newTrophyNotificationDuration; } @@ -631,8 +650,11 @@ void load(const std::filesystem::path& path) { if (data.contains("GPU")) { const toml::value& gpu = data.at("GPU"); - screenWidth = toml::find_or(gpu, "screenWidth", screenWidth); - screenHeight = toml::find_or(gpu, "screenHeight", screenHeight); + windowWidth = toml::find_or(gpu, "screenWidth", windowWidth); + windowHeight = toml::find_or(gpu, "screenHeight", windowHeight); + internalScreenWidth = toml::find_or(gpu, "internalScreenWidth", internalScreenWidth); + internalScreenHeight = + toml::find_or(gpu, "internalScreenHeight", internalScreenHeight); isNullGpu = toml::find_or(gpu, "nullGpu", isNullGpu); shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", shouldCopyGPUBuffers); readbacksEnabled = toml::find_or(gpu, "readbacks", readbacksEnabled); @@ -804,8 +826,10 @@ void save(const std::filesystem::path& path) { data["Input"]["specialPadClass"] = specialPadClass; data["Input"]["isMotionControlsEnabled"] = isMotionControlsEnabled; data["Input"]["useUnifiedInputConfig"] = useUnifiedInputConfig; - data["GPU"]["screenWidth"] = screenWidth; - data["GPU"]["screenHeight"] = screenHeight; + data["GPU"]["screenWidth"] = windowWidth; + data["GPU"]["screenHeight"] = windowHeight; + data["GPU"]["internalScreenWidth"] = internalScreenWidth; + data["GPU"]["internalScreenHeight"] = internalScreenHeight; data["GPU"]["nullGpu"] = isNullGpu; data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers; data["GPU"]["readbacks"] = readbacksEnabled; @@ -905,8 +929,10 @@ void setDefaultValues() { controllerCustomColorRGB[2] = 255; // GPU - screenWidth = 1280; - screenHeight = 720; + windowWidth = 1280; + windowHeight = 720; + internalScreenWidth = 1280; + internalScreenHeight = 720; isNullGpu = false; shouldCopyGPUBuffers = false; readbacksEnabled = false; diff --git a/src/common/config.h b/src/common/config.h index 2ed08198a..e54425676 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -25,10 +25,14 @@ bool getIsFullscreen(); void setIsFullscreen(bool enable); std::string getFullscreenMode(); void setFullscreenMode(std::string mode); -u32 getScreenWidth(); -u32 getScreenHeight(); -void setScreenWidth(u32 width); -void setScreenHeight(u32 height); +u32 getWindowWidth(); +u32 getWindowHeight(); +void setWindowWidth(u32 width); +void setWindowHeight(u32 height); +u32 getInternalScreenWidth(); +u32 getInternalScreenHeight(); +void setInternalScreenWidth(u32 width); +void setInternalScreenHeight(u32 height); bool debugDump(); void setDebugDump(bool enable); s32 getGpuId(); diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index da715b3bf..0f961923a 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -445,7 +445,8 @@ s32 PS4_SYSV_ABI sceVideoOutConfigureOutputMode_(s32 handle, u32 reserved, const } void 
RegisterLib(Core::Loader::SymbolsResolver* sym) { - driver = std::make_unique(Config::getScreenWidth(), Config::getScreenHeight()); + driver = std::make_unique(Config::getInternalScreenWidth(), + Config::getInternalScreenHeight()); LIB_FUNCTION("SbU3dwp80lQ", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, sceVideoOutGetFlipStatus); diff --git a/src/emulator.cpp b/src/emulator.cpp index fbab5929b..332287d22 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -222,7 +222,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector ar } } window = std::make_unique( - Config::getScreenWidth(), Config::getScreenHeight(), controller, window_title); + Config::getWindowWidth(), Config::getWindowHeight(), controller, window_title); g_window = window.get(); diff --git a/src/qt_gui/settings_dialog.cpp b/src/qt_gui/settings_dialog.cpp index c9d264587..ed2a17e25 100644 --- a/src/qt_gui/settings_dialog.cpp +++ b/src/qt_gui/settings_dialog.cpp @@ -762,8 +762,8 @@ void SettingsDialog::UpdateSettings() { m_gui_settings->SetValue(gui::gl_backgroundMusicVolume, ui->BGMVolumeSlider->value()); Config::setLanguage(languageIndexes[ui->consoleLanguageComboBox->currentIndex()]); Config::setEnableDiscordRPC(ui->discordRPCCheckbox->isChecked()); - Config::setScreenWidth(ui->widthSpinBox->value()); - Config::setScreenHeight(ui->heightSpinBox->value()); + Config::setWindowWidth(ui->widthSpinBox->value()); + Config::setWindowHeight(ui->heightSpinBox->value()); Config::setVblankDiv(ui->vblankSpinBox->value()); Config::setDumpShaders(ui->dumpShadersCheckBox->isChecked()); Config::setNullGpu(ui->nullGpuCheckBox->isChecked()); From ddede4a52dd2573d84abbf02c4bc35a9b1d18e3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Bogd=C4=81ns?= Date: Tue, 8 Jul 2025 01:04:16 +0300 Subject: [PATCH 03/14] IME fixes (#3207) - Moved enums, flags, and structs to ime_common.h to simplify usage with Ime and ImeDialog - Updated Ime to use an enum as the return type, consistent with ImeDialog - Removed duplicate definition of OrbisImeKeycode - Added OrbisImeLanguage as a flags enum - Added missing options to OrbisImeOption - Removed OrbisImeDialogOption; OrbisImeOption should be used instead - Added OrbisImeTextAreaMode - Updated OrbisImeTextAreaMode - Fixed OrbisImeEventParam by adding the missing member OrbisImePanelType panel_type - Updated the sceImeOpen declaration to use extended parameters (not yet implemented) -Fixed Diablo III (CUSA00434) assertion failure on ImeDialog initialization Co-authored-by: w1naenator --- src/core/libraries/ime/ime.cpp | 68 +++---- src/core/libraries/ime/ime.h | 80 +------- src/core/libraries/ime/ime_common.h | 232 ++++++++++++++++++++++- src/core/libraries/ime/ime_dialog.cpp | 49 +++-- src/core/libraries/ime/ime_dialog.h | 120 ------------ src/core/libraries/ime/ime_dialog_ui.cpp | 32 +++- src/core/libraries/ime/ime_ui.cpp | 2 +- 7 files changed, 325 insertions(+), 258 deletions(-) diff --git a/src/core/libraries/ime/ime.cpp b/src/core/libraries/ime/ime.cpp index 1c61bc276..54e856e87 100644 --- a/src/core/libraries/ime/ime.cpp +++ b/src/core/libraries/ime/ime.cpp @@ -43,8 +43,8 @@ public: openEvent.param.rect.x = m_param.ime.posx; openEvent.param.rect.y = m_param.ime.posy; } else { - openEvent.param.resource_id_array.userId = 1; - openEvent.param.resource_id_array.resourceId[0] = 1; + openEvent.param.resource_id_array.user_id = 1; + openEvent.param.resource_id_array.resource_id[0] = 1; } // Are we supposed to call the event handler on init with @@ -59,10 +59,10 @@ public: } } - s32 
Update(OrbisImeEventHandler handler) { + Error Update(OrbisImeEventHandler handler) { if (!m_ime_mode) { /* We don't handle any events for ImeKeyboard */ - return ORBIS_OK; + return Error::OK; } std::unique_lock lock{g_ime_state.queue_mutex}; @@ -73,7 +73,7 @@ public: Execute(handler, &event, false); } - return ORBIS_OK; + return Error::OK; } void Execute(OrbisImeEventHandler handler, OrbisImeEvent* event, bool use_param_handler) { @@ -94,14 +94,14 @@ public: } } - s32 SetText(const char16_t* text, u32 length) { + Error SetText(const char16_t* text, u32 length) { g_ime_state.SetText(text, length); - return ORBIS_OK; + return Error::OK; } - s32 SetCaret(const OrbisImeCaret* caret) { + Error SetCaret(const OrbisImeCaret* caret) { g_ime_state.SetCaret(caret->index); - return ORBIS_OK; + return Error::OK; } bool IsIme() { @@ -222,11 +222,11 @@ int PS4_SYSV_ABI sceImeGetPanelPositionAndForm() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height) { +Error PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height) { LOG_INFO(Lib_Ime, "called"); if (!width || !height) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } switch (param->type) { @@ -244,18 +244,18 @@ s32 PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* break; } - return ORBIS_OK; + return Error::OK; } -s32 PS4_SYSV_ABI sceImeKeyboardClose(s32 userId) { +Error PS4_SYSV_ABI sceImeKeyboardClose(s32 userId) { LOG_INFO(Lib_Ime, "(STUBBED) called"); if (!g_keyboard_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } g_keyboard_handler.release(); - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeKeyboardGetInfo() { @@ -268,25 +268,25 @@ int PS4_SYSV_ABI sceImeKeyboardGetResourceId() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const OrbisImeKeyboardParam* param) { +Error PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const OrbisImeKeyboardParam* param) { LOG_INFO(Lib_Ime, "called"); if (!param) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } if (!param->arg) { - return ORBIS_IME_ERROR_INVALID_ARG; + return Error::INVALID_ARG; } if (!param->handler) { - return ORBIS_IME_ERROR_INVALID_HANDLER; + return Error::INVALID_HANDLER; } if (g_keyboard_handler) { - return ORBIS_IME_ERROR_BUSY; + return Error::BUSY; } g_keyboard_handler = std::make_unique(param); - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeKeyboardOpenInternal() { @@ -304,18 +304,18 @@ int PS4_SYSV_ABI sceImeKeyboardUpdate() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const void* extended) { +Error PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const OrbisImeParamExtended* extended) { LOG_INFO(Lib_Ime, "called"); if (!param) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } if (g_ime_handler) { - return ORBIS_IME_ERROR_BUSY; + return Error::BUSY; } g_ime_handler = std::make_unique(param); - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeOpenInternal() { @@ -339,27 +339,27 @@ int PS4_SYSV_ABI sceImeSetCandidateIndex() { return ORBIS_OK; } -int PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret) { +Error PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret) { LOG_TRACE(Lib_Ime, "called"); if (!g_ime_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } if (!caret) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return 
Error::INVALID_ADDRESS; } return g_ime_handler->SetCaret(caret); } -s32 PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length) { +Error PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length) { LOG_TRACE(Lib_Ime, "called"); if (!g_ime_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } if (!text) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } return g_ime_handler->SetText(text, length); @@ -370,7 +370,7 @@ int PS4_SYSV_ABI sceImeSetTextGeometry() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler) { +Error PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler) { if (g_ime_handler) { g_ime_handler->Update(handler); } @@ -380,10 +380,10 @@ s32 PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler) { } if (!g_ime_handler || !g_keyboard_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeVshClearPreedit() { diff --git a/src/core/libraries/ime/ime.h b/src/core/libraries/ime/ime.h index fcf381048..c2b80809c 100644 --- a/src/core/libraries/ime/ime.h +++ b/src/core/libraries/ime/ime.h @@ -13,72 +13,6 @@ class SymbolsResolver; namespace Libraries::Ime { -constexpr u32 ORBIS_IME_MAX_TEXT_LENGTH = 2048; - -enum class OrbisImeKeyboardOption : u32 { - Default = 0, - Repeat = 1, - RepeatEachKey = 2, - AddOsk = 4, - EffectiveWithIme = 8, - DisableResume = 16, - DisableCapslockWithoutShift = 32, -}; -DECLARE_ENUM_FLAG_OPERATORS(OrbisImeKeyboardOption) - -enum class OrbisImeOption : u32 { - DEFAULT = 0, - MULTILINE = 1, - NO_AUTO_CAPITALIZATION = 2, - PASSWORD = 4, - LANGUAGES_FORCED = 8, - EXT_KEYBOARD = 16, - NO_LEARNING = 32, - FIXED_POSITION = 64, - DISABLE_RESUME = 256, - DISABLE_AUTO_SPACE = 512, - DISABLE_POSITION_ADJUSTMENT = 2048, - EXPANDED_PREEDIT_BUFFER = 4096, - USE_JAPANESE_EISUU_KEY_AS_CAPSLOCK = 8192, - USE_2K_COORDINATES = 16384, -}; -DECLARE_ENUM_FLAG_OPERATORS(OrbisImeOption) - -struct OrbisImeKeyboardParam { - OrbisImeKeyboardOption option; - s8 reserved1[4]; - void* arg; - OrbisImeEventHandler handler; - s8 reserved2[8]; -}; - -struct OrbisImeParam { - s32 user_id; - OrbisImeType type; - u64 supported_languages; - OrbisImeEnterLabel enter_label; - OrbisImeInputMethod input_method; - OrbisImeTextFilter filter; - OrbisImeOption option; - u32 maxTextLength; - char16_t* inputTextBuffer; - float posx; - float posy; - OrbisImeHorizontalAlignment horizontal_alignment; - OrbisImeVerticalAlignment vertical_alignment; - void* work; - void* arg; - OrbisImeEventHandler handler; - s8 reserved[8]; -}; - -struct OrbisImeCaret { - f32 x; - f32 y; - u32 height; - u32 index; -}; - int PS4_SYSV_ABI FinalizeImeModule(); int PS4_SYSV_ABI InitializeImeModule(); int PS4_SYSV_ABI sceImeCheckFilterText(); @@ -98,22 +32,22 @@ int PS4_SYSV_ABI sceImeDisableController(); int PS4_SYSV_ABI sceImeFilterText(); int PS4_SYSV_ABI sceImeForTestFunction(); int PS4_SYSV_ABI sceImeGetPanelPositionAndForm(); -s32 PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height); -s32 PS4_SYSV_ABI sceImeKeyboardClose(s32 userId); +Error PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height); +Error PS4_SYSV_ABI sceImeKeyboardClose(s32 userId); int PS4_SYSV_ABI sceImeKeyboardGetInfo(); int PS4_SYSV_ABI sceImeKeyboardGetResourceId(); -s32 PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const OrbisImeKeyboardParam* param); +Error PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const 
OrbisImeKeyboardParam* param); int PS4_SYSV_ABI sceImeKeyboardOpenInternal(); int PS4_SYSV_ABI sceImeKeyboardSetMode(); int PS4_SYSV_ABI sceImeKeyboardUpdate(); -s32 PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const void* extended); +Error PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const OrbisImeParamExtended* extended); int PS4_SYSV_ABI sceImeOpenInternal(); void PS4_SYSV_ABI sceImeParamInit(OrbisImeParam* param); int PS4_SYSV_ABI sceImeSetCandidateIndex(); -s32 PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret); -s32 PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length); +Error PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret); +Error PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length); int PS4_SYSV_ABI sceImeSetTextGeometry(); -s32 PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler); +Error PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler); int PS4_SYSV_ABI sceImeVshClearPreedit(); int PS4_SYSV_ABI sceImeVshClose(); int PS4_SYSV_ABI sceImeVshConfirmPreedit(); diff --git a/src/core/libraries/ime/ime_common.h b/src/core/libraries/ime/ime_common.h index 96f073dc5..5c0030030 100644 --- a/src/core/libraries/ime/ime_common.h +++ b/src/core/libraries/ime/ime_common.h @@ -3,9 +3,108 @@ #pragma once +#include "common/enum.h" #include "common/types.h" #include "core/libraries/rtc/rtc.h" +constexpr u32 ORBIS_IME_MAX_TEXT_LENGTH = 2048; +constexpr u32 ORBIS_IME_DIALOG_MAX_TEXT_LENGTH = 2048; + +enum class Error : u32 { + OK = 0x0, + BUSY = 0x80bc0001, + NOT_OPENED = 0x80bc0002, + NO_MEMORY = 0x80bc0003, + CONNECTION_FAILED = 0x80bc0004, + TOO_MANY_REQUESTS = 0x80bc0005, + INVALID_TEXT = 0x80bc0006, + EVENT_OVERFLOW = 0x80bc0007, + NOT_ACTIVE = 0x80bc0008, + IME_SUSPENDING = 0x80bc0009, + DEVICE_IN_USE = 0x80bc000a, + INVALID_USER_ID = 0x80bc0010, + INVALID_TYPE = 0x80bc0011, + INVALID_SUPPORTED_LANGUAGES = 0x80bc0012, + INVALID_ENTER_LABEL = 0x80bc0013, + INVALID_INPUT_METHOD = 0x80bc0014, + INVALID_OPTION = 0x80bc0015, + INVALID_MAX_TEXT_LENGTH = 0x80bc0016, + INVALID_INPUT_TEXT_BUFFER = 0x80bc0017, + INVALID_POSX = 0x80bc0018, + INVALID_POSY = 0x80bc0019, + INVALID_HORIZONTALIGNMENT = 0x80bc001a, + INVALID_VERTICALALIGNMENT = 0x80bc001b, + INVALID_EXTENDED = 0x80bc001c, + INVALID_KEYBOARD_TYPE = 0x80bc001d, + INVALID_WORK = 0x80bc0020, + INVALID_ARG = 0x80bc0021, + INVALID_HANDLER = 0x80bc0022, + NO_RESOURCE_ID = 0x80bc0023, + INVALID_MODE = 0x80bc0024, + INVALID_PARAM = 0x80bc0030, + INVALID_ADDRESS = 0x80bc0031, + INVALID_RESERVED = 0x80bc0032, + INVALID_TIMING = 0x80bc0033, + INTERNAL = 0x80bc00ff, + DIALOG_INVALID_TITLE = 0x80bc0101, + DIALOG_NOT_RUNNING = 0x80bc0105, + DIALOG_NOT_FINISHED = 0x80bc0106, + DIALOG_NOT_IN_USE = 0x80bc0107 +}; + +enum class OrbisImeOption : u32 { + DEFAULT = 0, + MULTILINE = 1, + NO_AUTO_CAPITALIZATION = 2, + PASSWORD = 4, + LANGUAGES_FORCED = 8, + EXT_KEYBOARD = 16, + NO_LEARNING = 32, + FIXED_POSITION = 64, + DISABLE_COPY_PASTE = 128, + DISABLE_RESUME = 256, + DISABLE_AUTO_SPACE = 512, + DISABLE_POSITION_ADJUSTMENT = 2048, + EXPANDED_PREEDIT_BUFFER = 4096, + USE_JAPANESE_EISUU_KEY_AS_CAPSLOCK = 8192, + USE_2K_COORDINATES = 16384, +}; +DECLARE_ENUM_FLAG_OPERATORS(OrbisImeOption); + +enum class OrbisImeLanguage : u64 { + DANISH = 0x0000000000000001, + GERMAN = 0x0000000000000002, + ENGLISH_US = 0x0000000000000004, + SPANISH = 0x0000000000000008, + FRENCH = 0x0000000000000010, + ITALIAN = 0x0000000000000020, + DUTCH = 0x0000000000000040, + NORWEGIAN = 0x0000000000000080, + POLISH = 0x0000000000000100, + 
PORTUGUESE_PT = 0x0000000000000200, + RUSSIAN = 0x0000000000000400, + FINNISH = 0x0000000000000800, + SWEDISH = 0x0000000000001000, + JAPANESE = 0x0000000000002000, + KOREAN = 0x0000000000004000, + SIMPLIFIED_CHINESE = 0x0000000000008000, + TRADITIONAL_CHINESE = 0x0000000000010000, + PORTUGUESE_BR = 0x0000000000020000, + ENGLISH_GB = 0x0000000000040000, + TURKISH = 0x0000000000080000, + SPANISH_LA = 0x0000000000100000, + ARABIC = 0x0000000001000000, + FRENCH_CA = 0x0000000002000000, + THAI = 0x0000000004000000, + CZECH = 0x0000000008000000, + GREEK = 0x0000000010000000, + INDONESIAN = 0x0000000020000000, + VIETNAMESE = 0x0000000040000000, + ROMANIAN = 0x0000000080000000, + HUNGARIAN = 0x0000000100000000, +}; +DECLARE_ENUM_FLAG_OPERATORS(OrbisImeLanguage); + enum class OrbisImeType : u32 { Default = 0, BasicLatin = 1, @@ -41,6 +140,7 @@ enum class OrbisImeEventId : u32 { Open = 0, UpdateText = 1, UpdateCaret = 2, + ChangeSize = 3, PressClose = 4, PressEnter = 5, Abort = 6, @@ -51,6 +151,10 @@ enum class OrbisImeEventId : u32 { CandidateDone = 11, CandidateCancel = 12, ChangeDevice = 14, + JumpToNextObject = 15, + JumpToBeforeObject = 16, + ChangeWindowType = 17, + ChangeInputMethodState = 18, KeyboardOpen = 256, @@ -110,6 +214,13 @@ enum class OrbisImeDeviceType : u32 { RemoteOsk = 3, }; +enum class OrbisImePanelPriority : u32 { + Default = 0, + Alphabet = 1, + Symbol = 2, + Accent = 3, +}; + struct OrbisImeRect { f32 x; f32 y; @@ -117,8 +228,22 @@ struct OrbisImeRect { u32 height; }; +struct OrbisImeColor { + u8 r; + u8 g; + u8 b; + u8 a; +}; + +enum class OrbisImeTextAreaMode : u32 { + Disable = 0, + Edit = 1, + Preedit = 2, + Select = 3, +}; + struct OrbisImeTextAreaProperty { - u32 mode; // OrbisImeTextAreaMode + OrbisImeTextAreaMode mode; u32 index; s32 length; }; @@ -135,14 +260,14 @@ struct OrbisImeKeycode { char16_t character; u32 status; OrbisImeKeyboardType type; - s32 user_id; + s32 user_id; // Todo: switch to OrbisUserServiceUserId u32 resource_id; Libraries::Rtc::OrbisRtcTick timestamp; }; struct OrbisImeKeyboardResourceIdArray { - s32 userId; - u32 resourceId[5]; + s32 user_id; // Todo: switch to OrbisUserServiceUserId + u32 resource_id[5]; }; enum class OrbisImeCaretMovementDirection : u32 { @@ -159,6 +284,16 @@ enum class OrbisImeCaretMovementDirection : u32 { Bottom = 10, }; +enum class OrbisImePanelType : u32 { + Hide = 0, + Osk = 1, + Dialog = 2, + Candidate = 3, + Edit = 4, + EditAndCandidate = 5, + Accessibility = 6, +}; + union OrbisImeEventParam { OrbisImeRect rect; OrbisImeEditText text; @@ -168,6 +303,7 @@ union OrbisImeEventParam { char16_t* candidate_word; s32 candidate_index; OrbisImeDeviceType device_type; + OrbisImePanelType panel_type; u32 input_method_state; s8 reserved[64]; }; @@ -177,7 +313,95 @@ struct OrbisImeEvent { OrbisImeEventParam param; }; +using OrbisImeExtKeyboardFilter = PS4_SYSV_ABI int (*)(const OrbisImeKeycode* srcKeycode, + u16* outKeycode, u32* outStatus, + void* reserved); + using OrbisImeTextFilter = PS4_SYSV_ABI int (*)(char16_t* outText, u32* outTextLength, const char16_t* srcText, u32 srcTextLength); using OrbisImeEventHandler = PS4_SYSV_ABI void (*)(void* arg, const OrbisImeEvent* e); + +enum class OrbisImeKeyboardOption : u32 { + Default = 0, + Repeat = 1, + RepeatEachKey = 2, + AddOsk = 4, + EffectiveWithIme = 8, + DisableResume = 16, + DisableCapslockWithoutShift = 32, +}; +DECLARE_ENUM_FLAG_OPERATORS(OrbisImeKeyboardOption) + +struct OrbisImeKeyboardParam { + OrbisImeKeyboardOption option; + s8 reserved1[4]; + void* arg; + 
OrbisImeEventHandler handler; + s8 reserved2[8]; +}; + +struct OrbisImeParam { + s32 user_id; // Todo: switch to OrbisUserServiceUserId + OrbisImeType type; + u64 supported_languages; // OrbisImeLanguage flags + OrbisImeEnterLabel enter_label; + OrbisImeInputMethod input_method; + OrbisImeTextFilter filter; + OrbisImeOption option; + u32 maxTextLength; + char16_t* inputTextBuffer; + f32 posx; + f32 posy; + OrbisImeHorizontalAlignment horizontal_alignment; + OrbisImeVerticalAlignment vertical_alignment; + void* work; + void* arg; + OrbisImeEventHandler handler; + s8 reserved[8]; +}; + +struct OrbisImeCaret { + f32 x; + f32 y; + u32 height; + u32 index; +}; + +struct OrbisImeDialogParam { + s32 user_id; + OrbisImeType type; + u64 supported_languages; // OrbisImeLanguage flags + OrbisImeEnterLabel enter_label; + OrbisImeInputMethod input_method; + OrbisImeTextFilter filter; + OrbisImeOption option; + u32 max_text_length; + char16_t* input_text_buffer; + f32 posx; + f32 posy; + OrbisImeHorizontalAlignment horizontal_alignment; + OrbisImeVerticalAlignment vertical_alignment; + const char16_t* placeholder; + const char16_t* title; + s8 reserved[16]; +}; + +struct OrbisImeParamExtended { + u32 option; // OrbisImeExtOption flags + OrbisImeColor color_base; + OrbisImeColor color_line; + OrbisImeColor color_text_field; + OrbisImeColor color_preedit; + OrbisImeColor color_button_default; + OrbisImeColor color_button_function; + OrbisImeColor color_button_symbol; + OrbisImeColor color_text; + OrbisImeColor color_special; + OrbisImePanelPriority priority; + char* additional_dictionary_path; + OrbisImeExtKeyboardFilter ext_keyboard_filter; + u32 disable_device; + u32 ext_keyboard_mode; + s8 reserved[60]; +}; diff --git a/src/core/libraries/ime/ime_dialog.cpp b/src/core/libraries/ime/ime_dialog.cpp index bee185787..6f808636b 100644 --- a/src/core/libraries/ime/ime_dialog.cpp +++ b/src/core/libraries/ime/ime_dialog.cpp @@ -20,19 +20,19 @@ static OrbisImeDialogResult g_ime_dlg_result{}; static ImeDialogState g_ime_dlg_state{}; static ImeDialogUi g_ime_dlg_ui; -static bool IsValidOption(OrbisImeDialogOption option, OrbisImeType type) { - if (False(~option & - (OrbisImeDialogOption::Multiline | OrbisImeDialogOption::NoAutoCompletion))) { +static bool IsValidOption(OrbisImeOption option, OrbisImeType type) { + if (False(~option & (OrbisImeOption::MULTILINE | + OrbisImeOption::NO_AUTO_CAPITALIZATION /* NoAutoCompletion */))) { return false; } - if (True(option & OrbisImeDialogOption::Multiline) && type != OrbisImeType::Default && + if (True(option & OrbisImeOption::MULTILINE) && type != OrbisImeType::Default && type != OrbisImeType::BasicLatin) { return false; } - if (True(option & OrbisImeDialogOption::NoAutoCompletion) && type != OrbisImeType::Number && - type != OrbisImeType::BasicLatin) { + if (True(option & OrbisImeOption::NO_AUTO_CAPITALIZATION /* NoAutoCompletion */) && + type != OrbisImeType::Number && type != OrbisImeType::BasicLatin) { return false; } @@ -96,7 +96,7 @@ Error PS4_SYSV_ABI sceImeDialogGetPanelSize(const OrbisImeDialogParam* param, u3 case OrbisImeType::Url: case OrbisImeType::Mail: *width = 500; // original: 793 - if (True(param->option & OrbisImeDialogOption::Multiline)) { + if (True(param->option & OrbisImeOption::MULTILINE)) { *height = 300; // original: 576 } else { *height = 150; // original: 476 @@ -149,18 +149,20 @@ OrbisImeDialogStatus PS4_SYSV_ABI sceImeDialogGetStatus() { } Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExtended* extended) { + 
LOG_INFO(Lib_ImeDialog, ">> sceImeDialogInit: entering, param={}, extended={}", + static_cast(param), static_cast(extended)); if (g_ime_dlg_status != OrbisImeDialogStatus::None) { - LOG_INFO(Lib_ImeDialog, "IME dialog is already running"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: busy (status=%u)", (u32)g_ime_dlg_status); return Error::BUSY; } if (param == nullptr) { - LOG_INFO(Lib_ImeDialog, "called with param (NULL)"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: param is null"); return Error::INVALID_ADDRESS; } if (!magic_enum::enum_contains(param->type)) { - LOG_INFO(Lib_ImeDialog, "Invalid param->type"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid param->type=%u", (u32)param->type); return Error::INVALID_ADDRESS; } @@ -168,16 +170,14 @@ Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExt // TODO: do correct param->supportedLanguages validation if (param->posx < 0.0f || - param->posx >= - MAX_X_POSITIONS[False(param->option & OrbisImeDialogOption::LargeResolution)]) { - LOG_INFO(Lib_ImeDialog, "Invalid param->posx"); + param->posx >= MAX_X_POSITIONS[False(param->option & OrbisImeOption::USE_2K_COORDINATES)]) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid posx=%f", param->posx); return Error::INVALID_POSX; } if (param->posy < 0.0f || - param->posy >= - MAX_Y_POSITIONS[False(param->option & OrbisImeDialogOption::LargeResolution)]) { - LOG_INFO(Lib_ImeDialog, "Invalid param->posy"); + param->posy >= MAX_Y_POSITIONS[False(param->option & OrbisImeOption::USE_2K_COORDINATES)]) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid posy=%f", param->posy); return Error::INVALID_POSY; } @@ -192,12 +192,13 @@ Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExt } if (!IsValidOption(param->option, param->type)) { - LOG_INFO(Lib_ImeDialog, "Invalid param->option"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid option=0x%X for type=%u", + static_cast(param->option), (u32)param->type); return Error::INVALID_PARAM; } if (param->input_text_buffer == nullptr) { - LOG_INFO(Lib_ImeDialog, "Invalid param->inputTextBuffer"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: input_text_buffer is null"); return Error::INVALID_INPUT_TEXT_BUFFER; } @@ -220,16 +221,24 @@ Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExt } } - if (param->max_text_length > ORBIS_IME_DIALOG_MAX_TEXT_LENGTH) { - LOG_INFO(Lib_ImeDialog, "Invalid param->maxTextLength"); + if (param->max_text_length == 0 || param->max_text_length > ORBIS_IME_MAX_TEXT_LENGTH) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid max_text_length=%u", + param->max_text_length); return Error::INVALID_MAX_TEXT_LENGTH; } + // Title string validation + if (param->title != nullptr && !std::char_traits::length(param->title)) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: title is empty"); + return Error::INVALID_PARAM; + } + g_ime_dlg_result = {}; g_ime_dlg_state = ImeDialogState(param, extended); g_ime_dlg_status = OrbisImeDialogStatus::Running; g_ime_dlg_ui = ImeDialogUi(&g_ime_dlg_state, &g_ime_dlg_status, &g_ime_dlg_result); + LOG_INFO(Lib_ImeDialog, "<< sceImeDialogInit: successful, status now=Running"); return Error::OK; } diff --git a/src/core/libraries/ime/ime_dialog.h b/src/core/libraries/ime/ime_dialog.h index 526e5f022..a056fdd5e 100644 --- a/src/core/libraries/ime/ime_dialog.h +++ b/src/core/libraries/ime/ime_dialog.h @@ -13,50 +13,6 @@ class SymbolsResolver; namespace Libraries::ImeDialog { -constexpr u32 
ORBIS_IME_DIALOG_MAX_TEXT_LENGTH = 2048; - -enum class Error : u32 { - OK = 0x0, - BUSY = 0x80bc0001, - NOT_OPENED = 0x80bc0002, - NO_MEMORY = 0x80bc0003, - CONNECTION_FAILED = 0x80bc0004, - TOO_MANY_REQUESTS = 0x80bc0005, - INVALID_TEXT = 0x80bc0006, - EVENT_OVERFLOW = 0x80bc0007, - NOT_ACTIVE = 0x80bc0008, - IME_SUSPENDING = 0x80bc0009, - DEVICE_IN_USE = 0x80bc000a, - INVALID_USER_ID = 0x80bc0010, - INVALID_TYPE = 0x80bc0011, - INVALID_SUPPORTED_LANGUAGES = 0x80bc0012, - INVALID_ENTER_LABEL = 0x80bc0013, - INVALID_INPUT_METHOD = 0x80bc0014, - INVALID_OPTION = 0x80bc0015, - INVALID_MAX_TEXT_LENGTH = 0x80bc0016, - INVALID_INPUT_TEXT_BUFFER = 0x80bc0017, - INVALID_POSX = 0x80bc0018, - INVALID_POSY = 0x80bc0019, - INVALID_HORIZONTALIGNMENT = 0x80bc001a, - INVALID_VERTICALALIGNMENT = 0x80bc001b, - INVALID_EXTENDED = 0x80bc001c, - INVALID_KEYBOARD_TYPE = 0x80bc001d, - INVALID_WORK = 0x80bc0020, - INVALID_ARG = 0x80bc0021, - INVALID_HANDLER = 0x80bc0022, - NO_RESOURCE_ID = 0x80bc0023, - INVALID_MODE = 0x80bc0024, - INVALID_PARAM = 0x80bc0030, - INVALID_ADDRESS = 0x80bc0031, - INVALID_RESERVED = 0x80bc0032, - INVALID_TIMING = 0x80bc0033, - INTERNAL = 0x80bc00ff, - DIALOG_INVALID_TITLE = 0x80bc0101, - DIALOG_NOT_RUNNING = 0x80bc0105, - DIALOG_NOT_FINISHED = 0x80bc0106, - DIALOG_NOT_IN_USE = 0x80bc0107, -}; - enum class OrbisImeDialogStatus : u32 { None = 0, Running = 1, @@ -69,87 +25,11 @@ enum class OrbisImeDialogEndStatus : u32 { Aborted = 2, }; -enum class OrbisImeDialogOption : u32 { - Default = 0, - Multiline = 1, - NoAutoCorrection = 2, - NoAutoCompletion = 4, - // TODO: Document missing options - LargeResolution = 1024, -}; -DECLARE_ENUM_FLAG_OPERATORS(OrbisImeDialogOption) - -enum class OrbisImePanelPriority : u32 { - Default = 0, - Alphabet = 1, - Symbol = 2, - Accent = 3, -}; - -struct OrbisImeColor { - u8 r; - u8 g; - u8 b; - u8 a; -}; - struct OrbisImeDialogResult { OrbisImeDialogEndStatus endstatus; s32 reserved[12]; }; -struct OrbisImeKeycode { - u16 keycode; - char16_t character; - u32 status; - OrbisImeKeyboardType type; - s32 user_id; - u32 resource_id; - u64 timestamp; -}; - -using OrbisImeExtKeyboardFilter = PS4_SYSV_ABI int (*)(const OrbisImeKeycode* srcKeycode, - u16* outKeycode, u32* outStatus, - void* reserved); - -struct OrbisImeDialogParam { - s32 user_id; - OrbisImeType type; - u64 supported_languages; - OrbisImeEnterLabel enter_label; - OrbisImeInputMethod input_method; - OrbisImeTextFilter filter; - OrbisImeDialogOption option; - u32 max_text_length; - char16_t* input_text_buffer; - float posx; - float posy; - OrbisImeHorizontalAlignment horizontal_alignment; - OrbisImeVerticalAlignment vertical_alignment; - const char16_t* placeholder; - const char16_t* title; - s8 reserved[16]; -}; - -struct OrbisImeParamExtended { - u32 option; // OrbisImeDialogOptionExtended - OrbisImeColor color_base; - OrbisImeColor color_line; - OrbisImeColor color_text_field; - OrbisImeColor color_preedit; - OrbisImeColor color_button_default; - OrbisImeColor color_button_function; - OrbisImeColor color_button_symbol; - OrbisImeColor color_text; - OrbisImeColor color_special; - OrbisImePanelPriority priority; - char* additional_dictionary_path; - OrbisImeExtKeyboardFilter ext_keyboard_filter; - uint32_t disable_device; - uint32_t ext_keyboard_mode; - int8_t reserved[60]; -}; - Error PS4_SYSV_ABI sceImeDialogAbort(); Error PS4_SYSV_ABI sceImeDialogForceClose(); Error PS4_SYSV_ABI sceImeDialogForTestFunction(); diff --git a/src/core/libraries/ime/ime_dialog_ui.cpp 
b/src/core/libraries/ime/ime_dialog_ui.cpp index 51183c79b..746a2c8d3 100644 --- a/src/core/libraries/ime/ime_dialog_ui.cpp +++ b/src/core/libraries/ime/ime_dialog_ui.cpp @@ -21,12 +21,16 @@ namespace Libraries::ImeDialog { ImeDialogState::ImeDialogState(const OrbisImeDialogParam* param, const OrbisImeParamExtended* extended) { + LOG_INFO(Lib_ImeDialog, ">> ImeDialogState::Ctor: param={}, text_buffer={}", + static_cast(param), + static_cast(param ? param->input_text_buffer : nullptr)); if (!param) { + LOG_ERROR(Lib_ImeDialog, " param==nullptr, returning without init"); return; } user_id = param->user_id; - is_multi_line = True(param->option & OrbisImeDialogOption::Multiline); + is_multi_line = True(param->option & OrbisImeOption::MULTILINE); is_numeric = param->type == OrbisImeType::Number; type = param->type; enter_label = param->enter_label; @@ -220,6 +224,7 @@ void ImeDialogUi::Free() { void ImeDialogUi::Draw() { std::unique_lock lock{draw_mutex}; + LOG_INFO(Lib_ImeDialog, ">> ImeDialogUi::Draw: first_render=%d", first_render); if (!state) { return; @@ -259,9 +264,13 @@ void ImeDialogUi::Draw() { } if (state->is_multi_line) { + LOG_INFO(Lib_ImeDialog, " Drawing multi-line widget…"); DrawMultiLineInputText(); + LOG_INFO(Lib_ImeDialog, " Done DrawMultiLineInputText"); } else { + LOG_INFO(Lib_ImeDialog, " Drawing input text widget…"); DrawInputText(); + LOG_INFO(Lib_ImeDialog, " Done DrawInputText"); } SetCursorPosY(GetCursorPosY() + 10.0f); @@ -306,6 +315,7 @@ void ImeDialogUi::Draw() { End(); first_render = false; + LOG_INFO(Lib_ImeDialog, "<< ImeDialogUi::Draw complete"); } void ImeDialogUi::DrawInputText() { @@ -316,7 +326,7 @@ void ImeDialogUi::DrawInputText() { } const char* placeholder = state->placeholder.empty() ? nullptr : state->placeholder.data(); if (InputTextEx("##ImeDialogInput", placeholder, state->current_text.begin(), - state->max_text_length, input_size, ImGuiInputTextFlags_CallbackCharFilter, + state->max_text_length + 1, input_size, ImGuiInputTextFlags_CallbackCharFilter, InputTextCallback, this)) { state->input_changed = true; } @@ -332,7 +342,7 @@ void ImeDialogUi::DrawMultiLineInputText() { } const char* placeholder = state->placeholder.empty() ? nullptr : state->placeholder.data(); if (InputTextEx("##ImeDialogInput", placeholder, state->current_text.begin(), - state->max_text_length, input_size, flags, InputTextCallback, this)) { + state->max_text_length + 1, input_size, flags, InputTextCallback, this)) { state->input_changed = true; } } @@ -341,13 +351,19 @@ int ImeDialogUi::InputTextCallback(ImGuiInputTextCallbackData* data) { ImeDialogUi* ui = static_cast(data->UserData); ASSERT(ui); + LOG_DEBUG(Lib_ImeDialog, ">> InputTextCallback: EventFlag={}, EventChar={}", data->EventFlag, + data->EventChar); + // Should we filter punctuation? 
if (ui->state->is_numeric && (data->EventChar < '0' || data->EventChar > '9') && data->EventChar != '\b' && data->EventChar != ',' && data->EventChar != '.') { + LOG_INFO(Lib_ImeDialog, "InputTextCallback: rejecting non-digit char '{}'", + static_cast(data->EventChar)); return 1; } if (!ui->state->keyboard_filter) { + LOG_DEBUG(Lib_ImeDialog, "InputTextCallback: no keyboard_filter, accepting char"); return 0; } @@ -367,16 +383,20 @@ int ImeDialogUi::InputTextCallback(ImGuiInputTextCallbackData* data) { }; if (!ui->state->ConvertUTF8ToOrbis(event_char, 4, &src_keycode.character, 1)) { - LOG_ERROR(Lib_ImeDialog, "Failed to convert orbis char to utf8"); + LOG_ERROR(Lib_ImeDialog, "InputTextCallback: ConvertUTF8ToOrbis failed"); return 0; } + LOG_DEBUG(Lib_ImeDialog, "InputTextCallback: converted to Orbis char={:#X}", + static_cast(src_keycode.character)); src_keycode.keycode = src_keycode.character; // TODO set this to the correct value u16 out_keycode; u32 out_status; - ui->state->CallKeyboardFilter(&src_keycode, &out_keycode, &out_status); - + bool keep = ui->state->CallKeyboardFilter(&src_keycode, &out_keycode, &out_status); + LOG_DEBUG(Lib_ImeDialog, + "InputTextCallback: CallKeyboardFilter returned %s (keycode=0x%X, status=0x%X)", + keep ? "true" : "false", out_keycode, out_status); // TODO. set the keycode return 0; diff --git a/src/core/libraries/ime/ime_ui.cpp b/src/core/libraries/ime/ime_ui.cpp index 37f25e200..c49c70ede 100644 --- a/src/core/libraries/ime/ime_ui.cpp +++ b/src/core/libraries/ime/ime_ui.cpp @@ -199,7 +199,7 @@ int ImeUi::InputTextCallback(ImGuiInputTextCallbackData* data) { eventParam.caret_index = data->CursorPos; eventParam.area_num = 1; - eventParam.text_area[0].mode = 1; // Edit mode + eventParam.text_area[0].mode = OrbisImeTextAreaMode::Edit; eventParam.text_area[0].index = data->CursorPos; eventParam.text_area[0].length = data->BufTextLen; From 2d1a2982df408d93f6d5c42e70183ed87db11616 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Tue, 8 Jul 2025 10:32:39 +0300 Subject: [PATCH 04/14] buffer_cache: Bring back upload batching and temporary buffer (#3211) * buffer_cache: Bring back upload batching and temporary buffer Because that PR fused the write and read protections under a single function call, it was a requirement to move the actual memory copy part inside the lambda to perform it before the read protection kicks in. However on certain large data transfers it had potential for data corruption. If, for example, an upload had two copies, a 400MB and a 300MB one, the first one would fit in the staging buffer, very likely with an induced stall. However the second one wouldn't have space to fit alongside the other data, but it's also small enough for the buffer to fit it, so the staging buffer would cause a flush and wait to copy it, overwriting the previous transfer. To address this the upload function has been reworked to allow for batching like previously but with the new locking behavior. Also the condition to use temporary buffers has been expanded to also include cases when staging buffer will stall, which should increase performance a little in some cases. 
* buffer_cache: Move buffer barriers and copy outside of lock range --- src/video_core/buffer_cache/buffer.cpp | 30 +++++++--- src/video_core/buffer_cache/buffer.h | 8 +-- src/video_core/buffer_cache/buffer_cache.cpp | 56 +++++++++++++++---- src/video_core/buffer_cache/buffer_cache.h | 3 + src/video_core/buffer_cache/memory_tracker.h | 28 +++++++--- src/video_core/renderer_vulkan/vk_scheduler.h | 6 +- 6 files changed, 96 insertions(+), 35 deletions(-) diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp index 15bf0d81e..e85a6eb18 100644 --- a/src/video_core/buffer_cache/buffer.cpp +++ b/src/video_core/buffer_cache/buffer.cpp @@ -137,12 +137,15 @@ StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& size_bytes); } -std::pair StreamBuffer::Map(u64 size, u64 alignment) { +std::pair StreamBuffer::Map(u64 size, u64 alignment, bool allow_wait) { if (!is_coherent && usage == MemoryUsage::Stream) { size = Common::AlignUp(size, instance->NonCoherentAtomSize()); } - ASSERT(size <= this->size_bytes); + if (size > this->size_bytes) { + return {nullptr, 0}; + } + mapped_size = size; if (alignment > 0) { @@ -162,8 +165,11 @@ std::pair StreamBuffer::Map(u64 size, u64 alignment) { } const u64 mapped_upper_bound = offset + size; - WaitPendingOperations(mapped_upper_bound); - return std::make_pair(mapped_data.data() + offset, offset); + if (!WaitPendingOperations(mapped_upper_bound, allow_wait)) { + return {nullptr, 0}; + } + + return {mapped_data.data() + offset, offset}; } void StreamBuffer::Commit() { @@ -177,6 +183,12 @@ void StreamBuffer::Commit() { } offset += mapped_size; + if (current_watch_cursor != 0 && + current_watches[current_watch_cursor].tick == scheduler->CurrentTick()) { + current_watches[current_watch_cursor].upper_bound = offset; + return; + } + if (current_watch_cursor + 1 >= current_watches.size()) { // Ensure that there are enough watches. ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK); @@ -191,16 +203,20 @@ void StreamBuffer::ReserveWatches(std::vector& watches, std::size_t grow_ watches.resize(watches.size() + grow_size); } -void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { +bool StreamBuffer::WaitPendingOperations(u64 requested_upper_bound, bool allow_wait) { if (!invalidation_mark) { - return; + return true; } while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) { auto& watch = previous_watches[wait_cursor]; - wait_bound = watch.upper_bound; + if (!scheduler->IsFree(watch.tick) && !allow_wait) { + return false; + } scheduler->Wait(watch.tick); + wait_bound = watch.upper_bound; ++wait_cursor; } + return true; } } // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index 530968787..a7a0ce84f 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -168,7 +168,7 @@ public: MemoryUsage usage, u64 size_bytes_); /// Reserves a region of memory from the stream buffer. - std::pair Map(u64 size, u64 alignment = 0); + std::pair Map(u64 size, u64 alignment = 0, bool allow_wait = true); /// Ensures that reserved bytes of memory are available to the GPU. void Commit(); @@ -181,10 +181,6 @@ public: return offset; } - u64 GetFreeSize() const { - return size_bytes - offset - mapped_size; - } - private: struct Watch { u64 tick{}; @@ -195,7 +191,7 @@ private: void ReserveWatches(std::vector& watches, std::size_t grow_size); /// Waits pending watches until requested upper bound. 
- void WaitPendingOperations(u64 requested_upper_bound); + bool WaitPendingOperations(u64 requested_upper_bound, bool allow_wait); private: u64 offset{}; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index d55e05d1e..8a7e99ea0 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -137,8 +137,7 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { return; } memory_tracker->InvalidateRegion( - device_addr, size, Config::readbacks(), - [this, device_addr, size] { ReadMemory(device_addr, size, true); }); + device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); }); } void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) { @@ -817,22 +816,22 @@ void BufferCache::ChangeRegister(BufferId buffer_id) { void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer) { boost::container::small_vector copies; + size_t total_size_bytes = 0; VAddr buffer_start = buffer.CpuAddr(); + vk::Buffer src_buffer = VK_NULL_HANDLE; memory_tracker->ForEachUploadRange( - device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) { - const u64 offset = staging_buffer.Copy(device_addr_out, range_size); - copies.push_back(vk::BufferCopy{ - .srcOffset = offset, - .dstOffset = device_addr_out - buffer_start, - .size = range_size, - }); - }); + device_addr, size, is_written, + [&](u64 device_addr_out, u64 range_size) { + copies.emplace_back(total_size_bytes, device_addr_out - buffer_start, range_size); + total_size_bytes += range_size; + }, + [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); }); SCOPE_EXIT { if (is_texel_buffer) { SynchronizeBufferFromImage(buffer, device_addr, size); } }; - if (copies.empty()) { + if (!src_buffer) { return; } scheduler.EndRendering(); @@ -861,7 +860,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, .bufferMemoryBarrierCount = 1, .pBufferMemoryBarriers = &pre_barrier, }); - cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies); + cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -869,6 +868,39 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, }); } +vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span copies, + size_t total_size_bytes) { + if (copies.empty()) { + return VK_NULL_HANDLE; + } + const auto [staging, offset] = staging_buffer.Map(total_size_bytes); + if (staging) { + for (auto& copy : copies) { + u8* const src_pointer = staging + copy.srcOffset; + const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; + std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); + // Apply the staging offset + copy.srcOffset += offset; + } + staging_buffer.Commit(); + return staging_buffer.Handle(); + } else { + // For large one time transfers use a temporary host buffer. 
+ auto temp_buffer = + std::make_unique(instance, scheduler, MemoryUsage::Upload, 0, + vk::BufferUsageFlagBits::eTransferSrc, total_size_bytes); + const vk::Buffer src_buffer = temp_buffer->Handle(); + u8* const staging = temp_buffer->mapped_data.data(); + for (const auto& copy : copies) { + u8* const src_pointer = staging + copy.srcOffset; + const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; + std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); + } + scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable { buffer.reset(); }); + return src_buffer; + } +} + bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) { boost::container::small_vector image_ids; texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) { diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 354d01431..b509ce2d0 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -194,6 +194,9 @@ private: void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer); + vk::Buffer UploadCopies(Buffer& buffer, std::span copies, + size_t total_size_bytes); + bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size); void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes); diff --git a/src/video_core/buffer_cache/memory_tracker.h b/src/video_core/buffer_cache/memory_tracker.h index ca87c7df0..ec0878c3b 100644 --- a/src/video_core/buffer_cache/memory_tracker.h +++ b/src/video_core/buffer_cache/memory_tracker.h @@ -62,17 +62,17 @@ public: } /// Removes all protection from a page and ensures GPU data has been flushed if requested - void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept { + void InvalidateRegion(VAddr cpu_addr, u64 size, auto&& on_flush) noexcept { IteratePages( - cpu_addr, size, - [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) { + cpu_addr, size, [&on_flush](RegionManager* manager, u64 offset, size_t size) { const bool should_flush = [&] { // Perform both the GPU modification check and CPU state change with the lock // in case we are racing with GPU thread trying to mark the page as GPU // modified. If we need to flush the flush function is going to perform CPU // state change. 
std::scoped_lock lk{manager->lock}; - if (try_flush && manager->template IsRegionModified(offset, size)) { + if (Config::readbacks() && + manager->template IsRegionModified(offset, size)) { return true; } manager->template ChangeRegionState( @@ -86,17 +86,27 @@ public: } /// Call 'func' for each CPU modified range and unmark those pages as CPU modified - void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) { + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func, + auto&& on_upload) { IteratePages(query_cpu_range, query_size, [&func, is_written](RegionManager* manager, u64 offset, size_t size) { - std::scoped_lock lk{manager->lock}; + manager->lock.lock(); manager->template ForEachModifiedRange( manager->GetCpuAddr() + offset, size, func); - if (is_written) { - manager->template ChangeRegionState( - manager->GetCpuAddr() + offset, size); + if (!is_written) { + manager->lock.unlock(); } }); + on_upload(); + if (!is_written) { + return; + } + IteratePages(query_cpu_range, query_size, + [&func, is_written](RegionManager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + manager->lock.unlock(); + }); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 36fd9c055..bd6fb549a 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -347,7 +347,11 @@ public: } /// Returns true when a tick has been triggered by the GPU. - [[nodiscard]] bool IsFree(u64 tick) const noexcept { + [[nodiscard]] bool IsFree(u64 tick) noexcept { + if (master_semaphore.IsFree(tick)) { + return true; + } + master_semaphore.Refresh(); return master_semaphore.IsFree(tick); } From e5f899aae3b895fc383019adba56859997a449fe Mon Sep 17 00:00:00 2001 From: kalaposfos13 <153381648+kalaposfos13@users.noreply.github.com> Date: Wed, 9 Jul 2025 03:38:28 +0200 Subject: [PATCH 05/14] Fix brace elision for designated initializer warning (#3215) --- src/core/libraries/ime/ime_dialog_ui.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/libraries/ime/ime_dialog_ui.cpp b/src/core/libraries/ime/ime_dialog_ui.cpp index 746a2c8d3..800ba1124 100644 --- a/src/core/libraries/ime/ime_dialog_ui.cpp +++ b/src/core/libraries/ime/ime_dialog_ui.cpp @@ -379,7 +379,7 @@ int ImeDialogUi::InputTextCallback(ImGuiInputTextCallbackData* data) { // the current language?) 
.user_id = ui->state->user_id, .resource_id = 0, - .timestamp = 0, + .timestamp = {0}, }; if (!ui->state->ConvertUTF8ToOrbis(event_char, 4, &src_keycode.character, 1)) { From df4314f8312aead1707a632db4e30552012eb171 Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Wed, 9 Jul 2025 03:39:51 +0200 Subject: [PATCH 06/14] Extend Qt detection to support multiple drives (#3209) --- cmake/DetectQtInstallation.cmake | 34 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cmake/DetectQtInstallation.cmake b/cmake/DetectQtInstallation.cmake index e95e8980f..650cc9745 100644 --- a/cmake/DetectQtInstallation.cmake +++ b/cmake/DetectQtInstallation.cmake @@ -1,14 +1,28 @@ # SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project # SPDX-License-Identifier: GPL-2.0-or-later -file(GLOB QT_KITS LIST_DIRECTORIES true "C:/Qt/*/msvc*_64") -list(SORT QT_KITS COMPARE NATURAL) -list(REVERSE QT_KITS) -if(QT_KITS) - list(GET QT_KITS 0 QT_PREFIX) - set(CMAKE_PREFIX_PATH "${QT_PREFIX}" CACHE PATH "Qt prefix auto‑detected" FORCE) - message(STATUS "Auto-detected Qt prefix: ${QT_PREFIX}") -else() - message(STATUS "findQt.cmake: no Qt‑Directory found in C:/Qt – please set CMAKE_PREFIX_PATH manually") -endif() +set(highest_version "0") +set(CANDIDATE_DRIVES A B C D E F G H I J K L M N O P Q R S T U V W X Y Z) +foreach(drive ${CANDIDATE_DRIVES}) + file(GLOB kits LIST_DIRECTORIES true CONFIGURE_DEPENDS "${drive}:/Qt/*/msvc*_64") + foreach(kit IN LISTS kits) + get_filename_component(version_dir "${kit}" DIRECTORY) + get_filename_component(kit_version "${version_dir}" NAME) + + message(STATUS "DetectQtInstallation.cmake: Detected Qt: ${kit}") + + if (kit_version VERSION_GREATER highest_version) + set(highest_version "${kit_version}") + set(QT_PREFIX "${kit}") + + endif() + endforeach() +endforeach() + +if(QT_PREFIX) + set(CMAKE_PREFIX_PATH "${QT_PREFIX}" CACHE PATH "Qt prefix auto‑detected" FORCE) + message(STATUS "DetectQtInstallation.cmake: Choose newest Qt: ${QT_PREFIX}") +else() + message(STATUS "DetectQtInstallation.cmake: No Qt‑Directory found in :/Qt – please set CMAKE_PREFIX_PATH manually") +endif() From f5336358ea5a2ae3a2bbcf857468d0d847698e25 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios Date: Wed, 9 Jul 2025 13:55:21 +0300 Subject: [PATCH 07/14] Zero top bits in INSERTQ/EXTRQ (#3217) * Zero top bits in INSERTQ/EXTRQ * Clang-format * Don't assert --- src/core/cpu_patches.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 8512858e9..e4f65cd31 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -163,7 +163,9 @@ static void GenerateEXTRQ(void* /* address */, const ZydisDecodedOperand* operan mask = (1ULL << length) - 1; } - ASSERT_MSG(length + index <= 64, "length + index must be less than or equal to 64."); + if (length + index > 64) { + mask = 0xFFFF'FFFF'FFFF'FFFF; + } // Get lower qword from xmm register c.vmovq(scratch1, xmm_dst); @@ -177,8 +179,8 @@ static void GenerateEXTRQ(void* /* address */, const ZydisDecodedOperand* operan c.mov(scratch2, mask); c.and_(scratch1, scratch2); - // Writeback to xmm register, extrq instruction says top 64-bits are undefined so we don't - // care to preserve them + // Writeback to xmm register, extrq instruction says top 64-bits are undefined but zeroed on + // AMD CPUs c.vmovq(xmm_dst, scratch1); c.pop(scratch2); @@ -287,7 +289,9 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper 
mask_value = (1ULL << length) - 1; } - ASSERT_MSG(length + index <= 64, "length + index must be less than or equal to 64."); + if (length + index > 64) { + mask_value = 0xFFFF'FFFF'FFFF'FFFF; + } c.vmovq(scratch1, xmm_src); c.vmovq(scratch2, xmm_dst); @@ -307,8 +311,9 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper // dst |= src c.or_(scratch2, scratch1); - // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected - c.vpinsrq(xmm_dst, xmm_dst, scratch2, 0); + // Insert scratch2 into low 64 bits of dst, upper 64 bits are undefined but zeroed on AMD + // CPUs + c.vmovq(xmm_dst, scratch2); c.pop(mask); c.pop(scratch2); @@ -374,7 +379,7 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper c.and_(scratch2, mask); c.or_(scratch2, scratch1); - // Upper 64 bits are undefined in insertq + // Upper 64 bits are undefined in insertq but AMD CPUs zero them c.vmovq(xmm_dst, scratch2); c.pop(mask); @@ -635,6 +640,7 @@ static bool TryExecuteIllegalInstruction(void* ctx, void* code_address) { lowQWordDst >>= index; lowQWordDst &= mask; + memset((u8*)dst + sizeof(u64), 0, sizeof(u64)); memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); Common::IncrementRip(ctx, 4); @@ -675,6 +681,7 @@ static bool TryExecuteIllegalInstruction(void* ctx, void* code_address) { lowQWordDst &= ~(mask << index); lowQWordDst |= lowQWordSrc << index; + memset((u8*)dst + sizeof(u64), 0, sizeof(u64)); memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); Common::IncrementRip(ctx, 4); From 7d4b875ee33c47abc4edaa5a2ddacfbb7d32f9d8 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Wed, 9 Jul 2025 17:00:06 +0300 Subject: [PATCH 08/14] Random fixes (#3216) * buffer_cache: Handle inline data to flexible memory * control_flow: Fix single instruction scopes edge case Fixes the following pattern v_cmpx_gt_u32 cond buffer_store_dword value .LABEL: Before buffer[index] = value; After if (cond) { buffer[index] = value; } * vector_memory: Handle soffset when offen is false When offen is not used we can substitute the offset argument with soffset and have it handled correctly * scalar_alu: Handle sharp moves with S_MOV_B64 This fixes unable to track sharp errors when this pattern is used in a shader * emulator: Add log * video_core: Bump binary info search range and buffer num --- src/emulator.cpp | 1 + .../backend/spirv/spirv_emit_context.cpp | 4 +++- .../frontend/control_flow_graph.cpp | 5 +++-- .../frontend/translate/scalar_alu.cpp | 9 +++++++++ .../frontend/translate/vector_memory.cpp | 12 +++++++++--- src/shader_recompiler/info.h | 2 +- src/video_core/amdgpu/liverpool.cpp | 2 ++ src/video_core/amdgpu/liverpool.h | 2 +- src/video_core/buffer_cache/buffer_cache.cpp | 5 ++++- 9 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/emulator.cpp b/src/emulator.cpp index 332287d22..480ceee0b 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -133,6 +133,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector ar LOG_INFO(Config, "General isNeo: {}", Config::isNeoModeConsole()); LOG_INFO(Config, "GPU isNullGpu: {}", Config::nullGpu()); LOG_INFO(Config, "GPU readbacks: {}", Config::readbacks()); + LOG_INFO(Config, "GPU readbackLinearImages: {}", Config::readbackLinearImages()); LOG_INFO(Config, "GPU directMemoryAccess: {}", Config::directMemoryAccess()); LOG_INFO(Config, "GPU shouldDumpShaders: {}", Config::dumpShaders()); LOG_INFO(Config, "GPU vblankDivider: {}", Config::vblankDiv()); diff --git 
a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 77336c9ec..fe489f1b6 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -700,7 +700,7 @@ void EmitContext::DefineOutputs() { void EmitContext::DefinePushDataBlock() { // Create push constants block for instance steps rates const Id struct_type{Name(TypeStruct(U32[1], U32[1], F32[1], F32[1], F32[1], F32[1], U32[4], - U32[4], U32[4], U32[4], U32[4], U32[4]), + U32[4], U32[4], U32[4], U32[4], U32[4], U32[2]), "AuxData")}; Decorate(struct_type, spv::Decoration::Block); MemberName(struct_type, PushData::Step0Index, "sr0"); @@ -715,6 +715,7 @@ void EmitContext::DefinePushDataBlock() { MemberName(struct_type, PushData::UdRegsIndex + 3, "ud_regs3"); MemberName(struct_type, PushData::BufOffsetIndex + 0, "buf_offsets0"); MemberName(struct_type, PushData::BufOffsetIndex + 1, "buf_offsets1"); + MemberName(struct_type, PushData::BufOffsetIndex + 2, "buf_offsets2"); MemberDecorate(struct_type, PushData::Step0Index, spv::Decoration::Offset, 0U); MemberDecorate(struct_type, PushData::Step1Index, spv::Decoration::Offset, 4U); MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 8U); @@ -727,6 +728,7 @@ void EmitContext::DefinePushDataBlock() { MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 72U); MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 88U); MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 104U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 120U); push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant); Name(push_data_block, "push_data"); interfaces.push_back(push_data_block); diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index b53db9e94..805fdb108 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -188,14 +188,15 @@ void CFG::SplitDivergenceScopes() { const bool is_close = is_close_scope(inst); if ((is_close || index == blk->end_index) && curr_begin != -1) { // If there are no instructions inside scope don't do anything. - if (index - curr_begin == 1) { + if (index - curr_begin == 1 && is_close) { curr_begin = -1; continue; } // If all instructions in the scope ignore exec masking, we shouldn't insert a // scope. const auto start = inst_list.begin() + curr_begin + 1; - if (!std::ranges::all_of(start, inst_list.begin() + index, IgnoresExecMask)) { + if (!std::ranges::all_of(start, inst_list.begin() + index + !is_close, + IgnoresExecMask)) { // Determine the first instruction affected by the exec mask. do { ++curr_begin; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 48f977f49..276b55567 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -586,6 +586,15 @@ void Translator::S_MOV(const GcnInst& inst) { } void Translator::S_MOV_B64(const GcnInst& inst) { + // Moving SGPR to SGPR is used for thread masks, like most operations, but it can also be used + // for moving sharps. 
+ if (inst.dst[0].field == OperandField::ScalarGPR && + inst.src[0].field == OperandField::ScalarGPR) { + ir.SetScalarReg(IR::ScalarReg(inst.dst[0].code), + ir.GetScalarReg(IR::ScalarReg(inst.src[0].code))); + ir.SetScalarReg(IR::ScalarReg(inst.dst[0].code + 1), + ir.GetScalarReg(IR::ScalarReg(inst.src[0].code + 1))); + } const IR::U1 src = [&] { switch (inst.src[0].field) { case OperandField::VccLo: diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 91f545cfd..68b619c0a 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -193,8 +193,8 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ const IR::ScalarReg sharp{inst.src[2].code * 4}; const IR::Value soffset{GetSrc(inst.src[3])}; if (info.stage != Stage::Geometry) { - ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, - "Non immediate offset not supported"); + ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0 || !mubuf.offen, + "Having both scalar and vector offsets is not supported"); } const IR::Value address = [&] -> IR::Value { @@ -204,15 +204,21 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ if (mubuf.idxen && mubuf.offen) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1)); } + if (mubuf.idxen && !soffset.IsImmediate()) { + return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset); + } if (mubuf.idxen || mubuf.offen) { return ir.GetVectorReg(vaddr); } + if (!soffset.IsImmediate()) { + return soffset; + } return {}; }(); IR::BufferInstInfo buffer_info{}; buffer_info.index_enable.Assign(mubuf.idxen); - buffer_info.offset_enable.Assign(mubuf.offen); + buffer_info.offset_enable.Assign(mubuf.offen || !soffset.IsImmediate()); buffer_info.inst_offset.Assign(mubuf.offset); buffer_info.globally_coherent.Assign(mubuf.glc); buffer_info.system_coherent.Assign(mubuf.slc); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 72977b711..9703643e8 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -25,7 +25,7 @@ namespace Shader { static constexpr size_t NumUserDataRegs = 16; static constexpr size_t NumImages = 64; -static constexpr size_t NumBuffers = 32; +static constexpr size_t NumBuffers = 40; static constexpr size_t NumSamplers = 16; static constexpr size_t NumFMasks = 8; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index e264de74a..3e66fba6a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -603,6 +603,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanevent_index.Value() == EventIndex::ZpassDone) { + LOG_WARNING(Render, "Unimplemented occlusion query"); } break; } diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 0613823ab..c07e9f63a 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -88,7 +88,7 @@ struct Liverpool { } }; - static const BinaryInfo& SearchBinaryInfo(const u32* code, size_t search_limit = 0x1000) { + static const BinaryInfo& SearchBinaryInfo(const u32* code, size_t search_limit = 0x2000) { constexpr u32 token_mov_vcchi = 0xBEEB03FF; if (code[0] == token_mov_vcchi) { diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 8a7e99ea0..c1110e54d 100644 --- 
a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -312,7 +312,10 @@ void BufferCache::BindIndexBuffer(u32 index_offset) { void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); if (!is_gds) { - ASSERT(memory->TryWriteBacking(std::bit_cast(address), value, num_bytes)); + if (!memory->TryWriteBacking(std::bit_cast(address), value, num_bytes)) { + std::memcpy(std::bit_cast(address), value, num_bytes); + return; + } if (!IsRegionRegistered(address, num_bytes)) { return; } From dc6ef99dc7e33bdd0d06393672bebf3df22285f0 Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Wed, 9 Jul 2025 17:02:08 +0300 Subject: [PATCH 09/14] vector_memory: Handle immediate but non zero offset too Signed-off-by: georgemoralis --- .../frontend/translate/vector_memory.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 68b619c0a..df20f7f73 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -192,8 +192,9 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; const IR::Value soffset{GetSrc(inst.src[3])}; + const bool has_soffset = !soffset.IsImmediate() || soffset.U32() != 0; if (info.stage != Stage::Geometry) { - ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0 || !mubuf.offen, + ASSERT_MSG(!has_soffset || !mubuf.offen, "Having both scalar and vector offsets is not supported"); } @@ -204,13 +205,13 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ if (mubuf.idxen && mubuf.offen) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1)); } - if (mubuf.idxen && !soffset.IsImmediate()) { + if (mubuf.idxen && has_soffset) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset); } if (mubuf.idxen || mubuf.offen) { return ir.GetVectorReg(vaddr); } - if (!soffset.IsImmediate()) { + if (has_soffset) { return soffset; } return {}; @@ -218,7 +219,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ IR::BufferInstInfo buffer_info{}; buffer_info.index_enable.Assign(mubuf.idxen); - buffer_info.offset_enable.Assign(mubuf.offen || !soffset.IsImmediate()); + buffer_info.offset_enable.Assign(mubuf.offen || has_soffset); buffer_info.inst_offset.Assign(mubuf.offset); buffer_info.globally_coherent.Assign(mubuf.glc); buffer_info.system_coherent.Assign(mubuf.slc); From 27cbd6647f76531807f578d35bffe3bc21d70145 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 13:38:50 +0300 Subject: [PATCH 10/14] shader_recompiler: Reorganize data share operations and implement GDS bit (#3222) * shader_recompiler: Reorganize data share operations and implement GDS bit * Review comments --- .../backend/spirv/emit_spirv_atomic.cpp | 60 +++- .../backend/spirv/emit_spirv_instructions.h | 10 + .../backend/spirv/spirv_emit_context.cpp | 1 + .../frontend/translate/data_share.cpp | 317 +++++++----------- .../frontend/translate/translate.h | 14 +- .../frontend/translate/vector_alu.cpp | 6 +- src/shader_recompiler/ir/ir_emitter.cpp | 115 +++++-- src/shader_recompiler/ir/ir_emitter.h | 
28 +- src/shader_recompiler/ir/microinstruction.cpp | 12 +- src/shader_recompiler/ir/opcodes.inc | 10 + .../ir/passes/resource_tracking_pass.cpp | 179 ++++++++-- .../ir/passes/shader_info_collection_pass.cpp | 10 + .../ir/passes/shared_memory_simplify_pass.cpp | 10 + .../passes/shared_memory_to_storage_pass.cpp | 42 ++- src/video_core/buffer_cache/buffer_cache.cpp | 2 + 15 files changed, 525 insertions(+), 291 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index e37acb2e4..80c8b836b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -54,17 +54,23 @@ Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, }); } +Id SharedAtomicU64IncDec(EmitContext& ctx, Id offset, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { + const Id shift_id{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + const Id pointer{ctx.EmitSharedMemoryAccess(ctx.shared_u64, ctx.shared_memory_u64, index)}; + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics); + }); +} + template Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& buffer = ctx.buffers[handle]; - const auto type = [&] { - if constexpr (is_float) { - return ctx.F32[1]; - } else { - return ctx.U32[1]; - } - }(); + const Id type = is_float ? ctx.F32[1] : ctx.U32[1]; if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { address = ctx.OpIAdd(ctx.U32[1], address, offset); } @@ -148,42 +154,82 @@ Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax); } +Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMax); +} + Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMax); } +Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMax); +} + Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMin); } +Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMin); +} + Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMin); } +Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMin); +} + Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicAnd); } +Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicAnd); +} + Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicOr); } +Id 
EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicOr); +} + Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicXor); } +Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicXor); +} + Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicISub); } +Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicISub); +} + Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset) { return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); } +Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset) { + return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); +} + Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset) { return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); } +Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset) { + return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); +} + Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 6e146c5f6..8a0c586e9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -139,15 +139,25 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset); Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset); Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value); Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); Id EmitCompositeConstructU32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp 
b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index fe489f1b6..6a731d05c 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -76,6 +76,7 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf } else { SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450); } + String(fmt::format("{:#x}", info.pgm_hash)); AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 8ead93f78..634486fc4 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -3,7 +3,6 @@ #include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/ir/reg.h" -#include "shader_recompiler/profile.h" #include "shader_recompiler/runtime_info.h" namespace Shader::Gcn { @@ -12,29 +11,29 @@ void Translator::EmitDataShare(const GcnInst& inst) { switch (inst.opcode) { // DS case Opcode::DS_ADD_U32: - return DS_ADD_U32(inst, false); + return DS_OP(inst, AtomicOp::Add, false); case Opcode::DS_ADD_U64: - return DS_ADD_U64(inst, false); + return DS_OP(inst, AtomicOp::Add, false); case Opcode::DS_SUB_U32: - return DS_SUB_U32(inst, false); + return DS_OP(inst, AtomicOp::Sub, false); case Opcode::DS_INC_U32: - return DS_INC_U32(inst, false); + return DS_OP(inst, AtomicOp::Inc, false); case Opcode::DS_DEC_U32: - return DS_DEC_U32(inst, false); + return DS_OP(inst, AtomicOp::Dec, false); case Opcode::DS_MIN_I32: - return DS_MIN_U32(inst, true, false); + return DS_OP(inst, AtomicOp::Smin, false); case Opcode::DS_MAX_I32: - return DS_MAX_U32(inst, true, false); + return DS_OP(inst, AtomicOp::Smax, false); case Opcode::DS_MIN_U32: - return DS_MIN_U32(inst, false, false); + return DS_OP(inst, AtomicOp::Umin, false); case Opcode::DS_MAX_U32: - return DS_MAX_U32(inst, false, false); + return DS_OP(inst, AtomicOp::Umax, false); case Opcode::DS_AND_B32: - return DS_AND_B32(inst, false); + return DS_OP(inst, AtomicOp::And, false); case Opcode::DS_OR_B32: - return DS_OR_B32(inst, false); + return DS_OP(inst, AtomicOp::Or, false); case Opcode::DS_XOR_B32: - return DS_XOR_B32(inst, false); + return DS_OP(inst, AtomicOp::Xor, false); case Opcode::DS_WRITE_B32: return DS_WRITE(32, false, false, false, inst); case Opcode::DS_WRITE2_B32: @@ -42,19 +41,19 @@ void Translator::EmitDataShare(const GcnInst& inst) { case Opcode::DS_WRITE2ST64_B32: return DS_WRITE(32, false, true, true, inst); case Opcode::DS_ADD_RTN_U32: - return DS_ADD_U32(inst, true); + return DS_OP(inst, AtomicOp::Add, true); case Opcode::DS_SUB_RTN_U32: - return DS_SUB_U32(inst, true); + return DS_OP(inst, AtomicOp::Sub, true); case Opcode::DS_MIN_RTN_U32: - return DS_MIN_U32(inst, false, true); + return DS_OP(inst, AtomicOp::Umin, true); case Opcode::DS_MAX_RTN_U32: - return DS_MAX_U32(inst, false, true); + return DS_OP(inst, AtomicOp::Umax, true); case Opcode::DS_AND_RTN_B32: - return DS_AND_B32(inst, true); + return DS_OP(inst, AtomicOp::And, true); case Opcode::DS_OR_RTN_B32: - return DS_OR_B32(inst, true); + return DS_OP(inst, AtomicOp::Or, true); case Opcode::DS_XOR_RTN_B32: - return DS_XOR_B32(inst, true); + return DS_OP(inst, AtomicOp::Xor, true); case Opcode::DS_SWIZZLE_B32: return DS_SWIZZLE_B32(inst); case Opcode::DS_READ_B32: @@ -117,92 +116,63 @@ void Translator::V_WRITELANE_B32(const GcnInst& 
inst) { // DS -void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) { +template +void Translator::DS_OP(const GcnInst& inst, AtomicOp op, bool rtn) { + const bool is_gds = inst.control.ds.gds; const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; + const T data = [&] { + if (op == AtomicOp::Inc || op == AtomicOp::Dec) { + return T{}; + } + if constexpr (std::is_same_v) { + return GetSrc(inst.src[1]); + } else { + return GetSrc64(inst.src[1]); + } + }(); const IR::U32 offset = ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data); + const T original_val = [&] -> T { + switch (op) { + case AtomicOp::Add: + return ir.SharedAtomicIAdd(addr_offset, data, is_gds); + case AtomicOp::Umin: + return ir.SharedAtomicIMin(addr_offset, data, false, is_gds); + case AtomicOp::Smin: + return ir.SharedAtomicIMin(addr_offset, data, true, is_gds); + case AtomicOp::Umax: + return ir.SharedAtomicIMax(addr_offset, data, false, is_gds); + case AtomicOp::Smax: + return ir.SharedAtomicIMax(addr_offset, data, true, is_gds); + case AtomicOp::And: + return ir.SharedAtomicAnd(addr_offset, data, is_gds); + case AtomicOp::Or: + return ir.SharedAtomicOr(addr_offset, data, is_gds); + case AtomicOp::Xor: + return ir.SharedAtomicXor(addr_offset, data, is_gds); + case AtomicOp::Sub: + return ir.SharedAtomicISub(addr_offset, data, is_gds); + case AtomicOp::Inc: + return ir.SharedAtomicInc(addr_offset, is_gds); + case AtomicOp::Dec: + return ir.SharedAtomicDec(addr_offset, is_gds); + default: + UNREACHABLE(); + } + }(); if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_ADD_U64(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U64 data{GetSrc64(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data); - if (rtn) { - SetDst64(inst.dst[0], IR::U64{original_val}); - } -} - -void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, is_signed); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, is_signed); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_AND_B32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicAnd(addr_offset, data); - if (rtn) { - 
SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_OR_B32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicOr(addr_offset, data); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_XOR_B32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicXor(addr_offset, data); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); + if constexpr (std::is_same_v) { + SetDst(inst.dst[0], original_val); + } else { + SetDst64(inst.dst[0], original_val); + } } } void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst) { + const bool is_gds = inst.control.ds.gds; const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; @@ -220,33 +190,85 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1))), - addr0); + addr0, is_gds); } else if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + ir.WriteShared(32, ir.GetVectorReg(data0), addr0, is_gds); } else if (bit_size == 16) { - ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); if (bit_size == 64) { ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1))), - addr1); + addr1, is_gds); } else if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + ir.WriteShared(32, ir.GetVectorReg(data1), addr1, is_gds); } else if (bit_size == 16) { - ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1); + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1, is_gds); } } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); if (bit_size == 64) { const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); + ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0, is_gds); } else if (bit_size == 32) { - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0, is_gds); } else if (bit_size == 16) { - ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); + ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds); + } + } +} + +void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, + const GcnInst& inst) { + const bool is_gds = inst.control.ds.gds; + const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; + IR::VectorReg dst_reg{inst.dst[0].code}; + const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; + if (info.stage == 
Stage::Fragment) { + ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, + "Unexpected shared memory offset alignment: {}", offset); + ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset))); + return; + } + if (is_pair) { + // Pair loads are either 32 or 64-bit + const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); + const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0, is_gds); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data0}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data0}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); + } + const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); + const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1, is_gds); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data1}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data1}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); + } + } else { + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0, is_gds); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data}); + ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg, IR::U32{data}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); } } } @@ -263,91 +285,6 @@ void Translator::DS_SWIZZLE_B32(const GcnInst& inst) { SetDst(inst.dst[0], ir.QuadShuffle(src, index)); } -void Translator::DS_INC_U32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicInc(addr_offset); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_DEC_U32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicDec(addr_offset); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_SUB_U32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicISub(addr_offset, data); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, - const GcnInst& inst) { - const IR::U32 
addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; - IR::VectorReg dst_reg{inst.dst[0].code}; - const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; - if (info.stage == Stage::Fragment) { - ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, - "Unexpected shared memory offset alignment: {}", offset); - ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset))); - return; - } - if (is_pair) { - // Pair loads are either 32 or 64-bit - const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); - const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data0}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data0}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); - } - const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); - const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data1}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data1}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); - } - } else { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data}); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg, IR::U32{data}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); - } - } -} - void Translator::DS_APPEND(const GcnInst& inst) { const u32 inst_offset = (u32(inst.control.ds.offset1) << 8u) + inst.control.ds.offset0; const IR::U32 gds_offset = ir.IAdd(ir.GetM0(), ir.Imm32(inst_offset)); diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index b5bfec344..4b5ff827b 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -270,21 +270,13 @@ public: // Data share // DS - void DS_ADD_U32(const GcnInst& inst, bool rtn); - void DS_ADD_U64(const GcnInst& inst, bool rtn); - void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn); - void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn); + template + void DS_OP(const GcnInst& inst, AtomicOp op, bool rtn); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); - void DS_SWIZZLE_B32(const GcnInst& inst); - void DS_AND_B32(const GcnInst& inst, bool rtn); - void DS_OR_B32(const GcnInst& inst, bool rtn); - void DS_XOR_B32(const GcnInst& inst, bool rtn); void DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); + void DS_SWIZZLE_B32(const GcnInst& inst); void DS_APPEND(const GcnInst& inst); void DS_CONSUME(const GcnInst& inst); - 
void DS_SUB_U32(const GcnInst& inst, bool rtn); - void DS_INC_U32(const GcnInst& inst, bool rtn); - void DS_DEC_U32(const GcnInst& inst, bool rtn); // Buffer Memory // MUBUF / MTBUF diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 74c7ec601..017c77fb0 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -565,7 +565,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { } // v_mbcnt_hi_u32_b32 vX, exec_hi, 0/vZ if ((inst.src[0].field == OperandField::ExecHi || - inst.src[0].field == OperandField::VccHi) && + inst.src[0].field == OperandField::VccHi || + inst.src[0].field == OperandField::ScalarGPR) && (inst.src[1].field == OperandField::ConstZero || inst.src[1].field == OperandField::VectorGPR)) { return SetDst(inst.dst[0], GetSrc(inst.src[1])); @@ -579,7 +580,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { } // v_mbcnt_lo_u32_b32 vY, exec_lo, vX // used combined with above for append buffer indexing. - if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo) { + if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo || + inst.src[0].field == OperandField::ScalarGPR) { return SetDst(inst.dst[0], GetSrc(inst.src[1])); } UNREACHABLE(); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 2334777ed..b88e1a17d 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -291,78 +291,137 @@ void IREmitter::SetPatch(Patch patch, const F32& value) { Inst(Opcode::SetPatch, patch, value); } -Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { +Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset, bool is_gds) { switch (bit_size) { case 16: - return Inst(Opcode::LoadSharedU16, offset); + return Inst(Opcode::LoadSharedU16, Flags{is_gds}, offset); case 32: - return Inst(Opcode::LoadSharedU32, offset); + return Inst(Opcode::LoadSharedU32, Flags{is_gds}, offset); case 64: - return Inst(Opcode::LoadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, Flags{is_gds}, offset); default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } } -void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) { +void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds) { switch (bit_size) { case 16: - Inst(Opcode::WriteSharedU16, offset, value); + Inst(Opcode::WriteSharedU16, Flags{is_gds}, offset, value); break; case 32: - Inst(Opcode::WriteSharedU32, offset, value); + Inst(Opcode::WriteSharedU32, Flags{is_gds}, offset, value); break; case 64: - Inst(Opcode::WriteSharedU64, offset, value); + Inst(Opcode::WriteSharedU64, Flags{is_gds}, offset, value); break; default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } } -U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) { +U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds) { switch (data.Type()) { case Type::U32: - return Inst(Opcode::SharedAtomicIAdd32, address, data); + return Inst(Opcode::SharedAtomicIAdd32, Flags{is_gds}, address, data); case Type::U64: - return Inst(Opcode::SharedAtomicIAdd64, address, data); + return Inst(Opcode::SharedAtomicIAdd64, Flags{is_gds}, address, data); default: ThrowInvalidType(data.Type()); } } -U32 
IREmitter::SharedAtomicIMin(const U32& address, const U32& data, bool is_signed) { - return is_signed ? Inst(Opcode::SharedAtomicSMin32, address, data) - : Inst(Opcode::SharedAtomicUMin32, address, data); +U32U64 IREmitter::SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed, + bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SharedAtomicSMin32 : Opcode::SharedAtomicUMin32, + Flags{is_gds}, address, data); + case Type::U64: + return Inst(is_signed ? Opcode::SharedAtomicSMin64 : Opcode::SharedAtomicUMin64, + Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicIMax(const U32& address, const U32& data, bool is_signed) { - return is_signed ? Inst(Opcode::SharedAtomicSMax32, address, data) - : Inst(Opcode::SharedAtomicUMax32, address, data); +U32U64 IREmitter::SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed, + bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SharedAtomicSMax32 : Opcode::SharedAtomicUMax32, + Flags{is_gds}, address, data); + case Type::U64: + return Inst(is_signed ? Opcode::SharedAtomicSMax64 : Opcode::SharedAtomicUMax64, + Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicAnd(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicAnd32, address, data); +U32U64 IREmitter::SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicAnd32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicAnd64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicOr(const U32& address, const U32& data) { +U32U64 IREmitter::SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicOr32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicOr64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } return Inst(Opcode::SharedAtomicOr32, address, data); } -U32 IREmitter::SharedAtomicXor(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicXor32, address, data); +U32U64 IREmitter::SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicXor32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicXor64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicInc(const U32& address) { - return Inst(Opcode::SharedAtomicInc32, address); +U32U64 IREmitter::SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicISub32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicISub64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicDec(const U32& address) { - return Inst(Opcode::SharedAtomicDec32, address); +template <> +U32 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicInc32, Flags{is_gds}, address); } -U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicISub32,
address, data); +template <> +U64 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicInc64, Flags{is_gds}, address); +} + +template <> +U32 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicDec32, Flags{is_gds}, address); +} + +template <> +U64 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicDec64, Flags{is_gds}, address); } U32 IREmitter::ReadConst(const Value& base, const U32& offset) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 1c5a8f545..d9e5aab7a 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -96,18 +96,24 @@ public: [[nodiscard]] F32 GetPatch(Patch patch); void SetPatch(Patch patch, const F32& value); - [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); - void WriteShared(int bit_size, const Value& value, const U32& offset); + [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset, + bool is_gds = false); + void WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds = false); - [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data); - [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed); - [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed); - [[nodiscard]] U32 SharedAtomicInc(const U32& address); - [[nodiscard]] U32 SharedAtomicDec(const U32& address); - [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data); + [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed, + bool is_gds); + [[nodiscard]] U32U64 SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed, + bool is_gds); + [[nodiscard]] U32U64 SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds); + + template + [[nodiscard]] T SharedAtomicInc(const U32& address, bool is_gds); + template + [[nodiscard]] T SharedAtomicDec(const U32& address, bool is_gds); [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 84bdb5739..eaab05cb7 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -92,7 +92,6 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::WriteSharedU32: case Opcode::WriteSharedU64: case Opcode::SharedAtomicIAdd32: - case Opcode::SharedAtomicIAdd64: case Opcode::SharedAtomicISub32: case Opcode::SharedAtomicSMin32: case Opcode::SharedAtomicUMin32: @@ -103,6 +102,17 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::SharedAtomicAnd32: case Opcode::SharedAtomicOr32: case 
Opcode::SharedAtomicXor32: + case Opcode::SharedAtomicIAdd64: + case Opcode::SharedAtomicISub64: + case Opcode::SharedAtomicSMin64: + case Opcode::SharedAtomicUMin64: + case Opcode::SharedAtomicSMax64: + case Opcode::SharedAtomicUMax64: + case Opcode::SharedAtomicInc64: + case Opcode::SharedAtomicDec64: + case Opcode::SharedAtomicAnd64: + case Opcode::SharedAtomicOr64: + case Opcode::SharedAtomicXor64: case Opcode::ImageWrite: case Opcode::ImageAtomicIAdd32: case Opcode::ImageAtomicSMin32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 553e63f3e..08dcec458 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -41,15 +41,25 @@ OPCODE(WriteSharedU64, Void, U32, OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) OPCODE(SharedAtomicISub32, U32, U32, U32, ) +OPCODE(SharedAtomicISub64, U64, U32, U64, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) +OPCODE(SharedAtomicSMin64, U64, U32, U64, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) +OPCODE(SharedAtomicUMin64, U64, U32, U64, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) +OPCODE(SharedAtomicSMax64, U64, U32, U64, ) OPCODE(SharedAtomicUMax32, U32, U32, U32, ) +OPCODE(SharedAtomicUMax64, U64, U32, U64, ) OPCODE(SharedAtomicInc32, U32, U32, ) +OPCODE(SharedAtomicInc64, U64, U32, ) OPCODE(SharedAtomicDec32, U32, U32, ) +OPCODE(SharedAtomicDec64, U64, U32, ) OPCODE(SharedAtomicAnd32, U32, U32, U32, ) +OPCODE(SharedAtomicAnd64, U64, U32, U64, ) OPCODE(SharedAtomicOr32, U32, U32, U32, ) +OPCODE(SharedAtomicOr64, U64, U32, U64, ) OPCODE(SharedAtomicXor32, U32, U32, U32, ) +OPCODE(SharedAtomicXor64, U64, U32, U64, ) // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index f3972769c..e5a4beb8b 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -84,8 +84,42 @@ bool IsBufferInstruction(const IR::Inst& inst) { } bool IsDataRingInstruction(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::DataAppend || - inst.GetOpcode() == IR::Opcode::DataConsume; + switch (inst.GetOpcode()) { + case IR::Opcode::DataAppend: + case IR::Opcode::DataConsume: + return true; + case IR::Opcode::LoadSharedU16: + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMin32: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr32: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicXor64: + case IR::Opcode::SharedAtomicISub32: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicInc32: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec32: + case IR::Opcode::SharedAtomicDec64: + return inst.Flags(); // is_gds + default: + return false; + } } IR::Type BufferDataType(const 
IR::Inst& inst, AmdGpu::NumberFormat num_format) { @@ -507,7 +541,8 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& } } -void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { +void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info, + Descriptors& descriptors) { const u32 binding = descriptors.Add(BufferResource{ .used_types = IR::Type::U32, .inline_cbuf = AmdGpu::Buffer::Null(), @@ -515,37 +550,111 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto .is_written = true, }); - const auto pred = [](const IR::Inst* inst) -> std::optional { - if (inst->GetOpcode() == IR::Opcode::GetUserData) { - return inst; - } - return std::nullopt; - }; - - // Attempt to deduce the GDS address of counter at compile time. - u32 gds_addr = 0; - const IR::Value& gds_offset = inst.Arg(0); - if (gds_offset.IsImmediate()) { - // Nothing to do, offset is known. - gds_addr = gds_offset.U32() & 0xFFFF; - } else { - const auto result = IR::BreadthFirstSearch(&inst, pred); - ASSERT_MSG(result, "Unable to track M0 source"); - - // M0 must be set by some user data register. - const IR::Inst* prod = gds_offset.InstRecursive(); - const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg()); - u32 m0_val = info.user_data[ud_reg] >> 16; - if (prod->GetOpcode() == IR::Opcode::IAdd32) { - m0_val += prod->Arg(1).U32(); - } - gds_addr = m0_val & 0xFFFF; - } - - // Patch instruction. IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - inst.SetArg(0, ir.Imm32(gds_addr >> 2)); - inst.SetArg(1, ir.Imm32(binding)); + + // For data append/consume operations attempt to deduce the GDS address. + if (inst.GetOpcode() == IR::Opcode::DataAppend || inst.GetOpcode() == IR::Opcode::DataConsume) { + const auto pred = [](const IR::Inst* inst) -> std::optional { + if (inst->GetOpcode() == IR::Opcode::GetUserData) { + return inst; + } + return std::nullopt; + }; + + u32 gds_addr = 0; + const IR::Value& gds_offset = inst.Arg(0); + if (gds_offset.IsImmediate()) { + // Nothing to do, offset is known. + gds_addr = gds_offset.U32() & 0xFFFF; + } else { + const auto result = IR::BreadthFirstSearch(&inst, pred); + ASSERT_MSG(result, "Unable to track M0 source"); + + // M0 must be set by some user data register. + const IR::Inst* prod = gds_offset.InstRecursive(); + const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg()); + u32 m0_val = info.user_data[ud_reg] >> 16; + if (prod->GetOpcode() == IR::Opcode::IAdd32) { + m0_val += prod->Arg(1).U32(); + } + gds_addr = m0_val & 0xFFFF; + } + + // Patch instruction. + inst.SetArg(0, ir.Imm32(gds_addr >> 2)); + inst.SetArg(1, ir.Imm32(binding)); + } else { + // Convert shared memory opcode to storage buffer atomic to GDS buffer. 
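// Editorial sketch (not part of the patch; the helper name and values here are illustrative
// assumptions): the conversion that follows rescales the DS byte offset into an element index
// of the GDS-backed storage buffer, matching address_words (>>1), address_dwords (>>2) and
// address_qwords (>>3) in the code below.
#include <cstdint>
#include <cstdio>

// Returns the element index for a given byte offset and element size in bytes (2, 4 or 8).
constexpr std::uint32_t GdsElementIndex(std::uint32_t byte_offset, std::uint32_t element_size) {
    return element_size == 2   ? byte_offset >> 1
           : element_size == 4 ? byte_offset >> 2
                               : byte_offset >> 3;
}

static_assert(GdsElementIndex(12, 4) == 3); // byte offset 12 is dword 3
static_assert(GdsElementIndex(16, 8) == 2); // byte offset 16 is qword 2

int main() {
    std::printf("byte offset 12 -> dword index %u\n",
                static_cast<unsigned>(GdsElementIndex(12, 4)));
    return 0;
}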
+ const IR::U32 offset = IR::U32{inst.Arg(0)}; + const IR::U32 address_words = ir.ShiftRightLogical(offset, ir.Imm32(1)); + const IR::U32 address_dwords = ir.ShiftRightLogical(offset, ir.Imm32(2)); + const IR::U32 address_qwords = ir.ShiftRightLogical(offset, ir.Imm32(3)); + const IR::U32 handle = ir.Imm32(binding); + switch (inst.GetOpcode()) { + case IR::Opcode::SharedAtomicIAdd32: + inst.ReplaceUsesWith(ir.BufferAtomicIAdd(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicIAdd64: + inst.ReplaceUsesWith( + ir.BufferAtomicIAdd(handle, address_qwords, IR::U64{inst.Arg(1)}, {})); + break; + case IR::Opcode::SharedAtomicISub32: + inst.ReplaceUsesWith(ir.BufferAtomicISub(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicSMin32: + case IR::Opcode::SharedAtomicUMin32: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; + inst.ReplaceUsesWith( + ir.BufferAtomicIMin(handle, address_dwords, inst.Arg(1), is_signed, {})); + break; + } + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicUMax32: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; + inst.ReplaceUsesWith( + ir.BufferAtomicIMax(handle, address_dwords, inst.Arg(1), is_signed, {})); + break; + } + case IR::Opcode::SharedAtomicInc32: + inst.ReplaceUsesWith(ir.BufferAtomicInc(handle, address_dwords, {})); + break; + case IR::Opcode::SharedAtomicDec32: + inst.ReplaceUsesWith(ir.BufferAtomicDec(handle, address_dwords, {})); + break; + case IR::Opcode::SharedAtomicAnd32: + inst.ReplaceUsesWith(ir.BufferAtomicAnd(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicOr32: + inst.ReplaceUsesWith(ir.BufferAtomicOr(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicXor32: + inst.ReplaceUsesWith(ir.BufferAtomicXor(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::LoadSharedU16: + inst.ReplaceUsesWith(ir.LoadBufferU16(handle, address_words, {})); + break; + case IR::Opcode::LoadSharedU32: + inst.ReplaceUsesWith(ir.LoadBufferU32(1, handle, address_dwords, {})); + break; + case IR::Opcode::LoadSharedU64: + inst.ReplaceUsesWith(ir.LoadBufferU64(handle, address_qwords, {})); + break; + case IR::Opcode::WriteSharedU16: + ir.StoreBufferU16(handle, address_words, IR::U16{inst.Arg(1)}, {}); + inst.Invalidate(); + break; + case IR::Opcode::WriteSharedU32: + ir.StoreBufferU32(1, handle, address_dwords, inst.Arg(1), {}); + inst.Invalidate(); + break; + case IR::Opcode::WriteSharedU64: + ir.StoreBufferU64(handle, address_qwords, IR::U64{inst.Arg(1)}, {}); + inst.Invalidate(); + break; + default: + UNREACHABLE(); + } + } } IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info, @@ -916,8 +1025,6 @@ void ResourceTrackingPass(IR::Program& program) { PatchBufferSharp(*block, inst, info, descriptors); } else if (IsImageInstruction(inst)) { PatchImageSharp(*block, inst, info, descriptors); - } else if (IsDataRingInstruction(inst)) { - PatchDataRingAccess(*block, inst, info, descriptors); } } } @@ -929,6 +1036,8 @@ void ResourceTrackingPass(IR::Program& program) { PatchBufferArgs(*block, inst, info); } else if (IsImageInstruction(inst)) { PatchImageArgs(*block, inst, info); + } else if (IsDataRingInstruction(inst)) { + PatchGlobalDataShareAccess(*block, inst, info, descriptors); } } } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp 
b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index a87dceb0a..079827866 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -55,6 +55,16 @@ void Visit(Info& info, const IR::Inst& inst) { info.shared_types |= IR::Type::U32; break; case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: info.uses_shared_int64_atomics = true; [[fallthrough]]; case IR::Opcode::LoadSharedU64: diff --git a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp index 0f80a3b28..555fd505b 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp @@ -15,6 +15,16 @@ static bool Requires16BitSharedAtomic(const IR::Inst& inst) { static bool Requires64BitSharedAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: return true; default: return false; diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index a6900e180..b84011acc 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -17,7 +17,6 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicIAdd32: - case IR::Opcode::SharedAtomicIAdd64: case IR::Opcode::SharedAtomicISub32: case IR::Opcode::SharedAtomicSMin32: case IR::Opcode::SharedAtomicUMin32: @@ -28,6 +27,17 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: return true; default: return false; @@ -64,6 +74,16 @@ IR::Type CalculateSharedMemoryTypes(IR::Program& program) { case IR::Opcode::LoadSharedU64: case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case 
IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: used_types |= IR::Type::U64; break; default: @@ -119,19 +139,26 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicISub32: + case IR::Opcode::SharedAtomicISub64: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicISub(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicSMin32: - case IR::Opcode::SharedAtomicUMin32: { - const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; + case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32 || + inst.GetOpcode() == IR::Opcode::SharedAtomicSMin64; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {})); continue; } case IR::Opcode::SharedAtomicSMax32: - case IR::Opcode::SharedAtomicUMax32: { - const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32 || + inst.GetOpcode() == IR::Opcode::SharedAtomicSMax64; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {})); continue; @@ -143,12 +170,15 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {})); continue; case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicAnd64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicOr32: + case IR::Opcode::SharedAtomicOr64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicXor64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::LoadSharedU16: @@ -173,7 +203,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ inst.Invalidate(); break; default: - break; + UNREACHABLE(); } } } diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index c1110e54d..28444ac60 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -48,6 +48,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s memory_tracker = std::make_unique(tracker); + std::memset(gds_buffer.mapped_data.data(), 0, DataShareBufferSize); + // Ensure the first slot is used for the null buffer const auto null_id = slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16); From ee97c5c1109d3a09a964f8ccbae12389ac4efdb4 Mon Sep 17 00:00:00 2001 From: kalaposfos13 <153381648+kalaposfos13@users.noreply.github.com> Date: Thu, 10 Jul 2025 12:53:38 +0200 Subject: [PATCH 11/14] Define S_TRAP as InstCategory::FlowControl (#3223) --- src/shader_recompiler/frontend/format.cpp | 2 +- src/shader_recompiler/frontend/translate/scalar_flow.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/shader_recompiler/frontend/format.cpp b/src/shader_recompiler/frontend/format.cpp index 
52c8c733e..6c4427e5f 100644 --- a/src/shader_recompiler/frontend/format.cpp +++ b/src/shader_recompiler/frontend/format.cpp @@ -397,7 +397,7 @@ constexpr std::array InstructionFormatSOPP = {{ // 17 = S_SENDMSGHALT {InstClass::ScalarProgFlow, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any}, // 18 = S_TRAP - {InstClass::Undefined, InstCategory::Undefined, 0, 1, ScalarType::Any, ScalarType::Any}, + {InstClass::Undefined, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any}, // 19 = S_ICACHE_INV {InstClass::ScalarCache, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any}, // 20 = S_INCPERFLEVEL diff --git a/src/shader_recompiler/frontend/translate/scalar_flow.cpp b/src/shader_recompiler/frontend/translate/scalar_flow.cpp index cd1cf51f0..7b57d89ca 100644 --- a/src/shader_recompiler/frontend/translate/scalar_flow.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_flow.cpp @@ -16,6 +16,9 @@ void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) { case Opcode::S_SETPRIO: LOG_WARNING(Render_Vulkan, "S_SETPRIO instruction!"); return; + case Opcode::S_TRAP: + LOG_WARNING(Render_Vulkan, "S_TRAP instruction!"); + return; case Opcode::S_GETPC_B64: return S_GETPC_B64(pc, inst); case Opcode::S_SETPC_B64: From 88abb93669a560ab8914968d4696f6011317babf Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 14:19:44 +0300 Subject: [PATCH 12/14] ir_passes: Fold readlane with ff1 pattern (#3224) --- .../backend/spirv/emit_spirv_instructions.h | 2 +- .../backend/spirv/emit_spirv_warp.cpp | 5 ++--- .../ir/passes/readlane_elimination_pass.cpp | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 8a0c586e9..f3dd9b2ea 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -529,7 +529,7 @@ Id EmitLaneId(EmitContext& ctx); Id EmitWarpId(EmitContext& ctx); Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); Id EmitReadFirstLane(EmitContext& ctx, Id value); -Id EmitReadLane(EmitContext& ctx, Id value, u32 lane); +Id EmitReadLane(EmitContext& ctx, Id value, Id lane); Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane); Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding); Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 2d13d09f0..20fb83fa6 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -26,9 +26,8 @@ Id EmitReadFirstLane(EmitContext& ctx, Id value) { return ctx.OpGroupNonUniformBroadcastFirst(ctx.U32[1], SubgroupScope(ctx), value); } -Id EmitReadLane(EmitContext& ctx, Id value, u32 lane) { - return ctx.OpGroupNonUniformBroadcast(ctx.U32[1], SubgroupScope(ctx), value, - ctx.ConstU32(lane)); +Id EmitReadLane(EmitContext& ctx, Id value, Id lane) { + return ctx.OpGroupNonUniformBroadcast(ctx.U32[1], SubgroupScope(ctx), value, lane); } Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane) { diff --git a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp index 9c5f64f84..3378d785f 100644 --- 
a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp +++ b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp @@ -95,6 +95,20 @@ void ReadLaneEliminationPass(IR::Program& program) { if (inst.GetOpcode() != IR::Opcode::ReadLane) { continue; } + + // Check for the following pattern and replace it with ReadFirstLane + // s_ff1_i32_b64 sgpr, exec + // v_readlane_b32 sdst, vgpr, sgpr + if (const auto lane = inst.Arg(1); !lane.IsImmediate()) { + if (lane.InstRecursive()->GetOpcode() == IR::Opcode::FindILsb64) { + const auto value = inst.Arg(0); + inst.ReplaceOpcode(IR::Opcode::ReadFirstLane); + inst.ClearArgs(); + inst.SetArg(0, value); + } + continue; + } + const u32 lane = inst.Arg(1).U32(); IR::Inst* prod = inst.Arg(0).InstRecursive(); From 8bc30270c853635885fffdca9f2ef757ea7ef484 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 21:52:56 +0300 Subject: [PATCH 13/14] shader_recompiler: Implement ff1 with subgroup ops (#3225) --- externals/sirit | 2 +- .../backend/spirv/emit_spirv_instructions.h | 2 ++ .../backend/spirv/emit_spirv_warp.cpp | 8 ++++++++ .../frontend/translate/scalar_alu.cpp | 5 +++-- src/shader_recompiler/ir/ir_emitter.cpp | 8 ++++++++ src/shader_recompiler/ir/ir_emitter.h | 2 ++ src/shader_recompiler/ir/opcodes.inc | 2 ++ .../ir/passes/readlane_elimination_pass.cpp | 12 +----------- 8 files changed, 27 insertions(+), 14 deletions(-) diff --git a/externals/sirit b/externals/sirit index 6b450704f..b4eccb336 160000 --- a/externals/sirit +++ b/externals/sirit @@ -1 +1 @@ -Subproject commit 6b450704f6fedb9413d0c89a9eb59d028eb1e6c0 +Subproject commit b4eccb336f1b1169af48dac1e04015985af86e3e diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index f3dd9b2ea..74c94754d 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -531,6 +531,8 @@ Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); Id EmitReadFirstLane(EmitContext& ctx, Id value); Id EmitReadLane(EmitContext& ctx, Id value, Id lane); Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane); +Id EmitBallot(EmitContext& ctx, Id bit); +Id EmitBallotFindLsb(EmitContext& ctx, Id mask); Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding); Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 20fb83fa6..951c76001 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -34,4 +34,12 @@ Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane) { return ctx.u32_zero_value; } +Id EmitBallot(EmitContext& ctx, Id bit) { + return ctx.OpGroupNonUniformBallot(ctx.U32[4], SubgroupScope(ctx), bit); +} + +Id EmitBallotFindLsb(EmitContext& ctx, Id mask) { + return ctx.OpGroupNonUniformBallotFindLSB(ctx.U32[1], SubgroupScope(ctx), mask); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 276b55567..e3134c300 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -680,8 +680,9 @@ void Translator::S_FF1_I32_B32(const GcnInst& inst) { } void Translator::S_FF1_I32_B64(const 
GcnInst& inst) { - const IR::U64 src0{GetSrc64(inst.src[0])}; - const IR::U32 result{ir.FindILsb(src0)}; + ASSERT(inst.src[0].field == OperandField::ScalarGPR); + const IR::U32 result{ + ir.BallotFindLsb(ir.Ballot(ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code))))}; SetDst(inst.dst[0], result); } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index b88e1a17d..4997145d7 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -660,6 +660,14 @@ U32 IREmitter::WriteLane(const U32& value, const U32& write_value, const U32& la return Inst(Opcode::WriteLane, value, write_value, lane); } +Value IREmitter::Ballot(const U1& bit) { + return Inst(Opcode::Ballot, bit); +} + +U32 IREmitter::BallotFindLsb(const Value& mask) { + return Inst(Opcode::BallotFindLsb, mask); +} + F32F64 IREmitter::FPAdd(const F32F64& a, const F32F64& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index d9e5aab7a..6055df565 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -176,6 +176,8 @@ public: [[nodiscard]] U32 ReadFirstLane(const U32& value); [[nodiscard]] U32 ReadLane(const U32& value, const U32& lane); [[nodiscard]] U32 WriteLane(const U32& value, const U32& write_value, const U32& lane); + [[nodiscard]] Value Ballot(const U1& bit); + [[nodiscard]] U32 BallotFindLsb(const Value& mask); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const Value& e3); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 08dcec458..747a27e35 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -472,5 +472,7 @@ OPCODE(QuadShuffle, U32, U32, OPCODE(ReadFirstLane, U32, U32, ) OPCODE(ReadLane, U32, U32, U32 ) OPCODE(WriteLane, U32, U32, U32, U32 ) +OPCODE(Ballot, U32x4, U1, ) +OPCODE(BallotFindLsb, U32, U32x4, ) OPCODE(DataAppend, U32, U32, U32 ) OPCODE(DataConsume, U32, U32, U32 ) diff --git a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp index 3378d785f..d6586bda0 100644 --- a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp +++ b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp @@ -95,17 +95,7 @@ void ReadLaneEliminationPass(IR::Program& program) { if (inst.GetOpcode() != IR::Opcode::ReadLane) { continue; } - - // Check for the following pattern and replace it with ReadFirstLane - // s_ff1_i32_b64 sgpr, exec - // v_readlane_b32 sdst, vgpr, sgpr - if (const auto lane = inst.Arg(1); !lane.IsImmediate()) { - if (lane.InstRecursive()->GetOpcode() == IR::Opcode::FindILsb64) { - const auto value = inst.Arg(0); - inst.ReplaceOpcode(IR::Opcode::ReadFirstLane); - inst.ClearArgs(); - inst.SetArg(0, value); - } + if (!inst.Arg(1).IsImmediate()) { continue; } From b403e1be339b55dd7ab3801e939e5ecd833da015 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 22:14:02 +0300 Subject: [PATCH 14/14] vk_rasterizer: Set render area to max when no framebuffers are bound (#3227) --- src/video_core/renderer_vulkan/vk_instance.h | 12 +++++++++++- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 2 ++ src/video_core/renderer_vulkan/vk_scheduler.cpp | 7 +------ 
src/video_core/renderer_vulkan/vk_scheduler.h | 4 ++-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index c9e354186..830b1d5c2 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -324,11 +324,21 @@ public: return properties.limits.maxViewportDimensions[0]; } - /// Returns the maximum viewport height. + /// Returns the maximum viewport height. u32 GetMaxViewportHeight() const { return properties.limits.maxViewportDimensions[1]; } + /// Returns the maximum render area width. + u32 GetMaxFramebufferWidth() const { + return properties.limits.maxFramebufferWidth; + } + + /// Returns the maximum render area height. + u32 GetMaxFramebufferHeight() const { + return properties.limits.maxFramebufferHeight; + } + /// Returns the sample count flags supported by framebuffers. vk::SampleCountFlags GetFramebufferSampleCounts() const { return properties.limits.framebufferColorSampleCounts & diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index cca193831..5d0a14ce3 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -113,6 +113,8 @@ RenderState Rasterizer::PrepareRenderState(u32 mrt_mask) { // Prefetch color and depth buffers to let texture cache handle possible overlaps with bound // textures (e.g. mipgen) RenderState state; + state.width = instance.GetMaxFramebufferWidth(); + state.height = instance.GetMaxFramebufferHeight(); cb_descs.clear(); db_desc.reset(); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 4c4e17fe4..ac645c9ce 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -34,16 +34,11 @@ void Scheduler::BeginRendering(const RenderState& new_state) { is_rendering = true; render_state = new_state; - const auto width = - render_state.width != std::numeric_limits::max() ? render_state.width : 1; - const auto height = - render_state.height != std::numeric_limits::max() ? render_state.height : 1; - const vk::RenderingInfo rendering_info = { .renderArea = { .offset = {0, 0}, - .extent = {width, height}, + .extent = {render_state.width, render_state.height}, }, .layerCount = 1, .colorAttachmentCount = render_state.num_color_attachments, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bd6fb549a..b5678edbc 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -26,8 +26,8 @@ struct RenderState { u32 num_color_attachments{}; bool has_depth{}; bool has_stencil{}; - u32 width = std::numeric_limits::max(); - u32 height = std::numeric_limits::max(); + u32 width{}; + u32 height{}; bool operator==(const RenderState& other) const noexcept { return std::memcmp(this, &other, sizeof(RenderState)) == 0;
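A minimal sketch of the render-area fallback introduced above, using illustrative names (Extent2D, ChooseRenderArea) rather than the emulator's actual types: with the UINT32_MAX sentinel gone, the rasterizer seeds RenderState with the device's maximum framebuffer size, so a pass with no bound color or depth target renders into a full-size area instead of the old 1x1 clamp.

#include <algorithm>
#include <cstdint>
#include <cstdio>

struct Extent2D {
    std::uint32_t width;
    std::uint32_t height;
};

// Start from the device limits and shrink to the smallest bound attachment, if any.
Extent2D ChooseRenderArea(Extent2D device_max, const Extent2D* attachment) {
    Extent2D area = device_max;
    if (attachment != nullptr) {
        area.width = std::min(area.width, attachment->width);
        area.height = std::min(area.height, attachment->height);
    }
    return area;
}

int main() {
    const Extent2D limits{16384, 16384};                      // e.g. maxFramebufferWidth/Height
    const Extent2D area = ChooseRenderArea(limits, nullptr);  // no framebuffer bound
    std::printf("render area %ux%u\n", static_cast<unsigned>(area.width),
                static_cast<unsigned>(area.height));
    return 0;
}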