From 5eef2fd28ad6c8b38e7dfc1157af1045d86d68a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?= Date: Mon, 7 Jul 2025 11:26:27 +0200 Subject: [PATCH 01/22] mmap executable memory (#3201) --- src/core/address_space.cpp | 12 ++++++++++-- src/core/libraries/kernel/memory.cpp | 3 ++- src/core/memory.cpp | 9 ++++++--- src/core/memory.h | 1 + 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/core/address_space.cpp b/src/core/address_space.cpp index 2e29f70ee..846bb5eb4 100644 --- a/src/core/address_space.cpp +++ b/src/core/address_space.cpp @@ -358,9 +358,17 @@ enum PosixPageProtection { [[nodiscard]] constexpr PosixPageProtection ToPosixProt(Core::MemoryProt prot) { if (True(prot & Core::MemoryProt::CpuReadWrite) || True(prot & Core::MemoryProt::GpuReadWrite)) { - return PAGE_READWRITE; + if (True(prot & Core::MemoryProt::CpuExec)) { + return PAGE_EXECUTE_READWRITE; + } else { + return PAGE_READWRITE; + } } else if (True(prot & Core::MemoryProt::CpuRead) || True(prot & Core::MemoryProt::GpuRead)) { - return PAGE_READONLY; + if (True(prot & Core::MemoryProt::CpuExec)) { + return PAGE_EXECUTE_READ; + } else { + return PAGE_READONLY; + } } else { return PAGE_NOACCESS; } diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp index 8153b7610..e0c359f2c 100644 --- a/src/core/libraries/kernel/memory.cpp +++ b/src/core/libraries/kernel/memory.cpp @@ -573,11 +573,12 @@ void* PS4_SYSV_ABI posix_mmap(void* addr, u64 len, s32 prot, s32 flags, s32 fd, auto* memory = Core::Memory::Instance(); const auto mem_prot = static_cast(prot); const auto mem_flags = static_cast(flags); + const auto is_exec = True(mem_prot & Core::MemoryProt::CpuExec); s32 result = ORBIS_OK; if (fd == -1) { result = memory->MapMemory(&addr_out, std::bit_cast(addr), len, mem_prot, mem_flags, - Core::VMAType::Flexible); + Core::VMAType::Flexible, "anon", is_exec); } else { result = memory->MapFile(&addr_out, std::bit_cast(addr), len, mem_prot, mem_flags, fd, phys_addr); diff --git a/src/core/memory.cpp b/src/core/memory.cpp index e7ecf8d80..3d9bf58a7 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -631,6 +631,9 @@ s64 MemoryManager::ProtectBytes(VAddr addr, VirtualMemoryArea vma_base, u64 size if (True(prot & MemoryProt::CpuReadWrite)) { perms |= Core::MemoryPermission::ReadWrite; } + if (True(prot & MemoryProt::CpuExec)) { + perms |= Core::MemoryPermission::Execute; + } if (True(prot & MemoryProt::GpuRead)) { perms |= Core::MemoryPermission::Read; } @@ -650,9 +653,9 @@ s32 MemoryManager::Protect(VAddr addr, u64 size, MemoryProt prot) { std::scoped_lock lk{mutex}; // Validate protection flags - constexpr static MemoryProt valid_flags = MemoryProt::NoAccess | MemoryProt::CpuRead | - MemoryProt::CpuReadWrite | MemoryProt::GpuRead | - MemoryProt::GpuWrite | MemoryProt::GpuReadWrite; + constexpr static MemoryProt valid_flags = + MemoryProt::NoAccess | MemoryProt::CpuRead | MemoryProt::CpuReadWrite | + MemoryProt::CpuExec | MemoryProt::GpuRead | MemoryProt::GpuWrite | MemoryProt::GpuReadWrite; MemoryProt invalid_flags = prot & ~valid_flags; if (invalid_flags != MemoryProt::NoAccess) { diff --git a/src/core/memory.h b/src/core/memory.h index c800ef763..285d7dbed 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -31,6 +31,7 @@ enum class MemoryProt : u32 { NoAccess = 0, CpuRead = 1, CpuReadWrite = 2, + CpuExec = 4, GpuRead = 16, GpuWrite = 32, GpuReadWrite = 48, From 146e81a56a2e83420521be270a6dafec5e6d3b8d Mon Sep 17 00:00:00 2001 From: Paris Oplopoios Date: Mon, 7 Jul 2025 12:44:06 +0300 Subject: [PATCH 02/22] Fix V_ADDC_U32 carry-out edge cases (#3200) * Fix V_ADDC_U32 carry-out edge cases * Use IAddCarry instead --- .../frontend/translate/vector_alu.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 54f1088f2..5a80855d3 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -623,12 +623,15 @@ void Translator::V_ADDC_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 carry{GetCarryIn(inst)}; - const IR::U32 result{ir.IAdd(ir.IAdd(src0, src1), carry)}; - SetDst(inst.dst[0], result); + const IR::Value tmp1{ir.IAddCary(src0, src1)}; + const IR::U32 result1{ir.CompositeExtract(tmp1, 0)}; + const IR::U32 carry_out1{ir.CompositeExtract(tmp1, 1)}; + const IR::Value tmp2{ir.IAddCary(result1, carry)}; + const IR::U32 result2{ir.CompositeExtract(tmp2, 0)}; + const IR::U32 carry_out2{ir.CompositeExtract(tmp2, 1)}; + SetDst(inst.dst[0], result2); - const IR::U1 less_src0{ir.ILessThan(result, src0, false)}; - const IR::U1 less_src1{ir.ILessThan(result, src1, false)}; - const IR::U1 did_overflow{ir.LogicalOr(less_src0, less_src1)}; + const IR::U1 did_overflow{ir.INotEqual(ir.BitwiseOr(carry_out1, carry_out2), ir.Imm32(0))}; SetCarryOut(inst, did_overflow); } From 70eef0de903aa3e8fe0458c83c03c4cf846e8581 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Mon, 7 Jul 2025 03:03:19 -0700 Subject: [PATCH 03/22] texture_cache: Change depth resolve new image back to max of resources. (#3205) --- src/video_core/texture_cache/texture_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index a50601af6..aa6563a84 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -169,7 +169,7 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi if (recreate) { auto new_info = requested_info; - new_info.resources = std::min(requested_info.resources, cache_image.info.resources); + new_info.resources = std::max(requested_info.resources, cache_image.info.resources); const auto new_image_id = slot_images.insert(instance, scheduler, new_info); RegisterImage(new_image_id); From 4eaa992affc1811a99c8ac8a629ee7b2d3b785c0 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios Date: Mon, 7 Jul 2025 13:29:11 +0300 Subject: [PATCH 04/22] Rename 'AddCary' to 'AddCarry' (#3206) --- src/shader_recompiler/backend/spirv/emit_spirv_instructions.h | 2 +- src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp | 2 +- src/shader_recompiler/frontend/translate/vector_alu.cpp | 4 ++-- src/shader_recompiler/ir/ir_emitter.cpp | 4 ++-- src/shader_recompiler/ir/ir_emitter.h | 2 +- src/shader_recompiler/ir/opcodes.inc | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 1ac2266bd..6e146c5f6 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -353,7 +353,7 @@ Id EmitFPIsInf32(EmitContext& ctx, Id value); Id EmitFPIsInf64(EmitContext& ctx, Id value); Id EmitIAdd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b); Id EmitIAdd64(EmitContext& ctx, Id a, Id b); -Id EmitIAddCary32(EmitContext& ctx, Id a, Id b); +Id EmitIAddCarry32(EmitContext& ctx, Id a, Id b); Id EmitISub32(EmitContext& ctx, Id a, Id b); Id EmitISub64(EmitContext& ctx, Id a, Id b); Id EmitSMulHi(EmitContext& ctx, Id a, Id b); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index ddc1e7574..01652c1cf 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -60,7 +60,7 @@ Id EmitIAdd64(EmitContext& ctx, Id a, Id b) { return ctx.OpIAdd(ctx.U64, a, b); } -Id EmitIAddCary32(EmitContext& ctx, Id a, Id b) { +Id EmitIAddCarry32(EmitContext& ctx, Id a, Id b) { return ctx.OpIAddCarry(ctx.full_result_u32x2, a, b); } diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 5a80855d3..74c7ec601 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -623,10 +623,10 @@ void Translator::V_ADDC_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 carry{GetCarryIn(inst)}; - const IR::Value tmp1{ir.IAddCary(src0, src1)}; + const IR::Value tmp1{ir.IAddCarry(src0, src1)}; const IR::U32 result1{ir.CompositeExtract(tmp1, 0)}; const IR::U32 carry_out1{ir.CompositeExtract(tmp1, 1)}; - const IR::Value tmp2{ir.IAddCary(result1, carry)}; + const IR::Value tmp2{ir.IAddCarry(result1, carry)}; const IR::U32 result2{ir.CompositeExtract(tmp2, 0)}; const IR::U32 carry_out2{ir.CompositeExtract(tmp2, 1)}; SetDst(inst.dst[0], result2); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 3d64cc5da..2334777ed 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -1424,13 +1424,13 @@ U32U64 IREmitter::IAdd(const U32U64& a, const U32U64& b) { } } -Value IREmitter::IAddCary(const U32& a, const U32& b) { +Value IREmitter::IAddCarry(const U32& a, const U32& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); } switch (a.Type()) { case Type::U32: - return Inst(Opcode::IAddCary32, a, b); + return Inst(Opcode::IAddCarry32, a, b); default: ThrowInvalidType(a.Type()); } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 119e3752e..1c5a8f545 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -254,7 +254,7 @@ public: [[nodiscard]] F32F64 FPMedTri(const F32F64& a, const F32F64& b, const F32F64& c); [[nodiscard]] U32U64 IAdd(const U32U64& a, const U32U64& b); - [[nodiscard]] Value IAddCary(const U32& a, const U32& b); + [[nodiscard]] Value IAddCarry(const U32& a, const U32& b); [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b); [[nodiscard]] U32 IMulHi(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32U64 IMul(const U32U64& a, const U32U64& b); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 008f44659..680159132 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -328,7 +328,7 @@ OPCODE(FPCmpClass32, U1, F32, // Integer operations OPCODE(IAdd32, U32, U32, U32, ) OPCODE(IAdd64, U64, U64, U64, ) -OPCODE(IAddCary32, U32x2, U32, U32, ) +OPCODE(IAddCarry32, U32x2, U32, U32, ) OPCODE(ISub32, U32, U32, U32, ) OPCODE(ISub64, U64, U64, U64, ) OPCODE(IMul32, U32, U32, U32, ) From d6163a6edbdbd6cc96ada7eb523e56a8aab03bc8 Mon Sep 17 00:00:00 2001 From: georgemoralis Date: Mon, 7 Jul 2025 13:37:08 +0300 Subject: [PATCH 05/22] uber fix --- src/shader_recompiler/ir/opcodes.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 680159132..553e63f3e 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -328,7 +328,7 @@ OPCODE(FPCmpClass32, U1, F32, // Integer operations OPCODE(IAdd32, U32, U32, U32, ) OPCODE(IAdd64, U64, U64, U64, ) -OPCODE(IAddCarry32, U32x2, U32, U32, ) +OPCODE(IAddCarry32, U32x2, U32, U32, ) OPCODE(ISub32, U32, U32, U32, ) OPCODE(ISub64, U64, U64, U64, ) OPCODE(IMul32, U32, U32, U32, ) From 7fedbd52e0629c037be631068f609e77a2b27615 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 7 Jul 2025 16:23:20 +0300 Subject: [PATCH 06/22] texture_cache: Async download of GPU modified linear images (#3204) * texture_cache: Async download of GPU modified linear images * liverpool: Back to less submits * texture_cache: Don't download depth images * config: Add option for linear image readback --- src/common/config.cpp | 11 +++- src/common/config.h | 1 + src/video_core/amdgpu/liverpool.cpp | 3 +- src/video_core/buffer_cache/buffer_cache.h | 2 +- .../renderer_vulkan/vk_rasterizer.cpp | 16 +++-- .../renderer_vulkan/vk_rasterizer.h | 2 +- .../renderer_vulkan/vk_scheduler.cpp | 13 ++-- src/video_core/renderer_vulkan/vk_scheduler.h | 3 + .../texture_cache/texture_cache.cpp | 61 ++++++++++++++++++- src/video_core/texture_cache/texture_cache.h | 10 ++- 10 files changed, 106 insertions(+), 16 deletions(-) diff --git a/src/common/config.cpp b/src/common/config.cpp index d3a5fa6a1..010fecf95 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -65,6 +65,7 @@ static u32 screenHeight = 720; static bool isNullGpu = false; static bool shouldCopyGPUBuffers = false; static bool readbacksEnabled = false; +static bool readbackLinearImagesEnabled = false; static bool directMemoryAccessEnabled = false; static bool shouldDumpShaders = false; static bool shouldPatchShaders = false; @@ -103,7 +104,7 @@ u32 m_language = 1; // english static std::string trophyKey = ""; // Expected number of items in the config file -static constexpr u64 total_entries = 51; +static constexpr u64 total_entries = 52; bool allowHDR() { return isHDRAllowed; @@ -262,6 +263,10 @@ bool readbacks() { return readbacksEnabled; } +bool readbackLinearImages() { + return readbackLinearImagesEnabled; +} + bool directMemoryAccess() { return directMemoryAccessEnabled; } @@ -631,6 +636,8 @@ void load(const std::filesystem::path& path) { isNullGpu = toml::find_or(gpu, "nullGpu", isNullGpu); shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", shouldCopyGPUBuffers); readbacksEnabled = toml::find_or(gpu, "readbacks", readbacksEnabled); + readbackLinearImagesEnabled = + toml::find_or(gpu, "readbackLinearImages", readbackLinearImagesEnabled); directMemoryAccessEnabled = toml::find_or(gpu, "directMemoryAccess", directMemoryAccessEnabled); shouldDumpShaders = toml::find_or(gpu, "dumpShaders", shouldDumpShaders); @@ -802,6 +809,7 @@ void save(const std::filesystem::path& path) { data["GPU"]["nullGpu"] = isNullGpu; data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers; data["GPU"]["readbacks"] = readbacksEnabled; + data["GPU"]["readbackLinearImages"] = readbackLinearImagesEnabled; data["GPU"]["directMemoryAccess"] = directMemoryAccessEnabled; data["GPU"]["dumpShaders"] = shouldDumpShaders; data["GPU"]["patchShaders"] = shouldPatchShaders; @@ -902,6 +910,7 @@ void setDefaultValues() { isNullGpu = false; shouldCopyGPUBuffers = false; readbacksEnabled = false; + readbackLinearImagesEnabled = false; directMemoryAccessEnabled = false; shouldDumpShaders = false; shouldPatchShaders = false; diff --git a/src/common/config.h b/src/common/config.h index 931fa68e2..2ed08198a 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -47,6 +47,7 @@ bool copyGPUCmdBuffers(); void setCopyGPUCmdBuffers(bool enable); bool readbacks(); void setReadbacks(bool enable); +bool readbackLinearImages(); bool directMemoryAccess(); void setDirectMemoryAccess(bool enable); bool dumpShaders(); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 9b8c28b66..e264de74a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -135,9 +135,8 @@ void Liverpool::Process(std::stop_token stoken) { if (submit_done) { VideoCore::EndCapture(); - if (rasterizer) { - rasterizer->ProcessFaults(); + rasterizer->EndCommandList(); rasterizer->Flush(); } submit_done = false; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 900a27aee..354d01431 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -112,7 +112,7 @@ public: /// Invalidates any buffer in the logical page range. void InvalidateMemory(VAddr device_addr, u64 size); - /// Waits on pending downloads in the logical page range. + /// Flushes any GPU modified buffer in the logical page range back to CPU memory. void ReadMemory(VAddr device_addr, u64 size, bool is_write = false); /// Binds host vertex buffers for the current draw. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index e4e026485..cca193831 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -272,6 +272,8 @@ void Rasterizer::EliminateFastClear() { void Rasterizer::Draw(bool is_indexed, u32 index_offset) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + if (!FilterDraw()) { return; } @@ -317,6 +319,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 u32 max_count, VAddr count_address) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + if (!FilterDraw()) { return; } @@ -380,6 +384,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 void Rasterizer::DispatchDirect() { RENDERER_TRACE; + scheduler.PopPendingOperations(); + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { @@ -407,6 +413,8 @@ void Rasterizer::DispatchDirect() { void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { RENDERER_TRACE; + scheduler.PopPendingOperations(); + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { @@ -439,11 +447,12 @@ void Rasterizer::Finish() { scheduler.Finish(); } -void Rasterizer::ProcessFaults() { +void Rasterizer::EndCommandList() { if (fault_process_pending) { fault_process_pending = false; buffer_cache.ProcessFaultBuffer(); } + texture_cache.ProcessDownloadImages(); } bool Rasterizer::BindResources(const Pipeline* pipeline) { @@ -649,8 +658,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin if (instance.IsNullDescriptorSupported()) { image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); } else { - auto& null_image_view = - texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc.view_info); + auto& null_image_view = texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc); image_infos.emplace_back(VK_NULL_HANDLE, *null_image_view.image_view, vk::ImageLayout::eGeneral); } @@ -664,7 +672,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin bound_images.emplace_back(image_id); auto& image = texture_cache.GetImage(image_id); - auto& image_view = texture_cache.FindTexture(image_id, desc.view_info); + auto& image_view = texture_cache.FindTexture(image_id, desc); if (image.binding.force_general || image.binding.is_target) { image.Transit(vk::ImageLayout::eGeneral, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 4a978746c..1e1680258 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -68,7 +68,7 @@ public: void CpSync(); u64 Flush(); void Finish(); - void ProcessFaults(); + void EndCommandList(); PipelineCache& GetPipelineCache() { return pipeline_cache; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index e75a69924..4c4e17fe4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -101,6 +101,14 @@ void Scheduler::Wait(u64 tick) { } } +void Scheduler::PopPendingOperations() { + master_semaphore.Refresh(); + while (!pending_ops.empty() && master_semaphore.IsFree(pending_ops.front().gpu_tick)) { + pending_ops.front().callback(); + pending_ops.pop(); + } +} + void Scheduler::AllocateWorkerCommandBuffers() { const vk::CommandBufferBeginInfo begin_info = { .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, @@ -175,10 +183,7 @@ void Scheduler::SubmitExecution(SubmitInfo& info) { AllocateWorkerCommandBuffers(); // Apply pending operations - while (!pending_ops.empty() && IsFree(pending_ops.front().gpu_tick)) { - pending_ops.front().callback(); - pending_ops.pop(); - } + PopPendingOperations(); } void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 8ddf00f6a..36fd9c055 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -317,6 +317,9 @@ public: /// Waits for the given tick to trigger on the GPU. void Wait(u64 tick); + /// Attempts to execute operations whose tick the GPU has caught up with. + void PopPendingOperations(); + /// Starts a new rendering scope with provided state. void BeginRendering(const RenderState& new_state); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index aa6563a84..723b95892 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -5,7 +5,9 @@ #include #include "common/assert.h" +#include "common/config.h" #include "common/debug.h" +#include "core/memory.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_instance.h" @@ -58,6 +60,50 @@ ImageId TextureCache::GetNullImage(const vk::Format format) { return null_id; } +void TextureCache::ProcessDownloadImages() { + for (const ImageId image_id : download_images) { + DownloadImageMemory(image_id); + } + download_images.clear(); +} + +void TextureCache::DownloadImageMemory(ImageId image_id) { + Image& image = slot_images[image_id]; + if (False(image.flags & ImageFlagBits::GpuModified)) { + return; + } + auto& download_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::Download); + const u32 download_size = image.info.pitch * image.info.size.height * + image.info.resources.layers * (image.info.num_bits / 8); + ASSERT(download_size <= image.info.guest_size); + const auto [download, offset] = download_buffer.Map(download_size); + download_buffer.Commit(); + const vk::BufferImageCopy image_download = { + .bufferOffset = offset, + .bufferRowLength = image.info.pitch, + .bufferImageHeight = image.info.size.height, + .imageSubresource = + { + .aspectMask = image.info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = image.info.resources.layers, + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {image.info.size.width, image.info.size.height, 1}, + }; + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, + download_buffer.Handle(), image_download); + scheduler.DeferOperation([device_addr = image.info.guest_address, download, download_size] { + auto* memory = Core::Memory::Instance(); + memory->TryWriteBacking(std::bit_cast(device_addr), download, download_size); + }); +} + void TextureCache::MarkAsMaybeDirty(ImageId image_id, Image& image) { if (image.hash == 0) { // Initialize hash @@ -437,16 +483,27 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo return slot_image_views[view_id]; } -ImageView& TextureCache::FindTexture(ImageId image_id, const ImageViewInfo& view_info) { +ImageView& TextureCache::FindTexture(ImageId image_id, const BaseDesc& desc) { Image& image = slot_images[image_id]; + if (desc.type == BindingType::Storage) { + image.flags |= ImageFlagBits::GpuModified; + if (Config::readbackLinearImages() && + image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + download_images.emplace(image_id); + } + } UpdateImage(image_id); - return RegisterImageView(image_id, view_info); + return RegisterImageView(image_id, desc.view_info); } ImageView& TextureCache::FindRenderTarget(BaseDesc& desc) { const ImageId image_id = FindImage(desc); Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; + if (Config::readbackLinearImages() && + image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + download_images.emplace(image_id); + } image.usage.render_target = 1u; UpdateImage(image_id); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 87228b84f..ff8ffb61c 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -105,11 +106,14 @@ public: /// Evicts any images that overlap the unmapped range. void UnmapMemory(VAddr cpu_addr, size_t size); + /// Schedules a copy of pending images for download back to CPU memory. + void ProcessDownloadImages(); + /// Retrieves the image handle of the image with the provided attributes. [[nodiscard]] ImageId FindImage(BaseDesc& desc, FindFlags flags = {}); /// Retrieves an image view with the properties of the specified image id. - [[nodiscard]] ImageView& FindTexture(ImageId image_id, const ImageViewInfo& view_info); + [[nodiscard]] ImageView& FindTexture(ImageId image_id, const BaseDesc& desc); /// Retrieves the render target with specified properties [[nodiscard]] ImageView& FindRenderTarget(BaseDesc& desc); @@ -252,6 +256,9 @@ private: /// Gets or creates a null image for a particular format. ImageId GetNullImage(vk::Format format); + /// Copies image memory back to CPU. + void DownloadImageMemory(ImageId image_id); + /// Create an image from the given parameters [[nodiscard]] ImageId InsertImage(const ImageInfo& info, VAddr cpu_addr); @@ -293,6 +300,7 @@ private: Common::SlotVector slot_image_views; tsl::robin_map samplers; tsl::robin_map null_images; + std::unordered_set download_images; PageTable page_table; std::mutex mutex; From 80f7ec26816705f8080187c284c828a7a6382b8a Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Mon, 7 Jul 2025 18:17:56 +0200 Subject: [PATCH 07/22] video_out: Internal Resolution Support (#3194) * impl * clang * clang+ * update total_entries too --- src/common/config.cpp | 60 ++++++++++++++++------- src/common/config.h | 12 +++-- src/core/libraries/videoout/video_out.cpp | 3 +- src/emulator.cpp | 2 +- src/qt_gui/settings_dialog.cpp | 4 +- 5 files changed, 56 insertions(+), 25 deletions(-) diff --git a/src/common/config.cpp b/src/common/config.cpp index 010fecf95..6f8563377 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -60,8 +60,10 @@ static bool overrideControllerColor = false; static int controllerCustomColorRGB[3] = {0, 0, 255}; // GPU -static u32 screenWidth = 1280; -static u32 screenHeight = 720; +static u32 windowWidth = 1280; +static u32 windowHeight = 720; +static u32 internalScreenWidth = 1280; +static u32 internalScreenHeight = 720; static bool isNullGpu = false; static bool shouldCopyGPUBuffers = false; static bool readbacksEnabled = false; @@ -104,7 +106,7 @@ u32 m_language = 1; // english static std::string trophyKey = ""; // Expected number of items in the config file -static constexpr u64 total_entries = 52; +static constexpr u64 total_entries = 54; bool allowHDR() { return isHDRAllowed; @@ -195,12 +197,20 @@ double getTrophyNotificationDuration() { return trophyNotificationDuration; } -u32 getScreenWidth() { - return screenWidth; +u32 getWindowWidth() { + return windowWidth; } -u32 getScreenHeight() { - return screenHeight; +u32 getWindowHeight() { + return windowHeight; +} + +u32 getInternalScreenWidth() { + return internalScreenHeight; +} + +u32 getInternalScreenHeight() { + return internalScreenHeight; } s32 getGpuId() { @@ -339,12 +349,20 @@ void setGpuId(s32 selectedGpuId) { gpuId = selectedGpuId; } -void setScreenWidth(u32 width) { - screenWidth = width; +void setWindowWidth(u32 width) { + windowWidth = width; } -void setScreenHeight(u32 height) { - screenHeight = height; +void setWindowHeight(u32 height) { + windowHeight = height; +} + +void setInternalScreenWidth(u32 width) { + internalScreenWidth = width; +} + +void setInternalScreenHeight(u32 height) { + internalScreenHeight = height; } void setDebugDump(bool enable) { @@ -426,6 +444,7 @@ void setCursorState(s16 newCursorState) { void setCursorHideTimeout(int newcursorHideTimeout) { cursorHideTimeout = newcursorHideTimeout; } + void setTrophyNotificationDuration(double newTrophyNotificationDuration) { trophyNotificationDuration = newTrophyNotificationDuration; } @@ -631,8 +650,11 @@ void load(const std::filesystem::path& path) { if (data.contains("GPU")) { const toml::value& gpu = data.at("GPU"); - screenWidth = toml::find_or(gpu, "screenWidth", screenWidth); - screenHeight = toml::find_or(gpu, "screenHeight", screenHeight); + windowWidth = toml::find_or(gpu, "screenWidth", windowWidth); + windowHeight = toml::find_or(gpu, "screenHeight", windowHeight); + internalScreenWidth = toml::find_or(gpu, "internalScreenWidth", internalScreenWidth); + internalScreenHeight = + toml::find_or(gpu, "internalScreenHeight", internalScreenHeight); isNullGpu = toml::find_or(gpu, "nullGpu", isNullGpu); shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", shouldCopyGPUBuffers); readbacksEnabled = toml::find_or(gpu, "readbacks", readbacksEnabled); @@ -804,8 +826,10 @@ void save(const std::filesystem::path& path) { data["Input"]["specialPadClass"] = specialPadClass; data["Input"]["isMotionControlsEnabled"] = isMotionControlsEnabled; data["Input"]["useUnifiedInputConfig"] = useUnifiedInputConfig; - data["GPU"]["screenWidth"] = screenWidth; - data["GPU"]["screenHeight"] = screenHeight; + data["GPU"]["screenWidth"] = windowWidth; + data["GPU"]["screenHeight"] = windowHeight; + data["GPU"]["internalScreenWidth"] = internalScreenWidth; + data["GPU"]["internalScreenHeight"] = internalScreenHeight; data["GPU"]["nullGpu"] = isNullGpu; data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers; data["GPU"]["readbacks"] = readbacksEnabled; @@ -905,8 +929,10 @@ void setDefaultValues() { controllerCustomColorRGB[2] = 255; // GPU - screenWidth = 1280; - screenHeight = 720; + windowWidth = 1280; + windowHeight = 720; + internalScreenWidth = 1280; + internalScreenHeight = 720; isNullGpu = false; shouldCopyGPUBuffers = false; readbacksEnabled = false; diff --git a/src/common/config.h b/src/common/config.h index 2ed08198a..e54425676 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -25,10 +25,14 @@ bool getIsFullscreen(); void setIsFullscreen(bool enable); std::string getFullscreenMode(); void setFullscreenMode(std::string mode); -u32 getScreenWidth(); -u32 getScreenHeight(); -void setScreenWidth(u32 width); -void setScreenHeight(u32 height); +u32 getWindowWidth(); +u32 getWindowHeight(); +void setWindowWidth(u32 width); +void setWindowHeight(u32 height); +u32 getInternalScreenWidth(); +u32 getInternalScreenHeight(); +void setInternalScreenWidth(u32 width); +void setInternalScreenHeight(u32 height); bool debugDump(); void setDebugDump(bool enable); s32 getGpuId(); diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index da715b3bf..0f961923a 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -445,7 +445,8 @@ s32 PS4_SYSV_ABI sceVideoOutConfigureOutputMode_(s32 handle, u32 reserved, const } void RegisterLib(Core::Loader::SymbolsResolver* sym) { - driver = std::make_unique(Config::getScreenWidth(), Config::getScreenHeight()); + driver = std::make_unique(Config::getInternalScreenWidth(), + Config::getInternalScreenHeight()); LIB_FUNCTION("SbU3dwp80lQ", "libSceVideoOut", 1, "libSceVideoOut", 0, 0, sceVideoOutGetFlipStatus); diff --git a/src/emulator.cpp b/src/emulator.cpp index fbab5929b..332287d22 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -222,7 +222,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector ar } } window = std::make_unique( - Config::getScreenWidth(), Config::getScreenHeight(), controller, window_title); + Config::getWindowWidth(), Config::getWindowHeight(), controller, window_title); g_window = window.get(); diff --git a/src/qt_gui/settings_dialog.cpp b/src/qt_gui/settings_dialog.cpp index c9d264587..ed2a17e25 100644 --- a/src/qt_gui/settings_dialog.cpp +++ b/src/qt_gui/settings_dialog.cpp @@ -762,8 +762,8 @@ void SettingsDialog::UpdateSettings() { m_gui_settings->SetValue(gui::gl_backgroundMusicVolume, ui->BGMVolumeSlider->value()); Config::setLanguage(languageIndexes[ui->consoleLanguageComboBox->currentIndex()]); Config::setEnableDiscordRPC(ui->discordRPCCheckbox->isChecked()); - Config::setScreenWidth(ui->widthSpinBox->value()); - Config::setScreenHeight(ui->heightSpinBox->value()); + Config::setWindowWidth(ui->widthSpinBox->value()); + Config::setWindowHeight(ui->heightSpinBox->value()); Config::setVblankDiv(ui->vblankSpinBox->value()); Config::setDumpShaders(ui->dumpShadersCheckBox->isChecked()); Config::setNullGpu(ui->nullGpuCheckBox->isChecked()); From ddede4a52dd2573d84abbf02c4bc35a9b1d18e3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Bogd=C4=81ns?= Date: Tue, 8 Jul 2025 01:04:16 +0300 Subject: [PATCH 08/22] IME fixes (#3207) - Moved enums, flags, and structs to ime_common.h to simplify usage with Ime and ImeDialog - Updated Ime to use an enum as the return type, consistent with ImeDialog - Removed duplicate definition of OrbisImeKeycode - Added OrbisImeLanguage as a flags enum - Added missing options to OrbisImeOption - Removed OrbisImeDialogOption; OrbisImeOption should be used instead - Added OrbisImeTextAreaMode - Updated OrbisImeTextAreaMode - Fixed OrbisImeEventParam by adding the missing member OrbisImePanelType panel_type - Updated the sceImeOpen declaration to use extended parameters (not yet implemented) -Fixed Diablo III (CUSA00434) assertion failure on ImeDialog initialization Co-authored-by: w1naenator --- src/core/libraries/ime/ime.cpp | 68 +++---- src/core/libraries/ime/ime.h | 80 +------- src/core/libraries/ime/ime_common.h | 232 ++++++++++++++++++++++- src/core/libraries/ime/ime_dialog.cpp | 49 +++-- src/core/libraries/ime/ime_dialog.h | 120 ------------ src/core/libraries/ime/ime_dialog_ui.cpp | 32 +++- src/core/libraries/ime/ime_ui.cpp | 2 +- 7 files changed, 325 insertions(+), 258 deletions(-) diff --git a/src/core/libraries/ime/ime.cpp b/src/core/libraries/ime/ime.cpp index 1c61bc276..54e856e87 100644 --- a/src/core/libraries/ime/ime.cpp +++ b/src/core/libraries/ime/ime.cpp @@ -43,8 +43,8 @@ public: openEvent.param.rect.x = m_param.ime.posx; openEvent.param.rect.y = m_param.ime.posy; } else { - openEvent.param.resource_id_array.userId = 1; - openEvent.param.resource_id_array.resourceId[0] = 1; + openEvent.param.resource_id_array.user_id = 1; + openEvent.param.resource_id_array.resource_id[0] = 1; } // Are we supposed to call the event handler on init with @@ -59,10 +59,10 @@ public: } } - s32 Update(OrbisImeEventHandler handler) { + Error Update(OrbisImeEventHandler handler) { if (!m_ime_mode) { /* We don't handle any events for ImeKeyboard */ - return ORBIS_OK; + return Error::OK; } std::unique_lock lock{g_ime_state.queue_mutex}; @@ -73,7 +73,7 @@ public: Execute(handler, &event, false); } - return ORBIS_OK; + return Error::OK; } void Execute(OrbisImeEventHandler handler, OrbisImeEvent* event, bool use_param_handler) { @@ -94,14 +94,14 @@ public: } } - s32 SetText(const char16_t* text, u32 length) { + Error SetText(const char16_t* text, u32 length) { g_ime_state.SetText(text, length); - return ORBIS_OK; + return Error::OK; } - s32 SetCaret(const OrbisImeCaret* caret) { + Error SetCaret(const OrbisImeCaret* caret) { g_ime_state.SetCaret(caret->index); - return ORBIS_OK; + return Error::OK; } bool IsIme() { @@ -222,11 +222,11 @@ int PS4_SYSV_ABI sceImeGetPanelPositionAndForm() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height) { +Error PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height) { LOG_INFO(Lib_Ime, "called"); if (!width || !height) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } switch (param->type) { @@ -244,18 +244,18 @@ s32 PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* break; } - return ORBIS_OK; + return Error::OK; } -s32 PS4_SYSV_ABI sceImeKeyboardClose(s32 userId) { +Error PS4_SYSV_ABI sceImeKeyboardClose(s32 userId) { LOG_INFO(Lib_Ime, "(STUBBED) called"); if (!g_keyboard_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } g_keyboard_handler.release(); - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeKeyboardGetInfo() { @@ -268,25 +268,25 @@ int PS4_SYSV_ABI sceImeKeyboardGetResourceId() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const OrbisImeKeyboardParam* param) { +Error PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const OrbisImeKeyboardParam* param) { LOG_INFO(Lib_Ime, "called"); if (!param) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } if (!param->arg) { - return ORBIS_IME_ERROR_INVALID_ARG; + return Error::INVALID_ARG; } if (!param->handler) { - return ORBIS_IME_ERROR_INVALID_HANDLER; + return Error::INVALID_HANDLER; } if (g_keyboard_handler) { - return ORBIS_IME_ERROR_BUSY; + return Error::BUSY; } g_keyboard_handler = std::make_unique(param); - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeKeyboardOpenInternal() { @@ -304,18 +304,18 @@ int PS4_SYSV_ABI sceImeKeyboardUpdate() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const void* extended) { +Error PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const OrbisImeParamExtended* extended) { LOG_INFO(Lib_Ime, "called"); if (!param) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } if (g_ime_handler) { - return ORBIS_IME_ERROR_BUSY; + return Error::BUSY; } g_ime_handler = std::make_unique(param); - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeOpenInternal() { @@ -339,27 +339,27 @@ int PS4_SYSV_ABI sceImeSetCandidateIndex() { return ORBIS_OK; } -int PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret) { +Error PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret) { LOG_TRACE(Lib_Ime, "called"); if (!g_ime_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } if (!caret) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } return g_ime_handler->SetCaret(caret); } -s32 PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length) { +Error PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length) { LOG_TRACE(Lib_Ime, "called"); if (!g_ime_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } if (!text) { - return ORBIS_IME_ERROR_INVALID_ADDRESS; + return Error::INVALID_ADDRESS; } return g_ime_handler->SetText(text, length); @@ -370,7 +370,7 @@ int PS4_SYSV_ABI sceImeSetTextGeometry() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler) { +Error PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler) { if (g_ime_handler) { g_ime_handler->Update(handler); } @@ -380,10 +380,10 @@ s32 PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler) { } if (!g_ime_handler || !g_keyboard_handler) { - return ORBIS_IME_ERROR_NOT_OPENED; + return Error::NOT_OPENED; } - return ORBIS_OK; + return Error::OK; } int PS4_SYSV_ABI sceImeVshClearPreedit() { diff --git a/src/core/libraries/ime/ime.h b/src/core/libraries/ime/ime.h index fcf381048..c2b80809c 100644 --- a/src/core/libraries/ime/ime.h +++ b/src/core/libraries/ime/ime.h @@ -13,72 +13,6 @@ class SymbolsResolver; namespace Libraries::Ime { -constexpr u32 ORBIS_IME_MAX_TEXT_LENGTH = 2048; - -enum class OrbisImeKeyboardOption : u32 { - Default = 0, - Repeat = 1, - RepeatEachKey = 2, - AddOsk = 4, - EffectiveWithIme = 8, - DisableResume = 16, - DisableCapslockWithoutShift = 32, -}; -DECLARE_ENUM_FLAG_OPERATORS(OrbisImeKeyboardOption) - -enum class OrbisImeOption : u32 { - DEFAULT = 0, - MULTILINE = 1, - NO_AUTO_CAPITALIZATION = 2, - PASSWORD = 4, - LANGUAGES_FORCED = 8, - EXT_KEYBOARD = 16, - NO_LEARNING = 32, - FIXED_POSITION = 64, - DISABLE_RESUME = 256, - DISABLE_AUTO_SPACE = 512, - DISABLE_POSITION_ADJUSTMENT = 2048, - EXPANDED_PREEDIT_BUFFER = 4096, - USE_JAPANESE_EISUU_KEY_AS_CAPSLOCK = 8192, - USE_2K_COORDINATES = 16384, -}; -DECLARE_ENUM_FLAG_OPERATORS(OrbisImeOption) - -struct OrbisImeKeyboardParam { - OrbisImeKeyboardOption option; - s8 reserved1[4]; - void* arg; - OrbisImeEventHandler handler; - s8 reserved2[8]; -}; - -struct OrbisImeParam { - s32 user_id; - OrbisImeType type; - u64 supported_languages; - OrbisImeEnterLabel enter_label; - OrbisImeInputMethod input_method; - OrbisImeTextFilter filter; - OrbisImeOption option; - u32 maxTextLength; - char16_t* inputTextBuffer; - float posx; - float posy; - OrbisImeHorizontalAlignment horizontal_alignment; - OrbisImeVerticalAlignment vertical_alignment; - void* work; - void* arg; - OrbisImeEventHandler handler; - s8 reserved[8]; -}; - -struct OrbisImeCaret { - f32 x; - f32 y; - u32 height; - u32 index; -}; - int PS4_SYSV_ABI FinalizeImeModule(); int PS4_SYSV_ABI InitializeImeModule(); int PS4_SYSV_ABI sceImeCheckFilterText(); @@ -98,22 +32,22 @@ int PS4_SYSV_ABI sceImeDisableController(); int PS4_SYSV_ABI sceImeFilterText(); int PS4_SYSV_ABI sceImeForTestFunction(); int PS4_SYSV_ABI sceImeGetPanelPositionAndForm(); -s32 PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height); -s32 PS4_SYSV_ABI sceImeKeyboardClose(s32 userId); +Error PS4_SYSV_ABI sceImeGetPanelSize(const OrbisImeParam* param, u32* width, u32* height); +Error PS4_SYSV_ABI sceImeKeyboardClose(s32 userId); int PS4_SYSV_ABI sceImeKeyboardGetInfo(); int PS4_SYSV_ABI sceImeKeyboardGetResourceId(); -s32 PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const OrbisImeKeyboardParam* param); +Error PS4_SYSV_ABI sceImeKeyboardOpen(s32 userId, const OrbisImeKeyboardParam* param); int PS4_SYSV_ABI sceImeKeyboardOpenInternal(); int PS4_SYSV_ABI sceImeKeyboardSetMode(); int PS4_SYSV_ABI sceImeKeyboardUpdate(); -s32 PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const void* extended); +Error PS4_SYSV_ABI sceImeOpen(const OrbisImeParam* param, const OrbisImeParamExtended* extended); int PS4_SYSV_ABI sceImeOpenInternal(); void PS4_SYSV_ABI sceImeParamInit(OrbisImeParam* param); int PS4_SYSV_ABI sceImeSetCandidateIndex(); -s32 PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret); -s32 PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length); +Error PS4_SYSV_ABI sceImeSetCaret(const OrbisImeCaret* caret); +Error PS4_SYSV_ABI sceImeSetText(const char16_t* text, u32 length); int PS4_SYSV_ABI sceImeSetTextGeometry(); -s32 PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler); +Error PS4_SYSV_ABI sceImeUpdate(OrbisImeEventHandler handler); int PS4_SYSV_ABI sceImeVshClearPreedit(); int PS4_SYSV_ABI sceImeVshClose(); int PS4_SYSV_ABI sceImeVshConfirmPreedit(); diff --git a/src/core/libraries/ime/ime_common.h b/src/core/libraries/ime/ime_common.h index 96f073dc5..5c0030030 100644 --- a/src/core/libraries/ime/ime_common.h +++ b/src/core/libraries/ime/ime_common.h @@ -3,9 +3,108 @@ #pragma once +#include "common/enum.h" #include "common/types.h" #include "core/libraries/rtc/rtc.h" +constexpr u32 ORBIS_IME_MAX_TEXT_LENGTH = 2048; +constexpr u32 ORBIS_IME_DIALOG_MAX_TEXT_LENGTH = 2048; + +enum class Error : u32 { + OK = 0x0, + BUSY = 0x80bc0001, + NOT_OPENED = 0x80bc0002, + NO_MEMORY = 0x80bc0003, + CONNECTION_FAILED = 0x80bc0004, + TOO_MANY_REQUESTS = 0x80bc0005, + INVALID_TEXT = 0x80bc0006, + EVENT_OVERFLOW = 0x80bc0007, + NOT_ACTIVE = 0x80bc0008, + IME_SUSPENDING = 0x80bc0009, + DEVICE_IN_USE = 0x80bc000a, + INVALID_USER_ID = 0x80bc0010, + INVALID_TYPE = 0x80bc0011, + INVALID_SUPPORTED_LANGUAGES = 0x80bc0012, + INVALID_ENTER_LABEL = 0x80bc0013, + INVALID_INPUT_METHOD = 0x80bc0014, + INVALID_OPTION = 0x80bc0015, + INVALID_MAX_TEXT_LENGTH = 0x80bc0016, + INVALID_INPUT_TEXT_BUFFER = 0x80bc0017, + INVALID_POSX = 0x80bc0018, + INVALID_POSY = 0x80bc0019, + INVALID_HORIZONTALIGNMENT = 0x80bc001a, + INVALID_VERTICALALIGNMENT = 0x80bc001b, + INVALID_EXTENDED = 0x80bc001c, + INVALID_KEYBOARD_TYPE = 0x80bc001d, + INVALID_WORK = 0x80bc0020, + INVALID_ARG = 0x80bc0021, + INVALID_HANDLER = 0x80bc0022, + NO_RESOURCE_ID = 0x80bc0023, + INVALID_MODE = 0x80bc0024, + INVALID_PARAM = 0x80bc0030, + INVALID_ADDRESS = 0x80bc0031, + INVALID_RESERVED = 0x80bc0032, + INVALID_TIMING = 0x80bc0033, + INTERNAL = 0x80bc00ff, + DIALOG_INVALID_TITLE = 0x80bc0101, + DIALOG_NOT_RUNNING = 0x80bc0105, + DIALOG_NOT_FINISHED = 0x80bc0106, + DIALOG_NOT_IN_USE = 0x80bc0107 +}; + +enum class OrbisImeOption : u32 { + DEFAULT = 0, + MULTILINE = 1, + NO_AUTO_CAPITALIZATION = 2, + PASSWORD = 4, + LANGUAGES_FORCED = 8, + EXT_KEYBOARD = 16, + NO_LEARNING = 32, + FIXED_POSITION = 64, + DISABLE_COPY_PASTE = 128, + DISABLE_RESUME = 256, + DISABLE_AUTO_SPACE = 512, + DISABLE_POSITION_ADJUSTMENT = 2048, + EXPANDED_PREEDIT_BUFFER = 4096, + USE_JAPANESE_EISUU_KEY_AS_CAPSLOCK = 8192, + USE_2K_COORDINATES = 16384, +}; +DECLARE_ENUM_FLAG_OPERATORS(OrbisImeOption); + +enum class OrbisImeLanguage : u64 { + DANISH = 0x0000000000000001, + GERMAN = 0x0000000000000002, + ENGLISH_US = 0x0000000000000004, + SPANISH = 0x0000000000000008, + FRENCH = 0x0000000000000010, + ITALIAN = 0x0000000000000020, + DUTCH = 0x0000000000000040, + NORWEGIAN = 0x0000000000000080, + POLISH = 0x0000000000000100, + PORTUGUESE_PT = 0x0000000000000200, + RUSSIAN = 0x0000000000000400, + FINNISH = 0x0000000000000800, + SWEDISH = 0x0000000000001000, + JAPANESE = 0x0000000000002000, + KOREAN = 0x0000000000004000, + SIMPLIFIED_CHINESE = 0x0000000000008000, + TRADITIONAL_CHINESE = 0x0000000000010000, + PORTUGUESE_BR = 0x0000000000020000, + ENGLISH_GB = 0x0000000000040000, + TURKISH = 0x0000000000080000, + SPANISH_LA = 0x0000000000100000, + ARABIC = 0x0000000001000000, + FRENCH_CA = 0x0000000002000000, + THAI = 0x0000000004000000, + CZECH = 0x0000000008000000, + GREEK = 0x0000000010000000, + INDONESIAN = 0x0000000020000000, + VIETNAMESE = 0x0000000040000000, + ROMANIAN = 0x0000000080000000, + HUNGARIAN = 0x0000000100000000, +}; +DECLARE_ENUM_FLAG_OPERATORS(OrbisImeLanguage); + enum class OrbisImeType : u32 { Default = 0, BasicLatin = 1, @@ -41,6 +140,7 @@ enum class OrbisImeEventId : u32 { Open = 0, UpdateText = 1, UpdateCaret = 2, + ChangeSize = 3, PressClose = 4, PressEnter = 5, Abort = 6, @@ -51,6 +151,10 @@ enum class OrbisImeEventId : u32 { CandidateDone = 11, CandidateCancel = 12, ChangeDevice = 14, + JumpToNextObject = 15, + JumpToBeforeObject = 16, + ChangeWindowType = 17, + ChangeInputMethodState = 18, KeyboardOpen = 256, @@ -110,6 +214,13 @@ enum class OrbisImeDeviceType : u32 { RemoteOsk = 3, }; +enum class OrbisImePanelPriority : u32 { + Default = 0, + Alphabet = 1, + Symbol = 2, + Accent = 3, +}; + struct OrbisImeRect { f32 x; f32 y; @@ -117,8 +228,22 @@ struct OrbisImeRect { u32 height; }; +struct OrbisImeColor { + u8 r; + u8 g; + u8 b; + u8 a; +}; + +enum class OrbisImeTextAreaMode : u32 { + Disable = 0, + Edit = 1, + Preedit = 2, + Select = 3, +}; + struct OrbisImeTextAreaProperty { - u32 mode; // OrbisImeTextAreaMode + OrbisImeTextAreaMode mode; u32 index; s32 length; }; @@ -135,14 +260,14 @@ struct OrbisImeKeycode { char16_t character; u32 status; OrbisImeKeyboardType type; - s32 user_id; + s32 user_id; // Todo: switch to OrbisUserServiceUserId u32 resource_id; Libraries::Rtc::OrbisRtcTick timestamp; }; struct OrbisImeKeyboardResourceIdArray { - s32 userId; - u32 resourceId[5]; + s32 user_id; // Todo: switch to OrbisUserServiceUserId + u32 resource_id[5]; }; enum class OrbisImeCaretMovementDirection : u32 { @@ -159,6 +284,16 @@ enum class OrbisImeCaretMovementDirection : u32 { Bottom = 10, }; +enum class OrbisImePanelType : u32 { + Hide = 0, + Osk = 1, + Dialog = 2, + Candidate = 3, + Edit = 4, + EditAndCandidate = 5, + Accessibility = 6, +}; + union OrbisImeEventParam { OrbisImeRect rect; OrbisImeEditText text; @@ -168,6 +303,7 @@ union OrbisImeEventParam { char16_t* candidate_word; s32 candidate_index; OrbisImeDeviceType device_type; + OrbisImePanelType panel_type; u32 input_method_state; s8 reserved[64]; }; @@ -177,7 +313,95 @@ struct OrbisImeEvent { OrbisImeEventParam param; }; +using OrbisImeExtKeyboardFilter = PS4_SYSV_ABI int (*)(const OrbisImeKeycode* srcKeycode, + u16* outKeycode, u32* outStatus, + void* reserved); + using OrbisImeTextFilter = PS4_SYSV_ABI int (*)(char16_t* outText, u32* outTextLength, const char16_t* srcText, u32 srcTextLength); using OrbisImeEventHandler = PS4_SYSV_ABI void (*)(void* arg, const OrbisImeEvent* e); + +enum class OrbisImeKeyboardOption : u32 { + Default = 0, + Repeat = 1, + RepeatEachKey = 2, + AddOsk = 4, + EffectiveWithIme = 8, + DisableResume = 16, + DisableCapslockWithoutShift = 32, +}; +DECLARE_ENUM_FLAG_OPERATORS(OrbisImeKeyboardOption) + +struct OrbisImeKeyboardParam { + OrbisImeKeyboardOption option; + s8 reserved1[4]; + void* arg; + OrbisImeEventHandler handler; + s8 reserved2[8]; +}; + +struct OrbisImeParam { + s32 user_id; // Todo: switch to OrbisUserServiceUserId + OrbisImeType type; + u64 supported_languages; // OrbisImeLanguage flags + OrbisImeEnterLabel enter_label; + OrbisImeInputMethod input_method; + OrbisImeTextFilter filter; + OrbisImeOption option; + u32 maxTextLength; + char16_t* inputTextBuffer; + f32 posx; + f32 posy; + OrbisImeHorizontalAlignment horizontal_alignment; + OrbisImeVerticalAlignment vertical_alignment; + void* work; + void* arg; + OrbisImeEventHandler handler; + s8 reserved[8]; +}; + +struct OrbisImeCaret { + f32 x; + f32 y; + u32 height; + u32 index; +}; + +struct OrbisImeDialogParam { + s32 user_id; + OrbisImeType type; + u64 supported_languages; // OrbisImeLanguage flags + OrbisImeEnterLabel enter_label; + OrbisImeInputMethod input_method; + OrbisImeTextFilter filter; + OrbisImeOption option; + u32 max_text_length; + char16_t* input_text_buffer; + f32 posx; + f32 posy; + OrbisImeHorizontalAlignment horizontal_alignment; + OrbisImeVerticalAlignment vertical_alignment; + const char16_t* placeholder; + const char16_t* title; + s8 reserved[16]; +}; + +struct OrbisImeParamExtended { + u32 option; // OrbisImeExtOption flags + OrbisImeColor color_base; + OrbisImeColor color_line; + OrbisImeColor color_text_field; + OrbisImeColor color_preedit; + OrbisImeColor color_button_default; + OrbisImeColor color_button_function; + OrbisImeColor color_button_symbol; + OrbisImeColor color_text; + OrbisImeColor color_special; + OrbisImePanelPriority priority; + char* additional_dictionary_path; + OrbisImeExtKeyboardFilter ext_keyboard_filter; + u32 disable_device; + u32 ext_keyboard_mode; + s8 reserved[60]; +}; diff --git a/src/core/libraries/ime/ime_dialog.cpp b/src/core/libraries/ime/ime_dialog.cpp index bee185787..6f808636b 100644 --- a/src/core/libraries/ime/ime_dialog.cpp +++ b/src/core/libraries/ime/ime_dialog.cpp @@ -20,19 +20,19 @@ static OrbisImeDialogResult g_ime_dlg_result{}; static ImeDialogState g_ime_dlg_state{}; static ImeDialogUi g_ime_dlg_ui; -static bool IsValidOption(OrbisImeDialogOption option, OrbisImeType type) { - if (False(~option & - (OrbisImeDialogOption::Multiline | OrbisImeDialogOption::NoAutoCompletion))) { +static bool IsValidOption(OrbisImeOption option, OrbisImeType type) { + if (False(~option & (OrbisImeOption::MULTILINE | + OrbisImeOption::NO_AUTO_CAPITALIZATION /* NoAutoCompletion */))) { return false; } - if (True(option & OrbisImeDialogOption::Multiline) && type != OrbisImeType::Default && + if (True(option & OrbisImeOption::MULTILINE) && type != OrbisImeType::Default && type != OrbisImeType::BasicLatin) { return false; } - if (True(option & OrbisImeDialogOption::NoAutoCompletion) && type != OrbisImeType::Number && - type != OrbisImeType::BasicLatin) { + if (True(option & OrbisImeOption::NO_AUTO_CAPITALIZATION /* NoAutoCompletion */) && + type != OrbisImeType::Number && type != OrbisImeType::BasicLatin) { return false; } @@ -96,7 +96,7 @@ Error PS4_SYSV_ABI sceImeDialogGetPanelSize(const OrbisImeDialogParam* param, u3 case OrbisImeType::Url: case OrbisImeType::Mail: *width = 500; // original: 793 - if (True(param->option & OrbisImeDialogOption::Multiline)) { + if (True(param->option & OrbisImeOption::MULTILINE)) { *height = 300; // original: 576 } else { *height = 150; // original: 476 @@ -149,18 +149,20 @@ OrbisImeDialogStatus PS4_SYSV_ABI sceImeDialogGetStatus() { } Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExtended* extended) { + LOG_INFO(Lib_ImeDialog, ">> sceImeDialogInit: entering, param={}, extended={}", + static_cast(param), static_cast(extended)); if (g_ime_dlg_status != OrbisImeDialogStatus::None) { - LOG_INFO(Lib_ImeDialog, "IME dialog is already running"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: busy (status=%u)", (u32)g_ime_dlg_status); return Error::BUSY; } if (param == nullptr) { - LOG_INFO(Lib_ImeDialog, "called with param (NULL)"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: param is null"); return Error::INVALID_ADDRESS; } if (!magic_enum::enum_contains(param->type)) { - LOG_INFO(Lib_ImeDialog, "Invalid param->type"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid param->type=%u", (u32)param->type); return Error::INVALID_ADDRESS; } @@ -168,16 +170,14 @@ Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExt // TODO: do correct param->supportedLanguages validation if (param->posx < 0.0f || - param->posx >= - MAX_X_POSITIONS[False(param->option & OrbisImeDialogOption::LargeResolution)]) { - LOG_INFO(Lib_ImeDialog, "Invalid param->posx"); + param->posx >= MAX_X_POSITIONS[False(param->option & OrbisImeOption::USE_2K_COORDINATES)]) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid posx=%f", param->posx); return Error::INVALID_POSX; } if (param->posy < 0.0f || - param->posy >= - MAX_Y_POSITIONS[False(param->option & OrbisImeDialogOption::LargeResolution)]) { - LOG_INFO(Lib_ImeDialog, "Invalid param->posy"); + param->posy >= MAX_Y_POSITIONS[False(param->option & OrbisImeOption::USE_2K_COORDINATES)]) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid posy=%f", param->posy); return Error::INVALID_POSY; } @@ -192,12 +192,13 @@ Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExt } if (!IsValidOption(param->option, param->type)) { - LOG_INFO(Lib_ImeDialog, "Invalid param->option"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid option=0x%X for type=%u", + static_cast(param->option), (u32)param->type); return Error::INVALID_PARAM; } if (param->input_text_buffer == nullptr) { - LOG_INFO(Lib_ImeDialog, "Invalid param->inputTextBuffer"); + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: input_text_buffer is null"); return Error::INVALID_INPUT_TEXT_BUFFER; } @@ -220,16 +221,24 @@ Error PS4_SYSV_ABI sceImeDialogInit(OrbisImeDialogParam* param, OrbisImeParamExt } } - if (param->max_text_length > ORBIS_IME_DIALOG_MAX_TEXT_LENGTH) { - LOG_INFO(Lib_ImeDialog, "Invalid param->maxTextLength"); + if (param->max_text_length == 0 || param->max_text_length > ORBIS_IME_MAX_TEXT_LENGTH) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: invalid max_text_length=%u", + param->max_text_length); return Error::INVALID_MAX_TEXT_LENGTH; } + // Title string validation + if (param->title != nullptr && !std::char_traits::length(param->title)) { + LOG_ERROR(Lib_ImeDialog, "sceImeDialogInit: title is empty"); + return Error::INVALID_PARAM; + } + g_ime_dlg_result = {}; g_ime_dlg_state = ImeDialogState(param, extended); g_ime_dlg_status = OrbisImeDialogStatus::Running; g_ime_dlg_ui = ImeDialogUi(&g_ime_dlg_state, &g_ime_dlg_status, &g_ime_dlg_result); + LOG_INFO(Lib_ImeDialog, "<< sceImeDialogInit: successful, status now=Running"); return Error::OK; } diff --git a/src/core/libraries/ime/ime_dialog.h b/src/core/libraries/ime/ime_dialog.h index 526e5f022..a056fdd5e 100644 --- a/src/core/libraries/ime/ime_dialog.h +++ b/src/core/libraries/ime/ime_dialog.h @@ -13,50 +13,6 @@ class SymbolsResolver; namespace Libraries::ImeDialog { -constexpr u32 ORBIS_IME_DIALOG_MAX_TEXT_LENGTH = 2048; - -enum class Error : u32 { - OK = 0x0, - BUSY = 0x80bc0001, - NOT_OPENED = 0x80bc0002, - NO_MEMORY = 0x80bc0003, - CONNECTION_FAILED = 0x80bc0004, - TOO_MANY_REQUESTS = 0x80bc0005, - INVALID_TEXT = 0x80bc0006, - EVENT_OVERFLOW = 0x80bc0007, - NOT_ACTIVE = 0x80bc0008, - IME_SUSPENDING = 0x80bc0009, - DEVICE_IN_USE = 0x80bc000a, - INVALID_USER_ID = 0x80bc0010, - INVALID_TYPE = 0x80bc0011, - INVALID_SUPPORTED_LANGUAGES = 0x80bc0012, - INVALID_ENTER_LABEL = 0x80bc0013, - INVALID_INPUT_METHOD = 0x80bc0014, - INVALID_OPTION = 0x80bc0015, - INVALID_MAX_TEXT_LENGTH = 0x80bc0016, - INVALID_INPUT_TEXT_BUFFER = 0x80bc0017, - INVALID_POSX = 0x80bc0018, - INVALID_POSY = 0x80bc0019, - INVALID_HORIZONTALIGNMENT = 0x80bc001a, - INVALID_VERTICALALIGNMENT = 0x80bc001b, - INVALID_EXTENDED = 0x80bc001c, - INVALID_KEYBOARD_TYPE = 0x80bc001d, - INVALID_WORK = 0x80bc0020, - INVALID_ARG = 0x80bc0021, - INVALID_HANDLER = 0x80bc0022, - NO_RESOURCE_ID = 0x80bc0023, - INVALID_MODE = 0x80bc0024, - INVALID_PARAM = 0x80bc0030, - INVALID_ADDRESS = 0x80bc0031, - INVALID_RESERVED = 0x80bc0032, - INVALID_TIMING = 0x80bc0033, - INTERNAL = 0x80bc00ff, - DIALOG_INVALID_TITLE = 0x80bc0101, - DIALOG_NOT_RUNNING = 0x80bc0105, - DIALOG_NOT_FINISHED = 0x80bc0106, - DIALOG_NOT_IN_USE = 0x80bc0107, -}; - enum class OrbisImeDialogStatus : u32 { None = 0, Running = 1, @@ -69,87 +25,11 @@ enum class OrbisImeDialogEndStatus : u32 { Aborted = 2, }; -enum class OrbisImeDialogOption : u32 { - Default = 0, - Multiline = 1, - NoAutoCorrection = 2, - NoAutoCompletion = 4, - // TODO: Document missing options - LargeResolution = 1024, -}; -DECLARE_ENUM_FLAG_OPERATORS(OrbisImeDialogOption) - -enum class OrbisImePanelPriority : u32 { - Default = 0, - Alphabet = 1, - Symbol = 2, - Accent = 3, -}; - -struct OrbisImeColor { - u8 r; - u8 g; - u8 b; - u8 a; -}; - struct OrbisImeDialogResult { OrbisImeDialogEndStatus endstatus; s32 reserved[12]; }; -struct OrbisImeKeycode { - u16 keycode; - char16_t character; - u32 status; - OrbisImeKeyboardType type; - s32 user_id; - u32 resource_id; - u64 timestamp; -}; - -using OrbisImeExtKeyboardFilter = PS4_SYSV_ABI int (*)(const OrbisImeKeycode* srcKeycode, - u16* outKeycode, u32* outStatus, - void* reserved); - -struct OrbisImeDialogParam { - s32 user_id; - OrbisImeType type; - u64 supported_languages; - OrbisImeEnterLabel enter_label; - OrbisImeInputMethod input_method; - OrbisImeTextFilter filter; - OrbisImeDialogOption option; - u32 max_text_length; - char16_t* input_text_buffer; - float posx; - float posy; - OrbisImeHorizontalAlignment horizontal_alignment; - OrbisImeVerticalAlignment vertical_alignment; - const char16_t* placeholder; - const char16_t* title; - s8 reserved[16]; -}; - -struct OrbisImeParamExtended { - u32 option; // OrbisImeDialogOptionExtended - OrbisImeColor color_base; - OrbisImeColor color_line; - OrbisImeColor color_text_field; - OrbisImeColor color_preedit; - OrbisImeColor color_button_default; - OrbisImeColor color_button_function; - OrbisImeColor color_button_symbol; - OrbisImeColor color_text; - OrbisImeColor color_special; - OrbisImePanelPriority priority; - char* additional_dictionary_path; - OrbisImeExtKeyboardFilter ext_keyboard_filter; - uint32_t disable_device; - uint32_t ext_keyboard_mode; - int8_t reserved[60]; -}; - Error PS4_SYSV_ABI sceImeDialogAbort(); Error PS4_SYSV_ABI sceImeDialogForceClose(); Error PS4_SYSV_ABI sceImeDialogForTestFunction(); diff --git a/src/core/libraries/ime/ime_dialog_ui.cpp b/src/core/libraries/ime/ime_dialog_ui.cpp index 51183c79b..746a2c8d3 100644 --- a/src/core/libraries/ime/ime_dialog_ui.cpp +++ b/src/core/libraries/ime/ime_dialog_ui.cpp @@ -21,12 +21,16 @@ namespace Libraries::ImeDialog { ImeDialogState::ImeDialogState(const OrbisImeDialogParam* param, const OrbisImeParamExtended* extended) { + LOG_INFO(Lib_ImeDialog, ">> ImeDialogState::Ctor: param={}, text_buffer={}", + static_cast(param), + static_cast(param ? param->input_text_buffer : nullptr)); if (!param) { + LOG_ERROR(Lib_ImeDialog, " param==nullptr, returning without init"); return; } user_id = param->user_id; - is_multi_line = True(param->option & OrbisImeDialogOption::Multiline); + is_multi_line = True(param->option & OrbisImeOption::MULTILINE); is_numeric = param->type == OrbisImeType::Number; type = param->type; enter_label = param->enter_label; @@ -220,6 +224,7 @@ void ImeDialogUi::Free() { void ImeDialogUi::Draw() { std::unique_lock lock{draw_mutex}; + LOG_INFO(Lib_ImeDialog, ">> ImeDialogUi::Draw: first_render=%d", first_render); if (!state) { return; @@ -259,9 +264,13 @@ void ImeDialogUi::Draw() { } if (state->is_multi_line) { + LOG_INFO(Lib_ImeDialog, " Drawing multi-line widget…"); DrawMultiLineInputText(); + LOG_INFO(Lib_ImeDialog, " Done DrawMultiLineInputText"); } else { + LOG_INFO(Lib_ImeDialog, " Drawing input text widget…"); DrawInputText(); + LOG_INFO(Lib_ImeDialog, " Done DrawInputText"); } SetCursorPosY(GetCursorPosY() + 10.0f); @@ -306,6 +315,7 @@ void ImeDialogUi::Draw() { End(); first_render = false; + LOG_INFO(Lib_ImeDialog, "<< ImeDialogUi::Draw complete"); } void ImeDialogUi::DrawInputText() { @@ -316,7 +326,7 @@ void ImeDialogUi::DrawInputText() { } const char* placeholder = state->placeholder.empty() ? nullptr : state->placeholder.data(); if (InputTextEx("##ImeDialogInput", placeholder, state->current_text.begin(), - state->max_text_length, input_size, ImGuiInputTextFlags_CallbackCharFilter, + state->max_text_length + 1, input_size, ImGuiInputTextFlags_CallbackCharFilter, InputTextCallback, this)) { state->input_changed = true; } @@ -332,7 +342,7 @@ void ImeDialogUi::DrawMultiLineInputText() { } const char* placeholder = state->placeholder.empty() ? nullptr : state->placeholder.data(); if (InputTextEx("##ImeDialogInput", placeholder, state->current_text.begin(), - state->max_text_length, input_size, flags, InputTextCallback, this)) { + state->max_text_length + 1, input_size, flags, InputTextCallback, this)) { state->input_changed = true; } } @@ -341,13 +351,19 @@ int ImeDialogUi::InputTextCallback(ImGuiInputTextCallbackData* data) { ImeDialogUi* ui = static_cast(data->UserData); ASSERT(ui); + LOG_DEBUG(Lib_ImeDialog, ">> InputTextCallback: EventFlag={}, EventChar={}", data->EventFlag, + data->EventChar); + // Should we filter punctuation? if (ui->state->is_numeric && (data->EventChar < '0' || data->EventChar > '9') && data->EventChar != '\b' && data->EventChar != ',' && data->EventChar != '.') { + LOG_INFO(Lib_ImeDialog, "InputTextCallback: rejecting non-digit char '{}'", + static_cast(data->EventChar)); return 1; } if (!ui->state->keyboard_filter) { + LOG_DEBUG(Lib_ImeDialog, "InputTextCallback: no keyboard_filter, accepting char"); return 0; } @@ -367,16 +383,20 @@ int ImeDialogUi::InputTextCallback(ImGuiInputTextCallbackData* data) { }; if (!ui->state->ConvertUTF8ToOrbis(event_char, 4, &src_keycode.character, 1)) { - LOG_ERROR(Lib_ImeDialog, "Failed to convert orbis char to utf8"); + LOG_ERROR(Lib_ImeDialog, "InputTextCallback: ConvertUTF8ToOrbis failed"); return 0; } + LOG_DEBUG(Lib_ImeDialog, "InputTextCallback: converted to Orbis char={:#X}", + static_cast(src_keycode.character)); src_keycode.keycode = src_keycode.character; // TODO set this to the correct value u16 out_keycode; u32 out_status; - ui->state->CallKeyboardFilter(&src_keycode, &out_keycode, &out_status); - + bool keep = ui->state->CallKeyboardFilter(&src_keycode, &out_keycode, &out_status); + LOG_DEBUG(Lib_ImeDialog, + "InputTextCallback: CallKeyboardFilter returned %s (keycode=0x%X, status=0x%X)", + keep ? "true" : "false", out_keycode, out_status); // TODO. set the keycode return 0; diff --git a/src/core/libraries/ime/ime_ui.cpp b/src/core/libraries/ime/ime_ui.cpp index 37f25e200..c49c70ede 100644 --- a/src/core/libraries/ime/ime_ui.cpp +++ b/src/core/libraries/ime/ime_ui.cpp @@ -199,7 +199,7 @@ int ImeUi::InputTextCallback(ImGuiInputTextCallbackData* data) { eventParam.caret_index = data->CursorPos; eventParam.area_num = 1; - eventParam.text_area[0].mode = 1; // Edit mode + eventParam.text_area[0].mode = OrbisImeTextAreaMode::Edit; eventParam.text_area[0].index = data->CursorPos; eventParam.text_area[0].length = data->BufTextLen; From 2d1a2982df408d93f6d5c42e70183ed87db11616 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Tue, 8 Jul 2025 10:32:39 +0300 Subject: [PATCH 09/22] buffer_cache: Bring back upload batching and temporary buffer (#3211) * buffer_cache: Bring back upload batching and temporary buffer Because that PR fused the write and read protections under a single function call, it was a requirement to move the actual memory copy part inside the lambda to perform it before the read protection kicks in. However on certain large data transfers it had potential for data corruption. If, for example, an upload had two copies, a 400MB and a 300MB one, the first one would fit in the staging buffer, very likely with an induced stall. However the second one wouldn't have space to fit alongside the other data, but it's also small enough for the buffer to fit it, so the staging buffer would cause a flush and wait to copy it, overwriting the previous transfer. To address this the upload function has been reworked to allow for batching like previously but with the new locking behavior. Also the condition to use temporary buffers has been expanded to also include cases when staging buffer will stall, which should increase performance a little in some cases. * buffer_cache: Move buffer barriers and copy outside of lock range --- src/video_core/buffer_cache/buffer.cpp | 30 +++++++--- src/video_core/buffer_cache/buffer.h | 8 +-- src/video_core/buffer_cache/buffer_cache.cpp | 56 +++++++++++++++---- src/video_core/buffer_cache/buffer_cache.h | 3 + src/video_core/buffer_cache/memory_tracker.h | 28 +++++++--- src/video_core/renderer_vulkan/vk_scheduler.h | 6 +- 6 files changed, 96 insertions(+), 35 deletions(-) diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp index 15bf0d81e..e85a6eb18 100644 --- a/src/video_core/buffer_cache/buffer.cpp +++ b/src/video_core/buffer_cache/buffer.cpp @@ -137,12 +137,15 @@ StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& size_bytes); } -std::pair StreamBuffer::Map(u64 size, u64 alignment) { +std::pair StreamBuffer::Map(u64 size, u64 alignment, bool allow_wait) { if (!is_coherent && usage == MemoryUsage::Stream) { size = Common::AlignUp(size, instance->NonCoherentAtomSize()); } - ASSERT(size <= this->size_bytes); + if (size > this->size_bytes) { + return {nullptr, 0}; + } + mapped_size = size; if (alignment > 0) { @@ -162,8 +165,11 @@ std::pair StreamBuffer::Map(u64 size, u64 alignment) { } const u64 mapped_upper_bound = offset + size; - WaitPendingOperations(mapped_upper_bound); - return std::make_pair(mapped_data.data() + offset, offset); + if (!WaitPendingOperations(mapped_upper_bound, allow_wait)) { + return {nullptr, 0}; + } + + return {mapped_data.data() + offset, offset}; } void StreamBuffer::Commit() { @@ -177,6 +183,12 @@ void StreamBuffer::Commit() { } offset += mapped_size; + if (current_watch_cursor != 0 && + current_watches[current_watch_cursor].tick == scheduler->CurrentTick()) { + current_watches[current_watch_cursor].upper_bound = offset; + return; + } + if (current_watch_cursor + 1 >= current_watches.size()) { // Ensure that there are enough watches. ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK); @@ -191,16 +203,20 @@ void StreamBuffer::ReserveWatches(std::vector& watches, std::size_t grow_ watches.resize(watches.size() + grow_size); } -void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) { +bool StreamBuffer::WaitPendingOperations(u64 requested_upper_bound, bool allow_wait) { if (!invalidation_mark) { - return; + return true; } while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) { auto& watch = previous_watches[wait_cursor]; - wait_bound = watch.upper_bound; + if (!scheduler->IsFree(watch.tick) && !allow_wait) { + return false; + } scheduler->Wait(watch.tick); + wait_bound = watch.upper_bound; ++wait_cursor; } + return true; } } // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index 530968787..a7a0ce84f 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -168,7 +168,7 @@ public: MemoryUsage usage, u64 size_bytes_); /// Reserves a region of memory from the stream buffer. - std::pair Map(u64 size, u64 alignment = 0); + std::pair Map(u64 size, u64 alignment = 0, bool allow_wait = true); /// Ensures that reserved bytes of memory are available to the GPU. void Commit(); @@ -181,10 +181,6 @@ public: return offset; } - u64 GetFreeSize() const { - return size_bytes - offset - mapped_size; - } - private: struct Watch { u64 tick{}; @@ -195,7 +191,7 @@ private: void ReserveWatches(std::vector& watches, std::size_t grow_size); /// Waits pending watches until requested upper bound. - void WaitPendingOperations(u64 requested_upper_bound); + bool WaitPendingOperations(u64 requested_upper_bound, bool allow_wait); private: u64 offset{}; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index d55e05d1e..8a7e99ea0 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -137,8 +137,7 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { return; } memory_tracker->InvalidateRegion( - device_addr, size, Config::readbacks(), - [this, device_addr, size] { ReadMemory(device_addr, size, true); }); + device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); }); } void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) { @@ -817,22 +816,22 @@ void BufferCache::ChangeRegister(BufferId buffer_id) { void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer) { boost::container::small_vector copies; + size_t total_size_bytes = 0; VAddr buffer_start = buffer.CpuAddr(); + vk::Buffer src_buffer = VK_NULL_HANDLE; memory_tracker->ForEachUploadRange( - device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) { - const u64 offset = staging_buffer.Copy(device_addr_out, range_size); - copies.push_back(vk::BufferCopy{ - .srcOffset = offset, - .dstOffset = device_addr_out - buffer_start, - .size = range_size, - }); - }); + device_addr, size, is_written, + [&](u64 device_addr_out, u64 range_size) { + copies.emplace_back(total_size_bytes, device_addr_out - buffer_start, range_size); + total_size_bytes += range_size; + }, + [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); }); SCOPE_EXIT { if (is_texel_buffer) { SynchronizeBufferFromImage(buffer, device_addr, size); } }; - if (copies.empty()) { + if (!src_buffer) { return; } scheduler.EndRendering(); @@ -861,7 +860,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, .bufferMemoryBarrierCount = 1, .pBufferMemoryBarriers = &pre_barrier, }); - cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies); + cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -869,6 +868,39 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, }); } +vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span copies, + size_t total_size_bytes) { + if (copies.empty()) { + return VK_NULL_HANDLE; + } + const auto [staging, offset] = staging_buffer.Map(total_size_bytes); + if (staging) { + for (auto& copy : copies) { + u8* const src_pointer = staging + copy.srcOffset; + const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; + std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); + // Apply the staging offset + copy.srcOffset += offset; + } + staging_buffer.Commit(); + return staging_buffer.Handle(); + } else { + // For large one time transfers use a temporary host buffer. + auto temp_buffer = + std::make_unique(instance, scheduler, MemoryUsage::Upload, 0, + vk::BufferUsageFlagBits::eTransferSrc, total_size_bytes); + const vk::Buffer src_buffer = temp_buffer->Handle(); + u8* const staging = temp_buffer->mapped_data.data(); + for (const auto& copy : copies) { + u8* const src_pointer = staging + copy.srcOffset; + const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset; + std::memcpy(src_pointer, std::bit_cast(device_addr), copy.size); + } + scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable { buffer.reset(); }); + return src_buffer; + } +} + bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) { boost::container::small_vector image_ids; texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) { diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 354d01431..b509ce2d0 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -194,6 +194,9 @@ private: void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer); + vk::Buffer UploadCopies(Buffer& buffer, std::span copies, + size_t total_size_bytes); + bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size); void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes); diff --git a/src/video_core/buffer_cache/memory_tracker.h b/src/video_core/buffer_cache/memory_tracker.h index ca87c7df0..ec0878c3b 100644 --- a/src/video_core/buffer_cache/memory_tracker.h +++ b/src/video_core/buffer_cache/memory_tracker.h @@ -62,17 +62,17 @@ public: } /// Removes all protection from a page and ensures GPU data has been flushed if requested - void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept { + void InvalidateRegion(VAddr cpu_addr, u64 size, auto&& on_flush) noexcept { IteratePages( - cpu_addr, size, - [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) { + cpu_addr, size, [&on_flush](RegionManager* manager, u64 offset, size_t size) { const bool should_flush = [&] { // Perform both the GPU modification check and CPU state change with the lock // in case we are racing with GPU thread trying to mark the page as GPU // modified. If we need to flush the flush function is going to perform CPU // state change. std::scoped_lock lk{manager->lock}; - if (try_flush && manager->template IsRegionModified(offset, size)) { + if (Config::readbacks() && + manager->template IsRegionModified(offset, size)) { return true; } manager->template ChangeRegionState( @@ -86,17 +86,27 @@ public: } /// Call 'func' for each CPU modified range and unmark those pages as CPU modified - void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) { + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func, + auto&& on_upload) { IteratePages(query_cpu_range, query_size, [&func, is_written](RegionManager* manager, u64 offset, size_t size) { - std::scoped_lock lk{manager->lock}; + manager->lock.lock(); manager->template ForEachModifiedRange( manager->GetCpuAddr() + offset, size, func); - if (is_written) { - manager->template ChangeRegionState( - manager->GetCpuAddr() + offset, size); + if (!is_written) { + manager->lock.unlock(); } }); + on_upload(); + if (!is_written) { + return; + } + IteratePages(query_cpu_range, query_size, + [&func, is_written](RegionManager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + manager->lock.unlock(); + }); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 36fd9c055..bd6fb549a 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -347,7 +347,11 @@ public: } /// Returns true when a tick has been triggered by the GPU. - [[nodiscard]] bool IsFree(u64 tick) const noexcept { + [[nodiscard]] bool IsFree(u64 tick) noexcept { + if (master_semaphore.IsFree(tick)) { + return true; + } + master_semaphore.Refresh(); return master_semaphore.IsFree(tick); } From e5f899aae3b895fc383019adba56859997a449fe Mon Sep 17 00:00:00 2001 From: kalaposfos13 <153381648+kalaposfos13@users.noreply.github.com> Date: Wed, 9 Jul 2025 03:38:28 +0200 Subject: [PATCH 10/22] Fix brace elision for designated initializer warning (#3215) --- src/core/libraries/ime/ime_dialog_ui.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/libraries/ime/ime_dialog_ui.cpp b/src/core/libraries/ime/ime_dialog_ui.cpp index 746a2c8d3..800ba1124 100644 --- a/src/core/libraries/ime/ime_dialog_ui.cpp +++ b/src/core/libraries/ime/ime_dialog_ui.cpp @@ -379,7 +379,7 @@ int ImeDialogUi::InputTextCallback(ImGuiInputTextCallbackData* data) { // the current language?) .user_id = ui->state->user_id, .resource_id = 0, - .timestamp = 0, + .timestamp = {0}, }; if (!ui->state->ConvertUTF8ToOrbis(event_char, 4, &src_keycode.character, 1)) { From df4314f8312aead1707a632db4e30552012eb171 Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Wed, 9 Jul 2025 03:39:51 +0200 Subject: [PATCH 11/22] Extend Qt detection to support multiple drives (#3209) --- cmake/DetectQtInstallation.cmake | 34 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cmake/DetectQtInstallation.cmake b/cmake/DetectQtInstallation.cmake index e95e8980f..650cc9745 100644 --- a/cmake/DetectQtInstallation.cmake +++ b/cmake/DetectQtInstallation.cmake @@ -1,14 +1,28 @@ # SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project # SPDX-License-Identifier: GPL-2.0-or-later -file(GLOB QT_KITS LIST_DIRECTORIES true "C:/Qt/*/msvc*_64") -list(SORT QT_KITS COMPARE NATURAL) -list(REVERSE QT_KITS) -if(QT_KITS) - list(GET QT_KITS 0 QT_PREFIX) - set(CMAKE_PREFIX_PATH "${QT_PREFIX}" CACHE PATH "Qt prefix auto‑detected" FORCE) - message(STATUS "Auto-detected Qt prefix: ${QT_PREFIX}") -else() - message(STATUS "findQt.cmake: no Qt‑Directory found in C:/Qt – please set CMAKE_PREFIX_PATH manually") -endif() +set(highest_version "0") +set(CANDIDATE_DRIVES A B C D E F G H I J K L M N O P Q R S T U V W X Y Z) +foreach(drive ${CANDIDATE_DRIVES}) + file(GLOB kits LIST_DIRECTORIES true CONFIGURE_DEPENDS "${drive}:/Qt/*/msvc*_64") + foreach(kit IN LISTS kits) + get_filename_component(version_dir "${kit}" DIRECTORY) + get_filename_component(kit_version "${version_dir}" NAME) + + message(STATUS "DetectQtInstallation.cmake: Detected Qt: ${kit}") + + if (kit_version VERSION_GREATER highest_version) + set(highest_version "${kit_version}") + set(QT_PREFIX "${kit}") + + endif() + endforeach() +endforeach() + +if(QT_PREFIX) + set(CMAKE_PREFIX_PATH "${QT_PREFIX}" CACHE PATH "Qt prefix auto‑detected" FORCE) + message(STATUS "DetectQtInstallation.cmake: Choose newest Qt: ${QT_PREFIX}") +else() + message(STATUS "DetectQtInstallation.cmake: No Qt‑Directory found in :/Qt – please set CMAKE_PREFIX_PATH manually") +endif() From f5336358ea5a2ae3a2bbcf857468d0d847698e25 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios Date: Wed, 9 Jul 2025 13:55:21 +0300 Subject: [PATCH 12/22] Zero top bits in INSERTQ/EXTRQ (#3217) * Zero top bits in INSERTQ/EXTRQ * Clang-format * Don't assert --- src/core/cpu_patches.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 8512858e9..e4f65cd31 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -163,7 +163,9 @@ static void GenerateEXTRQ(void* /* address */, const ZydisDecodedOperand* operan mask = (1ULL << length) - 1; } - ASSERT_MSG(length + index <= 64, "length + index must be less than or equal to 64."); + if (length + index > 64) { + mask = 0xFFFF'FFFF'FFFF'FFFF; + } // Get lower qword from xmm register c.vmovq(scratch1, xmm_dst); @@ -177,8 +179,8 @@ static void GenerateEXTRQ(void* /* address */, const ZydisDecodedOperand* operan c.mov(scratch2, mask); c.and_(scratch1, scratch2); - // Writeback to xmm register, extrq instruction says top 64-bits are undefined so we don't - // care to preserve them + // Writeback to xmm register, extrq instruction says top 64-bits are undefined but zeroed on + // AMD CPUs c.vmovq(xmm_dst, scratch1); c.pop(scratch2); @@ -287,7 +289,9 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper mask_value = (1ULL << length) - 1; } - ASSERT_MSG(length + index <= 64, "length + index must be less than or equal to 64."); + if (length + index > 64) { + mask_value = 0xFFFF'FFFF'FFFF'FFFF; + } c.vmovq(scratch1, xmm_src); c.vmovq(scratch2, xmm_dst); @@ -307,8 +311,9 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper // dst |= src c.or_(scratch2, scratch1); - // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected - c.vpinsrq(xmm_dst, xmm_dst, scratch2, 0); + // Insert scratch2 into low 64 bits of dst, upper 64 bits are undefined but zeroed on AMD + // CPUs + c.vmovq(xmm_dst, scratch2); c.pop(mask); c.pop(scratch2); @@ -374,7 +379,7 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper c.and_(scratch2, mask); c.or_(scratch2, scratch1); - // Upper 64 bits are undefined in insertq + // Upper 64 bits are undefined in insertq but AMD CPUs zero them c.vmovq(xmm_dst, scratch2); c.pop(mask); @@ -635,6 +640,7 @@ static bool TryExecuteIllegalInstruction(void* ctx, void* code_address) { lowQWordDst >>= index; lowQWordDst &= mask; + memset((u8*)dst + sizeof(u64), 0, sizeof(u64)); memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); Common::IncrementRip(ctx, 4); @@ -675,6 +681,7 @@ static bool TryExecuteIllegalInstruction(void* ctx, void* code_address) { lowQWordDst &= ~(mask << index); lowQWordDst |= lowQWordSrc << index; + memset((u8*)dst + sizeof(u64), 0, sizeof(u64)); memcpy(dst, &lowQWordDst, sizeof(lowQWordDst)); Common::IncrementRip(ctx, 4); From 7d4b875ee33c47abc4edaa5a2ddacfbb7d32f9d8 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Wed, 9 Jul 2025 17:00:06 +0300 Subject: [PATCH 13/22] Random fixes (#3216) * buffer_cache: Handle inline data to flexible memory * control_flow: Fix single instruction scopes edge case Fixes the following pattern v_cmpx_gt_u32 cond buffer_store_dword value .LABEL: Before buffer[index] = value; After if (cond) { buffer[index] = value; } * vector_memory: Handle soffset when offen is false When offen is not used we can substitute the offset argument with soffset and have it handled correctly * scalar_alu: Handle sharp moves with S_MOV_B64 This fixes unable to track sharp errors when this pattern is used in a shader * emulator: Add log * video_core: Bump binary info search range and buffer num --- src/emulator.cpp | 1 + .../backend/spirv/spirv_emit_context.cpp | 4 +++- .../frontend/control_flow_graph.cpp | 5 +++-- .../frontend/translate/scalar_alu.cpp | 9 +++++++++ .../frontend/translate/vector_memory.cpp | 12 +++++++++--- src/shader_recompiler/info.h | 2 +- src/video_core/amdgpu/liverpool.cpp | 2 ++ src/video_core/amdgpu/liverpool.h | 2 +- src/video_core/buffer_cache/buffer_cache.cpp | 5 ++++- 9 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/emulator.cpp b/src/emulator.cpp index 332287d22..480ceee0b 100644 --- a/src/emulator.cpp +++ b/src/emulator.cpp @@ -133,6 +133,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector ar LOG_INFO(Config, "General isNeo: {}", Config::isNeoModeConsole()); LOG_INFO(Config, "GPU isNullGpu: {}", Config::nullGpu()); LOG_INFO(Config, "GPU readbacks: {}", Config::readbacks()); + LOG_INFO(Config, "GPU readbackLinearImages: {}", Config::readbackLinearImages()); LOG_INFO(Config, "GPU directMemoryAccess: {}", Config::directMemoryAccess()); LOG_INFO(Config, "GPU shouldDumpShaders: {}", Config::dumpShaders()); LOG_INFO(Config, "GPU vblankDivider: {}", Config::vblankDiv()); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 77336c9ec..fe489f1b6 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -700,7 +700,7 @@ void EmitContext::DefineOutputs() { void EmitContext::DefinePushDataBlock() { // Create push constants block for instance steps rates const Id struct_type{Name(TypeStruct(U32[1], U32[1], F32[1], F32[1], F32[1], F32[1], U32[4], - U32[4], U32[4], U32[4], U32[4], U32[4]), + U32[4], U32[4], U32[4], U32[4], U32[4], U32[2]), "AuxData")}; Decorate(struct_type, spv::Decoration::Block); MemberName(struct_type, PushData::Step0Index, "sr0"); @@ -715,6 +715,7 @@ void EmitContext::DefinePushDataBlock() { MemberName(struct_type, PushData::UdRegsIndex + 3, "ud_regs3"); MemberName(struct_type, PushData::BufOffsetIndex + 0, "buf_offsets0"); MemberName(struct_type, PushData::BufOffsetIndex + 1, "buf_offsets1"); + MemberName(struct_type, PushData::BufOffsetIndex + 2, "buf_offsets2"); MemberDecorate(struct_type, PushData::Step0Index, spv::Decoration::Offset, 0U); MemberDecorate(struct_type, PushData::Step1Index, spv::Decoration::Offset, 4U); MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 8U); @@ -727,6 +728,7 @@ void EmitContext::DefinePushDataBlock() { MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 72U); MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 88U); MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 104U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 120U); push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant); Name(push_data_block, "push_data"); interfaces.push_back(push_data_block); diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp index b53db9e94..805fdb108 100644 --- a/src/shader_recompiler/frontend/control_flow_graph.cpp +++ b/src/shader_recompiler/frontend/control_flow_graph.cpp @@ -188,14 +188,15 @@ void CFG::SplitDivergenceScopes() { const bool is_close = is_close_scope(inst); if ((is_close || index == blk->end_index) && curr_begin != -1) { // If there are no instructions inside scope don't do anything. - if (index - curr_begin == 1) { + if (index - curr_begin == 1 && is_close) { curr_begin = -1; continue; } // If all instructions in the scope ignore exec masking, we shouldn't insert a // scope. const auto start = inst_list.begin() + curr_begin + 1; - if (!std::ranges::all_of(start, inst_list.begin() + index, IgnoresExecMask)) { + if (!std::ranges::all_of(start, inst_list.begin() + index + !is_close, + IgnoresExecMask)) { // Determine the first instruction affected by the exec mask. do { ++curr_begin; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 48f977f49..276b55567 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -586,6 +586,15 @@ void Translator::S_MOV(const GcnInst& inst) { } void Translator::S_MOV_B64(const GcnInst& inst) { + // Moving SGPR to SGPR is used for thread masks, like most operations, but it can also be used + // for moving sharps. + if (inst.dst[0].field == OperandField::ScalarGPR && + inst.src[0].field == OperandField::ScalarGPR) { + ir.SetScalarReg(IR::ScalarReg(inst.dst[0].code), + ir.GetScalarReg(IR::ScalarReg(inst.src[0].code))); + ir.SetScalarReg(IR::ScalarReg(inst.dst[0].code + 1), + ir.GetScalarReg(IR::ScalarReg(inst.src[0].code + 1))); + } const IR::U1 src = [&] { switch (inst.src[0].field) { case OperandField::VccLo: diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 91f545cfd..68b619c0a 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -193,8 +193,8 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ const IR::ScalarReg sharp{inst.src[2].code * 4}; const IR::Value soffset{GetSrc(inst.src[3])}; if (info.stage != Stage::Geometry) { - ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, - "Non immediate offset not supported"); + ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0 || !mubuf.offen, + "Having both scalar and vector offsets is not supported"); } const IR::Value address = [&] -> IR::Value { @@ -204,15 +204,21 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ if (mubuf.idxen && mubuf.offen) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1)); } + if (mubuf.idxen && !soffset.IsImmediate()) { + return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset); + } if (mubuf.idxen || mubuf.offen) { return ir.GetVectorReg(vaddr); } + if (!soffset.IsImmediate()) { + return soffset; + } return {}; }(); IR::BufferInstInfo buffer_info{}; buffer_info.index_enable.Assign(mubuf.idxen); - buffer_info.offset_enable.Assign(mubuf.offen); + buffer_info.offset_enable.Assign(mubuf.offen || !soffset.IsImmediate()); buffer_info.inst_offset.Assign(mubuf.offset); buffer_info.globally_coherent.Assign(mubuf.glc); buffer_info.system_coherent.Assign(mubuf.slc); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 72977b711..9703643e8 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -25,7 +25,7 @@ namespace Shader { static constexpr size_t NumUserDataRegs = 16; static constexpr size_t NumImages = 64; -static constexpr size_t NumBuffers = 32; +static constexpr size_t NumBuffers = 40; static constexpr size_t NumSamplers = 16; static constexpr size_t NumFMasks = 8; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index e264de74a..3e66fba6a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -603,6 +603,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanevent_index.Value() == EventIndex::ZpassDone) { + LOG_WARNING(Render, "Unimplemented occlusion query"); } break; } diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 0613823ab..c07e9f63a 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -88,7 +88,7 @@ struct Liverpool { } }; - static const BinaryInfo& SearchBinaryInfo(const u32* code, size_t search_limit = 0x1000) { + static const BinaryInfo& SearchBinaryInfo(const u32* code, size_t search_limit = 0x2000) { constexpr u32 token_mov_vcchi = 0xBEEB03FF; if (code[0] == token_mov_vcchi) { diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 8a7e99ea0..c1110e54d 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -312,7 +312,10 @@ void BufferCache::BindIndexBuffer(u32 index_offset) { void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); if (!is_gds) { - ASSERT(memory->TryWriteBacking(std::bit_cast(address), value, num_bytes)); + if (!memory->TryWriteBacking(std::bit_cast(address), value, num_bytes)) { + std::memcpy(std::bit_cast(address), value, num_bytes); + return; + } if (!IsRegionRegistered(address, num_bytes)) { return; } From dc6ef99dc7e33bdd0d06393672bebf3df22285f0 Mon Sep 17 00:00:00 2001 From: IndecisiveTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Wed, 9 Jul 2025 17:02:08 +0300 Subject: [PATCH 14/22] vector_memory: Handle immediate but non zero offset too Signed-off-by: georgemoralis --- .../frontend/translate/vector_memory.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 68b619c0a..df20f7f73 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -192,8 +192,9 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; const IR::Value soffset{GetSrc(inst.src[3])}; + const bool has_soffset = !soffset.IsImmediate() || soffset.U32() != 0; if (info.stage != Stage::Geometry) { - ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0 || !mubuf.offen, + ASSERT_MSG(!has_soffset || !mubuf.offen, "Having both scalar and vector offsets is not supported"); } @@ -204,13 +205,13 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ if (mubuf.idxen && mubuf.offen) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1)); } - if (mubuf.idxen && !soffset.IsImmediate()) { + if (mubuf.idxen && has_soffset) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset); } if (mubuf.idxen || mubuf.offen) { return ir.GetVectorReg(vaddr); } - if (!soffset.IsImmediate()) { + if (has_soffset) { return soffset; } return {}; @@ -218,7 +219,7 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ IR::BufferInstInfo buffer_info{}; buffer_info.index_enable.Assign(mubuf.idxen); - buffer_info.offset_enable.Assign(mubuf.offen || !soffset.IsImmediate()); + buffer_info.offset_enable.Assign(mubuf.offen || has_soffset); buffer_info.inst_offset.Assign(mubuf.offset); buffer_info.globally_coherent.Assign(mubuf.glc); buffer_info.system_coherent.Assign(mubuf.slc); From 27cbd6647f76531807f578d35bffe3bc21d70145 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 13:38:50 +0300 Subject: [PATCH 15/22] shader_recompiler: Reorganize data share operations and implement GDS bit (#3222) * shader_recompiler: Reorganize data share operations and implement GDS bit * Review comments --- .../backend/spirv/emit_spirv_atomic.cpp | 60 +++- .../backend/spirv/emit_spirv_instructions.h | 10 + .../backend/spirv/spirv_emit_context.cpp | 1 + .../frontend/translate/data_share.cpp | 317 +++++++----------- .../frontend/translate/translate.h | 14 +- .../frontend/translate/vector_alu.cpp | 6 +- src/shader_recompiler/ir/ir_emitter.cpp | 115 +++++-- src/shader_recompiler/ir/ir_emitter.h | 28 +- src/shader_recompiler/ir/microinstruction.cpp | 12 +- src/shader_recompiler/ir/opcodes.inc | 10 + .../ir/passes/resource_tracking_pass.cpp | 179 ++++++++-- .../ir/passes/shader_info_collection_pass.cpp | 10 + .../ir/passes/shared_memory_simplify_pass.cpp | 10 + .../passes/shared_memory_to_storage_pass.cpp | 42 ++- src/video_core/buffer_cache/buffer_cache.cpp | 2 + 15 files changed, 525 insertions(+), 291 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index e37acb2e4..80c8b836b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -54,17 +54,23 @@ Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, }); } +Id SharedAtomicU64IncDec(EmitContext& ctx, Id offset, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { + const Id shift_id{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + const Id pointer{ctx.EmitSharedMemoryAccess(ctx.shared_u64, ctx.shared_memory_u64, index)}; + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics); + }); +} + template Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& buffer = ctx.buffers[handle]; - const auto type = [&] { - if constexpr (is_float) { - return ctx.F32[1]; - } else { - return ctx.U32[1]; - } - }(); + const Id type = is_float ? ctx.F32[1] : ctx.U32[1]; if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { address = ctx.OpIAdd(ctx.U32[1], address, offset); } @@ -148,42 +154,82 @@ Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax); } +Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMax); +} + Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMax); } +Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMax); +} + Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMin); } +Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMin); +} + Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMin); } +Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMin); +} + Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicAnd); } +Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicAnd); +} + Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicOr); } +Id EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicOr); +} + Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicXor); } +Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicXor); +} + Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicISub); } +Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicISub); +} + Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset) { return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); } +Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset) { + return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement); +} + Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset) { return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); } +Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset) { + return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement); +} + Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 6e146c5f6..8a0c586e9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -139,15 +139,25 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset); Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset); Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value); Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); Id EmitCompositeConstructU32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index fe489f1b6..6a731d05c 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -76,6 +76,7 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf } else { SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450); } + String(fmt::format("{:#x}", info.pgm_hash)); AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 8ead93f78..634486fc4 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -3,7 +3,6 @@ #include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/ir/reg.h" -#include "shader_recompiler/profile.h" #include "shader_recompiler/runtime_info.h" namespace Shader::Gcn { @@ -12,29 +11,29 @@ void Translator::EmitDataShare(const GcnInst& inst) { switch (inst.opcode) { // DS case Opcode::DS_ADD_U32: - return DS_ADD_U32(inst, false); + return DS_OP(inst, AtomicOp::Add, false); case Opcode::DS_ADD_U64: - return DS_ADD_U64(inst, false); + return DS_OP(inst, AtomicOp::Add, false); case Opcode::DS_SUB_U32: - return DS_SUB_U32(inst, false); + return DS_OP(inst, AtomicOp::Sub, false); case Opcode::DS_INC_U32: - return DS_INC_U32(inst, false); + return DS_OP(inst, AtomicOp::Inc, false); case Opcode::DS_DEC_U32: - return DS_DEC_U32(inst, false); + return DS_OP(inst, AtomicOp::Dec, false); case Opcode::DS_MIN_I32: - return DS_MIN_U32(inst, true, false); + return DS_OP(inst, AtomicOp::Smin, false); case Opcode::DS_MAX_I32: - return DS_MAX_U32(inst, true, false); + return DS_OP(inst, AtomicOp::Smax, false); case Opcode::DS_MIN_U32: - return DS_MIN_U32(inst, false, false); + return DS_OP(inst, AtomicOp::Umin, false); case Opcode::DS_MAX_U32: - return DS_MAX_U32(inst, false, false); + return DS_OP(inst, AtomicOp::Umax, false); case Opcode::DS_AND_B32: - return DS_AND_B32(inst, false); + return DS_OP(inst, AtomicOp::And, false); case Opcode::DS_OR_B32: - return DS_OR_B32(inst, false); + return DS_OP(inst, AtomicOp::Or, false); case Opcode::DS_XOR_B32: - return DS_XOR_B32(inst, false); + return DS_OP(inst, AtomicOp::Xor, false); case Opcode::DS_WRITE_B32: return DS_WRITE(32, false, false, false, inst); case Opcode::DS_WRITE2_B32: @@ -42,19 +41,19 @@ void Translator::EmitDataShare(const GcnInst& inst) { case Opcode::DS_WRITE2ST64_B32: return DS_WRITE(32, false, true, true, inst); case Opcode::DS_ADD_RTN_U32: - return DS_ADD_U32(inst, true); + return DS_OP(inst, AtomicOp::Add, true); case Opcode::DS_SUB_RTN_U32: - return DS_SUB_U32(inst, true); + return DS_OP(inst, AtomicOp::Sub, true); case Opcode::DS_MIN_RTN_U32: - return DS_MIN_U32(inst, false, true); + return DS_OP(inst, AtomicOp::Umin, true); case Opcode::DS_MAX_RTN_U32: - return DS_MAX_U32(inst, false, true); + return DS_OP(inst, AtomicOp::Umax, true); case Opcode::DS_AND_RTN_B32: - return DS_AND_B32(inst, true); + return DS_OP(inst, AtomicOp::And, true); case Opcode::DS_OR_RTN_B32: - return DS_OR_B32(inst, true); + return DS_OP(inst, AtomicOp::Or, true); case Opcode::DS_XOR_RTN_B32: - return DS_XOR_B32(inst, true); + return DS_OP(inst, AtomicOp::Xor, true); case Opcode::DS_SWIZZLE_B32: return DS_SWIZZLE_B32(inst); case Opcode::DS_READ_B32: @@ -117,92 +116,63 @@ void Translator::V_WRITELANE_B32(const GcnInst& inst) { // DS -void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) { +template +void Translator::DS_OP(const GcnInst& inst, AtomicOp op, bool rtn) { + const bool is_gds = inst.control.ds.gds; const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; + const T data = [&] { + if (op == AtomicOp::Inc || op == AtomicOp::Dec) { + return T{}; + } + if constexpr (std::is_same_v) { + return GetSrc(inst.src[1]); + } else { + return GetSrc64(inst.src[1]); + } + }(); const IR::U32 offset = ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data); + const T original_val = [&] -> T { + switch (op) { + case AtomicOp::Add: + return ir.SharedAtomicIAdd(addr_offset, data, is_gds); + case AtomicOp::Umin: + return ir.SharedAtomicIMin(addr_offset, data, false, is_gds); + case AtomicOp::Smin: + return ir.SharedAtomicIMin(addr_offset, data, true, is_gds); + case AtomicOp::Umax: + return ir.SharedAtomicIMax(addr_offset, data, false, is_gds); + case AtomicOp::Smax: + return ir.SharedAtomicIMax(addr_offset, data, true, is_gds); + case AtomicOp::And: + return ir.SharedAtomicAnd(addr_offset, data, is_gds); + case AtomicOp::Or: + return ir.SharedAtomicOr(addr_offset, data, is_gds); + case AtomicOp::Xor: + return ir.SharedAtomicXor(addr_offset, data, is_gds); + case AtomicOp::Sub: + return ir.SharedAtomicISub(addr_offset, data, is_gds); + case AtomicOp::Inc: + return ir.SharedAtomicInc(addr_offset, is_gds); + case AtomicOp::Dec: + return ir.SharedAtomicDec(addr_offset, is_gds); + default: + UNREACHABLE(); + } + }(); if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_ADD_U64(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U64 data{GetSrc64(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data); - if (rtn) { - SetDst64(inst.dst[0], IR::U64{original_val}); - } -} - -void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, is_signed); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, is_signed); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_AND_B32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicAnd(addr_offset, data); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_OR_B32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicOr(addr_offset, data); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_XOR_B32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicXor(addr_offset, data); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); + if constexpr (std::is_same_v) { + SetDst(inst.dst[0], original_val); + } else { + SetDst64(inst.dst[0], original_val); + } } } void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst) { + const bool is_gds = inst.control.ds.gds; const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg data0{inst.src[1].code}; const IR::VectorReg data1{inst.src[2].code}; @@ -220,33 +190,85 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1))), - addr0); + addr0, is_gds); } else if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data0), addr0); + ir.WriteShared(32, ir.GetVectorReg(data0), addr0, is_gds); } else if (bit_size == 16) { - ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); if (bit_size == 64) { ir.WriteShared(64, ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1))), - addr1); + addr1, is_gds); } else if (bit_size == 32) { - ir.WriteShared(32, ir.GetVectorReg(data1), addr1); + ir.WriteShared(32, ir.GetVectorReg(data1), addr1, is_gds); } else if (bit_size == 16) { - ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1); + ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1, is_gds); } } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); if (bit_size == 64) { const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); + ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0, is_gds); } else if (bit_size == 32) { - ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); + ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0, is_gds); } else if (bit_size == 16) { - ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0); + ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds); + } + } +} + +void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, + const GcnInst& inst) { + const bool is_gds = inst.control.ds.gds; + const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; + IR::VectorReg dst_reg{inst.dst[0].code}; + const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; + if (info.stage == Stage::Fragment) { + ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, + "Unexpected shared memory offset alignment: {}", offset); + ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset))); + return; + } + if (is_pair) { + // Pair loads are either 32 or 64-bit + const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); + const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0, is_gds); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data0}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data0}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); + } + const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); + const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1, is_gds); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data1}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg++, IR::U32{data1}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); + } + } else { + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); + const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0, is_gds); + if (bit_size == 64) { + const auto vector = ir.UnpackUint2x32(IR::U64{data}); + ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 32) { + ir.SetVectorReg(dst_reg, IR::U32{data}); + } else if (bit_size == 16) { + ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); } } } @@ -263,91 +285,6 @@ void Translator::DS_SWIZZLE_B32(const GcnInst& inst) { SetDst(inst.dst[0], ir.QuadShuffle(src, index)); } -void Translator::DS_INC_U32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicInc(addr_offset); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_DEC_U32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicDec(addr_offset); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_SUB_U32(const GcnInst& inst, bool rtn) { - const IR::U32 addr{GetSrc(inst.src[0])}; - const IR::U32 data{GetSrc(inst.src[1])}; - const IR::U32 offset = - ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); - const IR::U32 addr_offset = ir.IAdd(addr, offset); - const IR::Value original_val = ir.SharedAtomicISub(addr_offset, data); - if (rtn) { - SetDst(inst.dst[0], IR::U32{original_val}); - } -} - -void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, - const GcnInst& inst) { - const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; - IR::VectorReg dst_reg{inst.dst[0].code}; - const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0; - if (info.stage == Stage::Fragment) { - ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0, - "Unexpected shared memory offset alignment: {}", offset); - ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset))); - return; - } - if (is_pair) { - // Pair loads are either 32 or 64-bit - const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); - const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data0}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data0}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); - } - const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); - const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data1}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data1}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); - } - } else { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data}); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg, IR::U32{data}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); - } - } -} - void Translator::DS_APPEND(const GcnInst& inst) { const u32 inst_offset = (u32(inst.control.ds.offset1) << 8u) + inst.control.ds.offset0; const IR::U32 gds_offset = ir.IAdd(ir.GetM0(), ir.Imm32(inst_offset)); diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index b5bfec344..4b5ff827b 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -270,21 +270,13 @@ public: // Data share // DS - void DS_ADD_U32(const GcnInst& inst, bool rtn); - void DS_ADD_U64(const GcnInst& inst, bool rtn); - void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn); - void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn); + template + void DS_OP(const GcnInst& inst, AtomicOp op, bool rtn); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); - void DS_SWIZZLE_B32(const GcnInst& inst); - void DS_AND_B32(const GcnInst& inst, bool rtn); - void DS_OR_B32(const GcnInst& inst, bool rtn); - void DS_XOR_B32(const GcnInst& inst, bool rtn); void DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); + void DS_SWIZZLE_B32(const GcnInst& inst); void DS_APPEND(const GcnInst& inst); void DS_CONSUME(const GcnInst& inst); - void DS_SUB_U32(const GcnInst& inst, bool rtn); - void DS_INC_U32(const GcnInst& inst, bool rtn); - void DS_DEC_U32(const GcnInst& inst, bool rtn); // Buffer Memory // MUBUF / MTBUF diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 74c7ec601..017c77fb0 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -565,7 +565,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { } // v_mbcnt_hi_u32_b32 vX, exec_hi, 0/vZ if ((inst.src[0].field == OperandField::ExecHi || - inst.src[0].field == OperandField::VccHi) && + inst.src[0].field == OperandField::VccHi || + inst.src[0].field == OperandField::ScalarGPR) && (inst.src[1].field == OperandField::ConstZero || inst.src[1].field == OperandField::VectorGPR)) { return SetDst(inst.dst[0], GetSrc(inst.src[1])); @@ -579,7 +580,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { } // v_mbcnt_lo_u32_b32 vY, exec_lo, vX // used combined with above for append buffer indexing. - if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo) { + if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo || + inst.src[0].field == OperandField::ScalarGPR) { return SetDst(inst.dst[0], GetSrc(inst.src[1])); } UNREACHABLE(); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 2334777ed..b88e1a17d 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -291,78 +291,137 @@ void IREmitter::SetPatch(Patch patch, const F32& value) { Inst(Opcode::SetPatch, patch, value); } -Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { +Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset, bool is_gds) { switch (bit_size) { case 16: - return Inst(Opcode::LoadSharedU16, offset); + return Inst(Opcode::LoadSharedU16, Flags{is_gds}, offset); case 32: - return Inst(Opcode::LoadSharedU32, offset); + return Inst(Opcode::LoadSharedU32, Flags{is_gds}, offset); case 64: - return Inst(Opcode::LoadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, Flags{is_gds}, offset); default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } } -void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) { +void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds) { switch (bit_size) { case 16: - Inst(Opcode::WriteSharedU16, offset, value); + Inst(Opcode::WriteSharedU16, Flags{is_gds}, offset, value); break; case 32: - Inst(Opcode::WriteSharedU32, offset, value); + Inst(Opcode::WriteSharedU32, Flags{is_gds}, offset, value); break; case 64: - Inst(Opcode::WriteSharedU64, offset, value); + Inst(Opcode::WriteSharedU64, Flags{is_gds}, offset, value); break; default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } } -U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) { +U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds) { switch (data.Type()) { case Type::U32: - return Inst(Opcode::SharedAtomicIAdd32, address, data); + return Inst(Opcode::SharedAtomicIAdd32, Flags{is_gds}, address, data); case Type::U64: - return Inst(Opcode::SharedAtomicIAdd64, address, data); + return Inst(Opcode::SharedAtomicIAdd64, Flags{is_gds}, address, data); default: ThrowInvalidType(data.Type()); } } -U32 IREmitter::SharedAtomicIMin(const U32& address, const U32& data, bool is_signed) { - return is_signed ? Inst(Opcode::SharedAtomicSMin32, address, data) - : Inst(Opcode::SharedAtomicUMin32, address, data); +U32U64 IREmitter::SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed, + bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SharedAtomicSMin32 : Opcode::SharedAtomicUMin32, + Flags{is_gds}, address, data); + case Type::U64: + return Inst(is_signed ? Opcode::SharedAtomicSMin64 : Opcode::SharedAtomicUMin64, + Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicIMax(const U32& address, const U32& data, bool is_signed) { - return is_signed ? Inst(Opcode::SharedAtomicSMax32, address, data) - : Inst(Opcode::SharedAtomicUMax32, address, data); +U32U64 IREmitter::SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed, + bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SharedAtomicSMax32 : Opcode::SharedAtomicUMax32, + Flags{is_gds}, address, data); + case Type::U64: + return Inst(is_signed ? Opcode::SharedAtomicSMax64 : Opcode::SharedAtomicUMax64, + Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicAnd(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicAnd32, address, data); +U32U64 IREmitter::SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicAnd32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicAnd64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicOr(const U32& address, const U32& data) { +U32U64 IREmitter::SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicAnd32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicAnd64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } return Inst(Opcode::SharedAtomicOr32, address, data); } -U32 IREmitter::SharedAtomicXor(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicXor32, address, data); +U32U64 IREmitter::SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicXor32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicXor64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicInc(const U32& address) { - return Inst(Opcode::SharedAtomicInc32, address); +U32U64 IREmitter::SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicISub32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicISub64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicDec(const U32& address) { - return Inst(Opcode::SharedAtomicDec32, address); +template <> +U32 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicInc32, Flags{is_gds}, address); } -U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicISub32, address, data); +template <> +U64 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicInc64, Flags{is_gds}, address); +} + +template <> +U32 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicDec32, Flags{is_gds}, address); +} + +template <> +U64 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicDec64, Flags{is_gds}, address); } U32 IREmitter::ReadConst(const Value& base, const U32& offset) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 1c5a8f545..d9e5aab7a 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -96,18 +96,24 @@ public: [[nodiscard]] F32 GetPatch(Patch patch); void SetPatch(Patch patch, const F32& value); - [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); - void WriteShared(int bit_size, const Value& value, const U32& offset); + [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset, + bool is_gds = false); + void WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds = false); - [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data); - [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed); - [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed); - [[nodiscard]] U32 SharedAtomicInc(const U32& address); - [[nodiscard]] U32 SharedAtomicDec(const U32& address); - [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data); + [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed, + bool is_gds); + [[nodiscard]] U32U64 SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed, + bool is_gds); + [[nodiscard]] U32U64 SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds); + + template + [[nodiscard]] T SharedAtomicInc(const U32& address, bool is_gds); + template + [[nodiscard]] T SharedAtomicDec(const U32& address, bool is_gds); [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 84bdb5739..eaab05cb7 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -92,7 +92,6 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::WriteSharedU32: case Opcode::WriteSharedU64: case Opcode::SharedAtomicIAdd32: - case Opcode::SharedAtomicIAdd64: case Opcode::SharedAtomicISub32: case Opcode::SharedAtomicSMin32: case Opcode::SharedAtomicUMin32: @@ -103,6 +102,17 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::SharedAtomicAnd32: case Opcode::SharedAtomicOr32: case Opcode::SharedAtomicXor32: + case Opcode::SharedAtomicIAdd64: + case Opcode::SharedAtomicISub64: + case Opcode::SharedAtomicSMin64: + case Opcode::SharedAtomicUMin64: + case Opcode::SharedAtomicSMax64: + case Opcode::SharedAtomicUMax64: + case Opcode::SharedAtomicInc64: + case Opcode::SharedAtomicDec64: + case Opcode::SharedAtomicAnd64: + case Opcode::SharedAtomicOr64: + case Opcode::SharedAtomicXor64: case Opcode::ImageWrite: case Opcode::ImageAtomicIAdd32: case Opcode::ImageAtomicSMin32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 553e63f3e..08dcec458 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -41,15 +41,25 @@ OPCODE(WriteSharedU64, Void, U32, OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) OPCODE(SharedAtomicISub32, U32, U32, U32, ) +OPCODE(SharedAtomicISub64, U64, U32, U64, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) +OPCODE(SharedAtomicSMin64, U64, U32, U64, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) +OPCODE(SharedAtomicUMin64, U64, U32, U64, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) +OPCODE(SharedAtomicSMax64, U64, U32, U64, ) OPCODE(SharedAtomicUMax32, U32, U32, U32, ) +OPCODE(SharedAtomicUMax64, U64, U32, U64, ) OPCODE(SharedAtomicInc32, U32, U32, ) +OPCODE(SharedAtomicInc64, U64, U32, ) OPCODE(SharedAtomicDec32, U32, U32, ) +OPCODE(SharedAtomicDec64, U64, U32, ) OPCODE(SharedAtomicAnd32, U32, U32, U32, ) +OPCODE(SharedAtomicAnd64, U64, U32, U64, ) OPCODE(SharedAtomicOr32, U32, U32, U32, ) +OPCODE(SharedAtomicOr64, U64, U32, U64, ) OPCODE(SharedAtomicXor32, U32, U32, U32, ) +OPCODE(SharedAtomicXor64, U64, U32, U64, ) // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index f3972769c..e5a4beb8b 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -84,8 +84,42 @@ bool IsBufferInstruction(const IR::Inst& inst) { } bool IsDataRingInstruction(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::DataAppend || - inst.GetOpcode() == IR::Opcode::DataConsume; + switch (inst.GetOpcode()) { + case IR::Opcode::DataAppend: + case IR::Opcode::DataConsume: + return true; + case IR::Opcode::LoadSharedU16: + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMin32: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr32: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicXor64: + case IR::Opcode::SharedAtomicISub32: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicInc32: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec32: + case IR::Opcode::SharedAtomicDec64: + return inst.Flags(); // is_gds + default: + return false; + } } IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { @@ -507,7 +541,8 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& } } -void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { +void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info, + Descriptors& descriptors) { const u32 binding = descriptors.Add(BufferResource{ .used_types = IR::Type::U32, .inline_cbuf = AmdGpu::Buffer::Null(), @@ -515,37 +550,111 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto .is_written = true, }); - const auto pred = [](const IR::Inst* inst) -> std::optional { - if (inst->GetOpcode() == IR::Opcode::GetUserData) { - return inst; - } - return std::nullopt; - }; - - // Attempt to deduce the GDS address of counter at compile time. - u32 gds_addr = 0; - const IR::Value& gds_offset = inst.Arg(0); - if (gds_offset.IsImmediate()) { - // Nothing to do, offset is known. - gds_addr = gds_offset.U32() & 0xFFFF; - } else { - const auto result = IR::BreadthFirstSearch(&inst, pred); - ASSERT_MSG(result, "Unable to track M0 source"); - - // M0 must be set by some user data register. - const IR::Inst* prod = gds_offset.InstRecursive(); - const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg()); - u32 m0_val = info.user_data[ud_reg] >> 16; - if (prod->GetOpcode() == IR::Opcode::IAdd32) { - m0_val += prod->Arg(1).U32(); - } - gds_addr = m0_val & 0xFFFF; - } - - // Patch instruction. IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - inst.SetArg(0, ir.Imm32(gds_addr >> 2)); - inst.SetArg(1, ir.Imm32(binding)); + + // For data append/consume operations attempt to deduce the GDS address. + if (inst.GetOpcode() == IR::Opcode::DataAppend || inst.GetOpcode() == IR::Opcode::DataConsume) { + const auto pred = [](const IR::Inst* inst) -> std::optional { + if (inst->GetOpcode() == IR::Opcode::GetUserData) { + return inst; + } + return std::nullopt; + }; + + u32 gds_addr = 0; + const IR::Value& gds_offset = inst.Arg(0); + if (gds_offset.IsImmediate()) { + // Nothing to do, offset is known. + gds_addr = gds_offset.U32() & 0xFFFF; + } else { + const auto result = IR::BreadthFirstSearch(&inst, pred); + ASSERT_MSG(result, "Unable to track M0 source"); + + // M0 must be set by some user data register. + const IR::Inst* prod = gds_offset.InstRecursive(); + const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg()); + u32 m0_val = info.user_data[ud_reg] >> 16; + if (prod->GetOpcode() == IR::Opcode::IAdd32) { + m0_val += prod->Arg(1).U32(); + } + gds_addr = m0_val & 0xFFFF; + } + + // Patch instruction. + inst.SetArg(0, ir.Imm32(gds_addr >> 2)); + inst.SetArg(1, ir.Imm32(binding)); + } else { + // Convert shared memory opcode to storage buffer atomic to GDS buffer. + const IR::U32 offset = IR::U32{inst.Arg(0)}; + const IR::U32 address_words = ir.ShiftRightLogical(offset, ir.Imm32(1)); + const IR::U32 address_dwords = ir.ShiftRightLogical(offset, ir.Imm32(2)); + const IR::U32 address_qwords = ir.ShiftRightLogical(offset, ir.Imm32(3)); + const IR::U32 handle = ir.Imm32(binding); + switch (inst.GetOpcode()) { + case IR::Opcode::SharedAtomicIAdd32: + inst.ReplaceUsesWith(ir.BufferAtomicIAdd(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicIAdd64: + inst.ReplaceUsesWith( + ir.BufferAtomicIAdd(handle, address_qwords, IR::U64{inst.Arg(1)}, {})); + break; + case IR::Opcode::SharedAtomicISub32: + inst.ReplaceUsesWith(ir.BufferAtomicISub(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicSMin32: + case IR::Opcode::SharedAtomicUMin32: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; + inst.ReplaceUsesWith( + ir.BufferAtomicIMin(handle, address_dwords, inst.Arg(1), is_signed, {})); + break; + } + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicUMax32: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; + inst.ReplaceUsesWith( + ir.BufferAtomicIMax(handle, address_dwords, inst.Arg(1), is_signed, {})); + break; + } + case IR::Opcode::SharedAtomicInc32: + inst.ReplaceUsesWith(ir.BufferAtomicInc(handle, address_dwords, {})); + break; + case IR::Opcode::SharedAtomicDec32: + inst.ReplaceUsesWith(ir.BufferAtomicDec(handle, address_dwords, {})); + break; + case IR::Opcode::SharedAtomicAnd32: + inst.ReplaceUsesWith(ir.BufferAtomicAnd(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicOr32: + inst.ReplaceUsesWith(ir.BufferAtomicOr(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicXor32: + inst.ReplaceUsesWith(ir.BufferAtomicXor(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::LoadSharedU16: + inst.ReplaceUsesWith(ir.LoadBufferU16(handle, address_words, {})); + break; + case IR::Opcode::LoadSharedU32: + inst.ReplaceUsesWith(ir.LoadBufferU32(1, handle, address_dwords, {})); + break; + case IR::Opcode::LoadSharedU64: + inst.ReplaceUsesWith(ir.LoadBufferU64(handle, address_qwords, {})); + break; + case IR::Opcode::WriteSharedU16: + ir.StoreBufferU16(handle, address_words, IR::U16{inst.Arg(1)}, {}); + inst.Invalidate(); + break; + case IR::Opcode::WriteSharedU32: + ir.StoreBufferU32(1, handle, address_dwords, inst.Arg(1), {}); + inst.Invalidate(); + break; + case IR::Opcode::WriteSharedU64: + ir.StoreBufferU64(handle, address_qwords, IR::U64{inst.Arg(1)}, {}); + inst.Invalidate(); + break; + default: + UNREACHABLE(); + } + } } IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info, @@ -916,8 +1025,6 @@ void ResourceTrackingPass(IR::Program& program) { PatchBufferSharp(*block, inst, info, descriptors); } else if (IsImageInstruction(inst)) { PatchImageSharp(*block, inst, info, descriptors); - } else if (IsDataRingInstruction(inst)) { - PatchDataRingAccess(*block, inst, info, descriptors); } } } @@ -929,6 +1036,8 @@ void ResourceTrackingPass(IR::Program& program) { PatchBufferArgs(*block, inst, info); } else if (IsImageInstruction(inst)) { PatchImageArgs(*block, inst, info); + } else if (IsDataRingInstruction(inst)) { + PatchGlobalDataShareAccess(*block, inst, info, descriptors); } } } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index a87dceb0a..079827866 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -55,6 +55,16 @@ void Visit(Info& info, const IR::Inst& inst) { info.shared_types |= IR::Type::U32; break; case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: info.uses_shared_int64_atomics = true; [[fallthrough]]; case IR::Opcode::LoadSharedU64: diff --git a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp index 0f80a3b28..555fd505b 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp @@ -15,6 +15,16 @@ static bool Requires16BitSharedAtomic(const IR::Inst& inst) { static bool Requires64BitSharedAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: return true; default: return false; diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index a6900e180..b84011acc 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -17,7 +17,6 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicIAdd32: - case IR::Opcode::SharedAtomicIAdd64: case IR::Opcode::SharedAtomicISub32: case IR::Opcode::SharedAtomicSMin32: case IR::Opcode::SharedAtomicUMin32: @@ -28,6 +27,17 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: return true; default: return false; @@ -64,6 +74,16 @@ IR::Type CalculateSharedMemoryTypes(IR::Program& program) { case IR::Opcode::LoadSharedU64: case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: used_types |= IR::Type::U64; break; default: @@ -119,19 +139,26 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicISub32: + case IR::Opcode::SharedAtomicISub64: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicISub(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicSMin32: - case IR::Opcode::SharedAtomicUMin32: { - const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; + case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32 || + inst.GetOpcode() == IR::Opcode::SharedAtomicSMin64; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {})); continue; } case IR::Opcode::SharedAtomicSMax32: - case IR::Opcode::SharedAtomicUMax32: { - const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32 || + inst.GetOpcode() == IR::Opcode::SharedAtomicSMax64; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {})); continue; @@ -143,12 +170,15 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {})); continue; case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicAnd64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicOr32: + case IR::Opcode::SharedAtomicOr64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicXor64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::LoadSharedU16: @@ -173,7 +203,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ inst.Invalidate(); break; default: - break; + UNREACHABLE(); } } } diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index c1110e54d..28444ac60 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -48,6 +48,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s memory_tracker = std::make_unique(tracker); + std::memset(gds_buffer.mapped_data.data(), 0, DataShareBufferSize); + // Ensure the first slot is used for the null buffer const auto null_id = slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16); From ee97c5c1109d3a09a964f8ccbae12389ac4efdb4 Mon Sep 17 00:00:00 2001 From: kalaposfos13 <153381648+kalaposfos13@users.noreply.github.com> Date: Thu, 10 Jul 2025 12:53:38 +0200 Subject: [PATCH 16/22] Define S_TRAP as InstCategory::FlowControl (#3223) --- src/shader_recompiler/frontend/format.cpp | 2 +- src/shader_recompiler/frontend/translate/scalar_flow.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/shader_recompiler/frontend/format.cpp b/src/shader_recompiler/frontend/format.cpp index 52c8c733e..6c4427e5f 100644 --- a/src/shader_recompiler/frontend/format.cpp +++ b/src/shader_recompiler/frontend/format.cpp @@ -397,7 +397,7 @@ constexpr std::array InstructionFormatSOPP = {{ // 17 = S_SENDMSGHALT {InstClass::ScalarProgFlow, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any}, // 18 = S_TRAP - {InstClass::Undefined, InstCategory::Undefined, 0, 1, ScalarType::Any, ScalarType::Any}, + {InstClass::Undefined, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any}, // 19 = S_ICACHE_INV {InstClass::ScalarCache, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any}, // 20 = S_INCPERFLEVEL diff --git a/src/shader_recompiler/frontend/translate/scalar_flow.cpp b/src/shader_recompiler/frontend/translate/scalar_flow.cpp index cd1cf51f0..7b57d89ca 100644 --- a/src/shader_recompiler/frontend/translate/scalar_flow.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_flow.cpp @@ -16,6 +16,9 @@ void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) { case Opcode::S_SETPRIO: LOG_WARNING(Render_Vulkan, "S_SETPRIO instruction!"); return; + case Opcode::S_TRAP: + LOG_WARNING(Render_Vulkan, "S_TRAP instruction!"); + return; case Opcode::S_GETPC_B64: return S_GETPC_B64(pc, inst); case Opcode::S_SETPC_B64: From 88abb93669a560ab8914968d4696f6011317babf Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 14:19:44 +0300 Subject: [PATCH 17/22] ir_passes: Fold readlane with ff1 pattern (#3224) --- .../backend/spirv/emit_spirv_instructions.h | 2 +- .../backend/spirv/emit_spirv_warp.cpp | 5 ++--- .../ir/passes/readlane_elimination_pass.cpp | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 8a0c586e9..f3dd9b2ea 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -529,7 +529,7 @@ Id EmitLaneId(EmitContext& ctx); Id EmitWarpId(EmitContext& ctx); Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); Id EmitReadFirstLane(EmitContext& ctx, Id value); -Id EmitReadLane(EmitContext& ctx, Id value, u32 lane); +Id EmitReadLane(EmitContext& ctx, Id value, Id lane); Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane); Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding); Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 2d13d09f0..20fb83fa6 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -26,9 +26,8 @@ Id EmitReadFirstLane(EmitContext& ctx, Id value) { return ctx.OpGroupNonUniformBroadcastFirst(ctx.U32[1], SubgroupScope(ctx), value); } -Id EmitReadLane(EmitContext& ctx, Id value, u32 lane) { - return ctx.OpGroupNonUniformBroadcast(ctx.U32[1], SubgroupScope(ctx), value, - ctx.ConstU32(lane)); +Id EmitReadLane(EmitContext& ctx, Id value, Id lane) { + return ctx.OpGroupNonUniformBroadcast(ctx.U32[1], SubgroupScope(ctx), value, lane); } Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane) { diff --git a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp index 9c5f64f84..3378d785f 100644 --- a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp +++ b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp @@ -95,6 +95,20 @@ void ReadLaneEliminationPass(IR::Program& program) { if (inst.GetOpcode() != IR::Opcode::ReadLane) { continue; } + + // Check for the following pattern and replace it with ReadFirstLane + // s_ff1_i32_b64 sgpr, exec + // v_readlane_b32 sdst, vgpr, sgpr + if (const auto lane = inst.Arg(1); !lane.IsImmediate()) { + if (lane.InstRecursive()->GetOpcode() == IR::Opcode::FindILsb64) { + const auto value = inst.Arg(0); + inst.ReplaceOpcode(IR::Opcode::ReadFirstLane); + inst.ClearArgs(); + inst.SetArg(0, value); + } + continue; + } + const u32 lane = inst.Arg(1).U32(); IR::Inst* prod = inst.Arg(0).InstRecursive(); From 8bc30270c853635885fffdca9f2ef757ea7ef484 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 21:52:56 +0300 Subject: [PATCH 18/22] shader_recompiler: Implement ff1 with subgroup ops (#3225) --- externals/sirit | 2 +- .../backend/spirv/emit_spirv_instructions.h | 2 ++ .../backend/spirv/emit_spirv_warp.cpp | 8 ++++++++ .../frontend/translate/scalar_alu.cpp | 5 +++-- src/shader_recompiler/ir/ir_emitter.cpp | 8 ++++++++ src/shader_recompiler/ir/ir_emitter.h | 2 ++ src/shader_recompiler/ir/opcodes.inc | 2 ++ .../ir/passes/readlane_elimination_pass.cpp | 12 +----------- 8 files changed, 27 insertions(+), 14 deletions(-) diff --git a/externals/sirit b/externals/sirit index 6b450704f..b4eccb336 160000 --- a/externals/sirit +++ b/externals/sirit @@ -1 +1 @@ -Subproject commit 6b450704f6fedb9413d0c89a9eb59d028eb1e6c0 +Subproject commit b4eccb336f1b1169af48dac1e04015985af86e3e diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index f3dd9b2ea..74c94754d 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -531,6 +531,8 @@ Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); Id EmitReadFirstLane(EmitContext& ctx, Id value); Id EmitReadLane(EmitContext& ctx, Id value, Id lane); Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane); +Id EmitBallot(EmitContext& ctx, Id bit); +Id EmitBallotFindLsb(EmitContext& ctx, Id mask); Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding); Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 20fb83fa6..951c76001 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -34,4 +34,12 @@ Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane) { return ctx.u32_zero_value; } +Id EmitBallot(EmitContext& ctx, Id bit) { + return ctx.OpGroupNonUniformBallot(ctx.U32[4], SubgroupScope(ctx), bit); +} + +Id EmitBallotFindLsb(EmitContext& ctx, Id mask) { + return ctx.OpGroupNonUniformBallotFindLSB(ctx.U32[1], SubgroupScope(ctx), mask); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 276b55567..e3134c300 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -680,8 +680,9 @@ void Translator::S_FF1_I32_B32(const GcnInst& inst) { } void Translator::S_FF1_I32_B64(const GcnInst& inst) { - const IR::U64 src0{GetSrc64(inst.src[0])}; - const IR::U32 result{ir.FindILsb(src0)}; + ASSERT(inst.src[0].field == OperandField::ScalarGPR); + const IR::U32 result{ + ir.BallotFindLsb(ir.Ballot(ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code))))}; SetDst(inst.dst[0], result); } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index b88e1a17d..4997145d7 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -660,6 +660,14 @@ U32 IREmitter::WriteLane(const U32& value, const U32& write_value, const U32& la return Inst(Opcode::WriteLane, value, write_value, lane); } +Value IREmitter::Ballot(const U1& bit) { + return Inst(Opcode::Ballot, bit); +} + +U32 IREmitter::BallotFindLsb(const Value& mask) { + return Inst(Opcode::BallotFindLsb, mask); +} + F32F64 IREmitter::FPAdd(const F32F64& a, const F32F64& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index d9e5aab7a..6055df565 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -176,6 +176,8 @@ public: [[nodiscard]] U32 ReadFirstLane(const U32& value); [[nodiscard]] U32 ReadLane(const U32& value, const U32& lane); [[nodiscard]] U32 WriteLane(const U32& value, const U32& write_value, const U32& lane); + [[nodiscard]] Value Ballot(const U1& bit); + [[nodiscard]] U32 BallotFindLsb(const Value& mask); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const Value& e3); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 08dcec458..747a27e35 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -472,5 +472,7 @@ OPCODE(QuadShuffle, U32, U32, OPCODE(ReadFirstLane, U32, U32, ) OPCODE(ReadLane, U32, U32, U32 ) OPCODE(WriteLane, U32, U32, U32, U32 ) +OPCODE(Ballot, U32x4, U1, ) +OPCODE(BallotFindLsb, U32, U32x4, ) OPCODE(DataAppend, U32, U32, U32 ) OPCODE(DataConsume, U32, U32, U32 ) diff --git a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp index 3378d785f..d6586bda0 100644 --- a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp +++ b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp @@ -95,17 +95,7 @@ void ReadLaneEliminationPass(IR::Program& program) { if (inst.GetOpcode() != IR::Opcode::ReadLane) { continue; } - - // Check for the following pattern and replace it with ReadFirstLane - // s_ff1_i32_b64 sgpr, exec - // v_readlane_b32 sdst, vgpr, sgpr - if (const auto lane = inst.Arg(1); !lane.IsImmediate()) { - if (lane.InstRecursive()->GetOpcode() == IR::Opcode::FindILsb64) { - const auto value = inst.Arg(0); - inst.ReplaceOpcode(IR::Opcode::ReadFirstLane); - inst.ClearArgs(); - inst.SetArg(0, value); - } + if (!inst.Arg(1).IsImmediate()) { continue; } From b403e1be339b55dd7ab3801e939e5ecd833da015 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Thu, 10 Jul 2025 22:14:02 +0300 Subject: [PATCH 19/22] vk_rasterizer: Set render area to max when no framebuffers are bound (#3227) --- src/video_core/renderer_vulkan/vk_instance.h | 12 +++++++++++- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 2 ++ src/video_core/renderer_vulkan/vk_scheduler.cpp | 7 +------ src/video_core/renderer_vulkan/vk_scheduler.h | 4 ++-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index c9e354186..830b1d5c2 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -324,11 +324,21 @@ public: return properties.limits.maxViewportDimensions[0]; } - /// Returns the maximum viewport height. + /// Returns the maximum viewport height. u32 GetMaxViewportHeight() const { return properties.limits.maxViewportDimensions[1]; } + /// Returns the maximum render area width. + u32 GetMaxFramebufferWidth() const { + return properties.limits.maxFramebufferWidth; + } + + /// Returns the maximum render area height. + u32 GetMaxFramebufferHeight() const { + return properties.limits.maxFramebufferHeight; + } + /// Returns the sample count flags supported by framebuffers. vk::SampleCountFlags GetFramebufferSampleCounts() const { return properties.limits.framebufferColorSampleCounts & diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index cca193831..5d0a14ce3 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -113,6 +113,8 @@ RenderState Rasterizer::PrepareRenderState(u32 mrt_mask) { // Prefetch color and depth buffers to let texture cache handle possible overlaps with bound // textures (e.g. mipgen) RenderState state; + state.width = instance.GetMaxFramebufferWidth(); + state.height = instance.GetMaxFramebufferHeight(); cb_descs.clear(); db_desc.reset(); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 4c4e17fe4..ac645c9ce 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -34,16 +34,11 @@ void Scheduler::BeginRendering(const RenderState& new_state) { is_rendering = true; render_state = new_state; - const auto width = - render_state.width != std::numeric_limits::max() ? render_state.width : 1; - const auto height = - render_state.height != std::numeric_limits::max() ? render_state.height : 1; - const vk::RenderingInfo rendering_info = { .renderArea = { .offset = {0, 0}, - .extent = {width, height}, + .extent = {render_state.width, render_state.height}, }, .layerCount = 1, .colorAttachmentCount = render_state.num_color_attachments, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index bd6fb549a..b5678edbc 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -26,8 +26,8 @@ struct RenderState { u32 num_color_attachments{}; bool has_depth{}; bool has_stencil{}; - u32 width = std::numeric_limits::max(); - u32 height = std::numeric_limits::max(); + u32 width{}; + u32 height{}; bool operator==(const RenderState& other) const noexcept { return std::memcmp(this, &other, sizeof(RenderState)) == 0; From 399a725343db2c8b07de94ec95789025bc91dd5c Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 14 Jul 2025 00:32:02 +0300 Subject: [PATCH 20/22] shader_recompiler: Replace buffer pulling with attribute divisor for instance step rates (#3238) * shader_recompiler: Replace buffer pulling with attribute divisor for instance step rates * flatten_extended_userdata: Remove special step rate buffer handling * Review comments * spirv_emit_context: Name all instance rate attribs properly * spirv: Merge ReadConstBuffer again template function only has 1 user now * attribute: Add missing attributes * translate: Reimplement step rate instance id * Resolve validation warnings * shader_recompiler: Separate vertex inputs from LS stage, cleanup tess --- .../spirv/emit_spirv_context_get_set.cpp | 128 +++++++----------- .../backend/spirv/emit_spirv_instructions.h | 2 +- .../backend/spirv/spirv_emit_context.cpp | 74 ++++------ .../backend/spirv/spirv_emit_context.h | 1 - src/shader_recompiler/frontend/fetch_shader.h | 12 -- .../frontend/translate/translate.cpp | 63 +++++---- src/shader_recompiler/info.h | 14 +- src/shader_recompiler/ir/attribute.cpp | 22 ++- src/shader_recompiler/ir/attribute.h | 2 - src/shader_recompiler/ir/ir_emitter.cpp | 4 +- src/shader_recompiler/ir/ir_emitter.h | 3 +- .../passes/flatten_extended_userdata_pass.cpp | 24 +--- .../ir/passes/ring_access_elimination.cpp | 11 +- src/shader_recompiler/ir/passes/srt.h | 13 +- src/shader_recompiler/runtime_info.h | 6 +- src/shader_recompiler/specialization.h | 20 ++- src/video_core/buffer_cache/buffer_cache.cpp | 5 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 46 +++++-- .../renderer_vulkan/vk_graphics_pipeline.h | 4 +- .../renderer_vulkan/vk_instance.cpp | 4 + .../renderer_vulkan/vk_pipeline_cache.cpp | 19 +-- .../renderer_vulkan/vk_rasterizer.cpp | 5 +- 22 files changed, 208 insertions(+), 274 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index f3a8c518c..40f8d307c 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -52,7 +52,7 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { if (IR::IsParam(attr)) { const u32 attr_index{u32(attr) - u32(IR::Attribute::Param0)}; - if (ctx.stage == Stage::Local && ctx.runtime_info.ls_info.links_with_tcs) { + if (ctx.stage == Stage::Local) { const auto component_ptr = ctx.TypePointer(spv::StorageClass::Output, ctx.F32[1]); return ctx.OpAccessChain(component_ptr, ctx.output_attr_array, ctx.ConstU32(attr_index), ctx.ConstU32(element)); @@ -94,13 +94,9 @@ Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr) { if (IR::IsParam(attr)) { - if (ctx.stage == Stage::Local && ctx.runtime_info.ls_info.links_with_tcs) { - return {ctx.F32[1], false}; - } else { - const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; - const auto& info{ctx.output_params.at(index)}; - return {info.component_type, info.is_integer}; - } + const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; + const auto& info{ctx.output_params.at(index)}; + return {info.component_type, info.is_integer}; } if (IR::IsMrt(attr)) { const u32 index{u32(attr) - u32(IR::Attribute::RenderTarget0)}; @@ -120,6 +116,9 @@ std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr } } // Anonymous namespace +using PointerType = EmitContext::PointerType; +using PointerSize = EmitContext::PointerSize; + Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) { const u32 index = ctx.binding.user_data + ctx.info.ud_mask.Index(reg); const u32 half = PushData::UdRegsIndex + (index >> 2); @@ -131,41 +130,6 @@ Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) { return ud_reg; } -void EmitGetThreadBitScalarReg(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetThreadBitScalarReg(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetScalarRegister(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetScalarRegister(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetVectorRegister(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetVectorRegister(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetGotoVariable(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetGotoVariable(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -using PointerType = EmitContext::PointerType; -using PointerSize = EmitContext::PointerSize; - Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { const u32 flatbuf_off_dw = inst->Flags(); if (!Config::directMemoryAccess()) { @@ -180,39 +144,27 @@ Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { } } -template -Id ReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { +Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { const auto& buffer = ctx.buffers[handle]; if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { index = ctx.OpIAdd(ctx.U32[1], index, offset); } - const auto [id, pointer_type] = buffer.Alias(type); - const auto value_type = type == PointerType::U32 ? ctx.U32[1] : ctx.F32[1]; + const auto [id, pointer_type] = buffer.Alias(PointerType::U32); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpLoad(value_type, ptr)}; + const Id result{ctx.OpLoad(ctx.U32[1], ptr)}; if (const Id size = buffer.Size(PointerSize::B32); Sirit::ValidId(size)) { const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, size); - return ctx.OpSelect(value_type, in_bounds, result, ctx.u32_zero_value); + return ctx.OpSelect(ctx.U32[1], in_bounds, result, ctx.u32_zero_value); } return result; } -Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { - return ReadConstBuffer(ctx, handle, index); -} - -Id EmitReadStepRate(EmitContext& ctx, int rate_idx) { - const auto index{rate_idx == 0 ? PushData::Step0Index : PushData::Step1Index}; - return ctx.OpLoad( - ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]), - ctx.push_data_block, ctx.ConstU32(index))); -} - -static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { +static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { if (IR::IsPosition(attr)) { ASSERT(attr == IR::Attribute::Position0); const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, index, ctx.ConstU32(0u))}; + const auto pointer{ + ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, ctx.ConstU32(index), ctx.ConstU32(0u))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -222,7 +174,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)}; const auto param = ctx.input_params.at(param_id).id; const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)}; + const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, ctx.ConstU32(index))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -230,7 +182,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 UNREACHABLE(); } -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { if (ctx.info.l_stage == LogicalStage::Geometry) { return EmitGetAttributeForGeometry(ctx, attr, comp, index); } else if (ctx.info.l_stage == LogicalStage::TessellationControl || @@ -248,18 +200,6 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { if (IR::IsParam(attr)) { const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)}; const auto& param{ctx.input_params.at(param_index)}; - if (param.buffer_handle >= 0) { - const auto step_rate = EmitReadStepRate(ctx, param.id.value); - const auto offset = ctx.OpIAdd( - ctx.U32[1], - ctx.OpIMul( - ctx.U32[1], - ctx.OpUDiv(ctx.U32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id), step_rate), - ctx.ConstU32(param.num_components)), - ctx.ConstU32(comp)); - return ReadConstBuffer(ctx, param.buffer_handle, offset); - } - Id result; if (param.is_loaded) { // Attribute is either default or manually interpolated. The id points to an already @@ -305,10 +245,6 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { return ctx.OpLoad(ctx.U32[1], ctx.vertex_index); case IR::Attribute::InstanceId: return ctx.OpLoad(ctx.U32[1], ctx.instance_id); - case IR::Attribute::InstanceId0: - return EmitReadStepRate(ctx, 0); - case IR::Attribute::InstanceId1: - return EmitReadStepRate(ctx, 1); case IR::Attribute::WorkgroupIndex: return ctx.workgroup_index_id; case IR::Attribute::WorkgroupId: @@ -640,4 +576,36 @@ void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a UNREACHABLE_MSG("SPIR-V instruction"); } +void EmitGetThreadBitScalarReg(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetThreadBitScalarReg(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetScalarRegister(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetScalarRegister(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetVectorRegister(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetVectorRegister(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetGotoVariable(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetGotoVariable(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 74c94754d..37d5d84c9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -108,7 +108,7 @@ Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addres Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id cmp_value); -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index); +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp); Id EmitGetTessGenericAttribute(EmitContext& ctx, Id vertex_index, Id attr_index, Id comp_index); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 6a731d05c..e16bba755 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -377,35 +377,13 @@ void EmitContext::DefineInputs() { ASSERT(attrib.semantic < IR::NumParams); const auto sharp = attrib.GetSharp(info); const Id type{GetAttributeType(*this, sharp.GetNumberFmt())[4]}; - if (attrib.UsesStepRates()) { - const u32 rate_idx = - attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::OverStepRate0 ? 0 - : 1; - const u32 num_components = AmdGpu::NumComponents(sharp.GetDataFmt()); - const auto buffer = - std::ranges::find_if(info.buffers, [&attrib](const auto& buffer) { - return buffer.instance_attrib == attrib.semantic; - }); - // Note that we pass index rather than Id - input_params[attrib.semantic] = SpirvAttribute{ - .id = {rate_idx}, - .pointer_type = input_u32, - .component_type = U32[1], - .num_components = std::min(attrib.num_elements, num_components), - .is_integer = true, - .is_loaded = false, - .buffer_handle = int(buffer - info.buffers.begin()), - }; + Id id{DefineInput(type, attrib.semantic)}; + if (attrib.GetStepRate() != Gcn::VertexAttribute::InstanceIdType::None) { + Name(id, fmt::format("vs_instance_attr{}", attrib.semantic)); } else { - Id id{DefineInput(type, attrib.semantic)}; - if (attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::Plain) { - Name(id, fmt::format("vs_instance_attr{}", attrib.semantic)); - } else { - Name(id, fmt::format("vs_in_attr{}", attrib.semantic)); - } - input_params[attrib.semantic] = - GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false); + Name(id, fmt::format("vs_in_attr{}", attrib.semantic)); } + input_params[attrib.semantic] = GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false); } break; } @@ -573,7 +551,7 @@ void EmitContext::DefineOutputs() { cull_distances = DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output); } - if (stage == Shader::Stage::Local && runtime_info.ls_info.links_with_tcs) { + if (stage == Stage::Local) { const u32 num_attrs = Common::AlignUp(runtime_info.ls_info.ls_stride, 16) >> 4; if (num_attrs > 0) { const Id type{TypeArray(F32[4], ConstU32(num_attrs))}; @@ -700,12 +678,10 @@ void EmitContext::DefineOutputs() { void EmitContext::DefinePushDataBlock() { // Create push constants block for instance steps rates - const Id struct_type{Name(TypeStruct(U32[1], U32[1], F32[1], F32[1], F32[1], F32[1], U32[4], - U32[4], U32[4], U32[4], U32[4], U32[4], U32[2]), + const Id struct_type{Name(TypeStruct(F32[1], F32[1], F32[1], F32[1], U32[4], U32[4], U32[4], + U32[4], U32[4], U32[4], U32[2]), "AuxData")}; Decorate(struct_type, spv::Decoration::Block); - MemberName(struct_type, PushData::Step0Index, "sr0"); - MemberName(struct_type, PushData::Step1Index, "sr1"); MemberName(struct_type, PushData::XOffsetIndex, "xoffset"); MemberName(struct_type, PushData::YOffsetIndex, "yoffset"); MemberName(struct_type, PushData::XScaleIndex, "xscale"); @@ -717,19 +693,17 @@ void EmitContext::DefinePushDataBlock() { MemberName(struct_type, PushData::BufOffsetIndex + 0, "buf_offsets0"); MemberName(struct_type, PushData::BufOffsetIndex + 1, "buf_offsets1"); MemberName(struct_type, PushData::BufOffsetIndex + 2, "buf_offsets2"); - MemberDecorate(struct_type, PushData::Step0Index, spv::Decoration::Offset, 0U); - MemberDecorate(struct_type, PushData::Step1Index, spv::Decoration::Offset, 4U); - MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 8U); - MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 12U); - MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 16U); - MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 20U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 24U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 40U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 56U); - MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 72U); - MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 88U); - MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 104U); - MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 120U); + MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 0U); + MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 4U); + MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 8U); + MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 12U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 16U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 32U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 48U); + MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 64U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 80U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 96U); + MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 112U); push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant); Name(push_data_block, "push_data"); interfaces.push_back(push_data_block); @@ -763,19 +737,19 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte Decorate(id, spv::Decoration::NonWritable); } switch (buffer_type) { - case Shader::BufferType::GdsBuffer: + case BufferType::GdsBuffer: Name(id, "gds_buffer"); break; - case Shader::BufferType::Flatbuf: + case BufferType::Flatbuf: Name(id, "srt_flatbuf"); break; - case Shader::BufferType::BdaPagetable: + case BufferType::BdaPagetable: Name(id, "bda_pagetable"); break; - case Shader::BufferType::FaultBuffer: + case BufferType::FaultBuffer: Name(id, "fault_buffer"); break; - case Shader::BufferType::SharedMemory: + case BufferType::SharedMemory: Name(id, "ssbo_shmem"); break; default: diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 28e9099d8..186925706 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -361,7 +361,6 @@ public: u32 num_components; bool is_integer{}; bool is_loaded{}; - s32 buffer_handle{-1}; }; Id input_attr_array; Id output_attr_array; diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h index 837caafa0..e77925232 100644 --- a/src/shader_recompiler/frontend/fetch_shader.h +++ b/src/shader_recompiler/frontend/fetch_shader.h @@ -3,7 +3,6 @@ #pragma once -#include #include #include "common/types.h" #include "shader_recompiler/info.h" @@ -29,11 +28,6 @@ struct VertexAttribute { return static_cast(instance_data); } - [[nodiscard]] bool UsesStepRates() const { - const auto step_rate = GetStepRate(); - return step_rate == OverStepRate0 || step_rate == OverStepRate1; - } - [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Shader::Info& info) const noexcept { return info.ReadUdReg(sgpr_base, dword_offset); } @@ -52,12 +46,6 @@ struct FetchShaderData { s8 vertex_offset_sgpr = -1; ///< SGPR of vertex offset from VADDR s8 instance_offset_sgpr = -1; ///< SGPR of instance offset from VADDR - [[nodiscard]] bool UsesStepRates() const { - return std::ranges::find_if(attributes, [](const VertexAttribute& attribute) { - return attribute.UsesStepRates(); - }) != attributes.end(); - } - bool operator==(const FetchShaderData& other) const { return attributes == other.attributes && vertex_offset_sgpr == other.vertex_offset_sgpr && instance_offset_sgpr == other.instance_offset_sgpr; diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 5853f3e72..310ac9156 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -90,17 +90,40 @@ void Translator::EmitPrologue(IR::Block* first_block) { case LogicalStage::Vertex: // v0: vertex ID, always present ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::VertexId)); - // v1: instance ID, step rate 0 - if (runtime_info.num_input_vgprs > 0) { - ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId0)); - } - // v2: instance ID, step rate 1 - if (runtime_info.num_input_vgprs > 1) { - ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId1)); - } - // v3: instance ID, plain - if (runtime_info.num_input_vgprs > 2) { - ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId)); + if (info.stage == Stage::Local) { + // v1: rel patch ID + if (runtime_info.num_input_vgprs > 0) { + ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); + } + // v2: instance ID + if (runtime_info.num_input_vgprs > 1) { + ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId)); + } + } else { + // v1: instance ID, step rate 0 + if (runtime_info.num_input_vgprs > 0) { + if (runtime_info.vs_info.step_rate_0 != 0) { + ir.SetVectorReg(dst_vreg++, + ir.IDiv(ir.GetAttributeU32(IR::Attribute::InstanceId), + ir.Imm32(runtime_info.vs_info.step_rate_0))); + } else { + ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); + } + } + // v2: instance ID, step rate 1 + if (runtime_info.num_input_vgprs > 1) { + if (runtime_info.vs_info.step_rate_1 != 0) { + ir.SetVectorReg(dst_vreg++, + ir.IDiv(ir.GetAttributeU32(IR::Attribute::InstanceId), + ir.Imm32(runtime_info.vs_info.step_rate_1))); + } else { + ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); + } + } + // v3: instance ID, plain + if (runtime_info.num_input_vgprs > 2) { + ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId)); + } } break; case LogicalStage::Fragment: @@ -183,10 +206,8 @@ void Translator::EmitPrologue(IR::Block* first_block) { switch (runtime_info.gs_info.out_primitive[0]) { case AmdGpu::GsOutputPrimitiveType::TriangleStrip: ir.SetVectorReg(IR::VectorReg::V3, ir.Imm32(2u)); // vertex 2 - [[fallthrough]]; case AmdGpu::GsOutputPrimitiveType::LineStrip: ir.SetVectorReg(IR::VectorReg::V1, ir.Imm32(1u)); // vertex 1 - [[fallthrough]]; default: ir.SetVectorReg(IR::VectorReg::V0, ir.Imm32(0u)); // vertex 0 break; @@ -481,11 +502,11 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra } void Translator::EmitFetch(const GcnInst& inst) { - // Read the pointer to the fetch shader assembly. const auto code_sgpr_base = inst.src[0].code; + + // The fetch shader must be inlined to access as regular buffers, so that + // bounds checks can be emitted to emulate robust buffer access. if (!profile.supports_robust_buffer_access) { - // The fetch shader must be inlined to access as regular buffers, so that - // bounds checks can be emitted to emulate robust buffer access. const auto* code = GetFetchShaderCode(info, code_sgpr_base); GcnCodeSlice slice(code, code + std::numeric_limits::max()); GcnDecodeContext decoder; @@ -535,16 +556,6 @@ void Translator::EmitFetch(const GcnInst& inst) { for (u32 i = 0; i < 4; i++) { ir.SetVectorReg(dst_reg++, IR::F32{ir.CompositeExtract(swizzled, i)}); } - - // In case of programmable step rates we need to fallback to instance data pulling in - // shader, so VBs should be bound as regular data buffers - if (attrib.UsesStepRates()) { - info.buffers.push_back({ - .sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4), - .used_types = IR::Type::F32, - .instance_attrib = attrib.semantic, - }); - } } } diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 9703643e8..6e12c6816 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -113,17 +113,13 @@ struct FMaskResource { using FMaskResourceList = boost::container::small_vector; struct PushData { - static constexpr u32 Step0Index = 0; - static constexpr u32 Step1Index = 1; - static constexpr u32 XOffsetIndex = 2; - static constexpr u32 YOffsetIndex = 3; - static constexpr u32 XScaleIndex = 4; - static constexpr u32 YScaleIndex = 5; - static constexpr u32 UdRegsIndex = 6; + static constexpr u32 XOffsetIndex = 0; + static constexpr u32 YOffsetIndex = 1; + static constexpr u32 XScaleIndex = 2; + static constexpr u32 YScaleIndex = 3; + static constexpr u32 UdRegsIndex = 4; static constexpr u32 BufOffsetIndex = UdRegsIndex + NumUserDataRegs / 4; - u32 step0; - u32 step1; float xoffset; float yoffset; float xscale; diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp index 6a267e21b..b2f11d141 100644 --- a/src/shader_recompiler/ir/attribute.cpp +++ b/src/shader_recompiler/ir/attribute.cpp @@ -100,22 +100,36 @@ std::string NameOf(Attribute attribute) { return "Param30"; case Attribute::Param31: return "Param31"; + case Attribute::ClipDistance: + return "ClipDistanace"; + case Attribute::CullDistance: + return "CullDistance"; + case Attribute::RenderTargetId: + return "RenderTargetId"; + case Attribute::ViewportId: + return "ViewportId"; case Attribute::VertexId: return "VertexId"; - case Attribute::InstanceId: - return "InstanceId"; case Attribute::PrimitiveId: return "PrimitiveId"; - case Attribute::FragCoord: - return "FragCoord"; + case Attribute::InstanceId: + return "InstanceId"; case Attribute::IsFrontFace: return "IsFrontFace"; + case Attribute::SampleIndex: + return "SampleIndex"; + case Attribute::GlobalInvocationId: + return "GlobalInvocationId"; case Attribute::WorkgroupId: return "WorkgroupId"; + case Attribute::WorkgroupIndex: + return "WorkgroupIndex"; case Attribute::LocalInvocationId: return "LocalInvocationId"; case Attribute::LocalInvocationIndex: return "LocalInvocationIndex"; + case Attribute::FragCoord: + return "FragCoord"; case Attribute::InvocationId: return "InvocationId"; case Attribute::PatchVertices: diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h index 68472f052..b6b1c8b59 100644 --- a/src/shader_recompiler/ir/attribute.h +++ b/src/shader_recompiler/ir/attribute.h @@ -73,8 +73,6 @@ enum class Attribute : u64 { LocalInvocationId = 76, LocalInvocationIndex = 77, FragCoord = 78, - InstanceId0 = 79, // step rate 0 - InstanceId1 = 80, // step rate 1 InvocationId = 81, // TCS id in output patch and instanced geometry shader id PatchVertices = 82, TessellationEvaluationPointU = 83, diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 4997145d7..6ca86b2c0 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -255,8 +255,8 @@ void IREmitter::SetM0(const U32& value) { Inst(Opcode::SetM0, value); } -F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, IR::Value index) { - return Inst(Opcode::GetAttribute, attribute, Imm32(comp), index); +F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, u32 index) { + return Inst(Opcode::GetAttribute, attribute, Imm32(comp), Imm32(index)); } U32 IREmitter::GetAttributeU32(IR::Attribute attribute, u32 comp) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 6055df565..a105b042d 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -81,8 +81,7 @@ public: [[nodiscard]] U1 Condition(IR::Condition cond); - [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, - IR::Value index = IR::Value(u32(0u))); + [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, u32 index = 0); [[nodiscard]] U32 GetAttributeU32(Attribute attribute, u32 comp = 0); void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0); diff --git a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp index 7253e18c1..e0c99655d 100644 --- a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp +++ b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp @@ -191,7 +191,7 @@ static void VisitPointer(u32 off_dw, IR::Inst* subtree, PassInfo& pass_info, static void GenerateSrtProgram(Info& info, PassInfo& pass_info) { Xbyak::CodeGenerator& c = g_srt_codegen; - if (info.srt_info.srt_reservations.empty() && pass_info.srt_roots.empty()) { + if (pass_info.srt_roots.empty()) { return; } @@ -205,29 +205,7 @@ static void GenerateSrtProgram(Info& info, PassInfo& pass_info) { } info.srt_info.walker_func = c.getCurr(); - pass_info.dst_off_dw = NumUserDataRegs; - - // Special case for V# step rate buffers in fetch shader - for (const auto [sgpr_base, dword_offset, num_dwords] : info.srt_info.srt_reservations) { - // get pointer to V# - if (sgpr_base != IR::NumScalarRegs) { - PushPtr(c, sgpr_base); - } - u32 src_off = dword_offset << 2; - - for (auto j = 0; j < num_dwords; j++) { - c.mov(r11d, ptr[rdi + src_off]); - c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r11d); - - src_off += 4; - ++pass_info.dst_off_dw; - } - if (sgpr_base != IR::NumScalarRegs) { - PopPtr(c); - } - } - ASSERT(pass_info.dst_off_dw == info.srt_info.flattened_bufsize_dw); for (const auto& [sgpr_base, root] : pass_info.srt_roots) { diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index b292b41b9..e1e5d762c 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -33,12 +33,9 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim bool is_composite = opcode == IR::Opcode::WriteSharedU64; u32 num_components = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; - u32 offset = 0; - const auto* addr = inst.Arg(0).InstRecursive(); - if (addr->GetOpcode() == IR::Opcode::IAdd32) { - ASSERT(addr->Arg(1).IsImmediate()); - offset = addr->Arg(1).U32(); - } + ASSERT(inst.Arg(0).IsImmediate()); + + u32 offset = inst.Arg(0).U32(); IR::Value data = is_composite ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) : inst.Arg(1).Resolve(); for (s32 i = 0; i < num_components; i++) { @@ -116,7 +113,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim } const auto shl_inst = inst.Arg(1).TryInstRecursive(); - const auto vertex_id = ir.Imm32(shl_inst->Arg(0).Resolve().U32() >> 2); + const auto vertex_id = shl_inst->Arg(0).Resolve().U32() >> 2; const auto offset = inst.Arg(1).TryInstRecursive()->Arg(1); const auto bucket = offset.Resolve().U32() / 256u; const auto attrib = bucket < 4 ? IR::Attribute::Position0 diff --git a/src/shader_recompiler/ir/passes/srt.h b/src/shader_recompiler/ir/passes/srt.h index 0ddc15ea6..4dce38674 100644 --- a/src/shader_recompiler/ir/passes/srt.h +++ b/src/shader_recompiler/ir/passes/srt.h @@ -20,18 +20,7 @@ struct PersistentSrtInfo { }; PFN_SrtWalker walker_func{}; - boost::container::small_vector srt_reservations; u32 flattened_bufsize_dw = 16; // NumUserDataRegs - - // Special case for fetch shaders because we don't generate IR to read from step rate buffers, - // so we won't see usage with GetUserData/ReadConst. - // Reserve space in the flattened buffer for a sharp ahead of time - u32 ReserveSharp(u32 sgpr_base, u32 dword_offset, u32 num_dwords) { - u32 rv = flattened_bufsize_dw; - srt_reservations.emplace_back(sgpr_base, dword_offset, num_dwords); - flattened_bufsize_dw += num_dwords; - return rv; - } }; -} // namespace Shader \ No newline at end of file +} // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 5a0408e2c..6cede44a8 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -42,7 +42,6 @@ constexpr u32 MaxStageTypes = static_cast(LogicalStage::NumLogicalStages); struct LocalRuntimeInfo { u32 ls_stride; - bool links_with_tcs; auto operator<=>(const LocalRuntimeInfo&) const noexcept = default; }; @@ -85,6 +84,8 @@ struct VertexRuntimeInfo { std::array outputs; bool emulate_depth_negative_one_to_one{}; bool clip_disable{}; + u32 step_rate_0; + u32 step_rate_1; // Domain AmdGpu::TessellationType tess_type; AmdGpu::TessellationTopology tess_topology; @@ -96,7 +97,8 @@ struct VertexRuntimeInfo { clip_disable == other.clip_disable && tess_type == other.tess_type && tess_topology == other.tess_topology && tess_partitioning == other.tess_partitioning && - hs_output_cp_stride == other.hs_output_cp_stride; + hs_output_cp_stride == other.hs_output_cp_stride && + step_rate_0 == other.step_rate_0 && step_rate_1 == other.step_rate_1; } void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) { diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index e40309aaf..d3e671c58 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -13,7 +13,7 @@ namespace Shader { struct VsAttribSpecialization { - s32 num_components{}; + u32 divisor{}; AmdGpu::NumberClass num_class{}; AmdGpu::CompMapping dst_select{}; @@ -74,13 +74,13 @@ struct SamplerSpecialization { * after the first compilation of a module. */ struct StageSpecialization { - static constexpr size_t MaxStageResources = 64; + static constexpr size_t MaxStageResources = 128; const Shader::Info* info; RuntimeInfo runtime_info; + std::bitset bitset{}; std::optional fetch_shader_data{}; boost::container::small_vector vs_attribs; - std::bitset bitset{}; boost::container::small_vector buffers; boost::container::small_vector images; boost::container::small_vector fmasks; @@ -94,10 +94,16 @@ struct StageSpecialization { if (info_.stage == Stage::Vertex && fetch_shader_data) { // Specialize shader on VS input number types to follow spec. ForEachSharp(vs_attribs, fetch_shader_data->attributes, - [&profile_](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { - spec.num_components = desc.UsesStepRates() - ? AmdGpu::NumComponents(sharp.GetDataFmt()) - : 0; + [&profile_, this](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { + using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType; + if (const auto step_rate = desc.GetStepRate(); + step_rate != InstanceIdType::None) { + spec.divisor = step_rate == InstanceIdType::OverStepRate0 + ? runtime_info.vs_info.step_rate_0 + : (step_rate == InstanceIdType::OverStepRate1 + ? runtime_info.vs_info.step_rate_1 + : 1); + } spec.num_class = profile_.support_legacy_vertex_attributes ? AmdGpu::NumberClass{} : AmdGpu::GetNumberClass(sharp.GetNumberFmt()); diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 28444ac60..42e3c61a5 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -198,10 +198,13 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si } void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) { + const auto& regs = liverpool->regs; Vulkan::VertexInputs attributes; Vulkan::VertexInputs bindings; + Vulkan::VertexInputs divisors; Vulkan::VertexInputs guest_buffers; - pipeline.GetVertexInputs(attributes, bindings, guest_buffers); + pipeline.GetVertexInputs(attributes, bindings, divisors, guest_buffers, + regs.vgt_instance_step_rate_0, regs.vgt_instance_step_rate_1); if (instance.IsVertexInputDynamicState()) { // Update current vertex inputs. diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 7c020a012..3596bb041 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -72,12 +72,21 @@ GraphicsPipeline::GraphicsPipeline( VertexInputs vertex_attributes; VertexInputs vertex_bindings; + VertexInputs divisors; VertexInputs guest_buffers; if (!instance.IsVertexInputDynamicState()) { - GetVertexInputs(vertex_attributes, vertex_bindings, guest_buffers); + const auto& vs_info = runtime_infos[u32(Shader::LogicalStage::Vertex)].vs_info; + GetVertexInputs(vertex_attributes, vertex_bindings, divisors, guest_buffers, + vs_info.step_rate_0, vs_info.step_rate_1); } + const vk::PipelineVertexInputDivisorStateCreateInfo divisor_state = { + .vertexBindingDivisorCount = static_cast(divisors.size()), + .pVertexBindingDivisors = divisors.data(), + }; + const vk::PipelineVertexInputStateCreateInfo vertex_input_info = { + .pNext = divisors.empty() ? nullptr : &divisor_state, .vertexBindingDescriptionCount = static_cast(vertex_bindings.size()), .pVertexBindingDescriptions = vertex_bindings.data(), .vertexAttributeDescriptionCount = static_cast(vertex_attributes.size()), @@ -304,19 +313,17 @@ GraphicsPipeline::GraphicsPipeline( GraphicsPipeline::~GraphicsPipeline() = default; template -void GraphicsPipeline::GetVertexInputs(VertexInputs& attributes, - VertexInputs& bindings, - VertexInputs& guest_buffers) const { +void GraphicsPipeline::GetVertexInputs( + VertexInputs& attributes, VertexInputs& bindings, + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, u32 step_rate_1) const { + using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType; if (!fetch_shader || fetch_shader->attributes.empty()) { return; } const auto& vs_info = GetStage(Shader::LogicalStage::Vertex); for (const auto& attrib : fetch_shader->attributes) { - if (attrib.UsesStepRates()) { - // Skip attribute binding as the data will be pulled by shader. - continue; - } - + const auto step_rate = attrib.GetStepRate(); const auto& buffer = attrib.GetSharp(vs_info); attributes.push_back(Attribute{ .location = attrib.semantic, @@ -327,12 +334,19 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs& attributes, bindings.push_back(Binding{ .binding = attrib.semantic, .stride = buffer.GetStride(), - .inputRate = attrib.GetStepRate() == Shader::Gcn::VertexAttribute::InstanceIdType::None - ? vk::VertexInputRate::eVertex - : vk::VertexInputRate::eInstance, + .inputRate = step_rate == InstanceIdType::None ? vk::VertexInputRate::eVertex + : vk::VertexInputRate::eInstance, }); + const u32 divisor = step_rate == InstanceIdType::OverStepRate0 + ? step_rate_0 + : (step_rate == InstanceIdType::OverStepRate1 ? step_rate_1 : 1); if constexpr (std::is_same_v) { - bindings.back().divisor = 1; + bindings.back().divisor = divisor; + } else if (step_rate != InstanceIdType::None) { + divisors.push_back(vk::VertexInputBindingDivisorDescriptionEXT{ + .binding = attrib.semantic, + .divisor = divisor, + }); } guest_buffers.emplace_back(buffer); } @@ -342,11 +356,13 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs& attributes, template void GraphicsPipeline::GetVertexInputs( VertexInputs& attributes, VertexInputs& bindings, - VertexInputs& guest_buffers) const; + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, u32 step_rate_1) const; template void GraphicsPipeline::GetVertexInputs( VertexInputs& attributes, VertexInputs& bindings, - VertexInputs& guest_buffers) const; + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, u32 step_rate_1) const; void GraphicsPipeline::BuildDescSetLayout() { boost::container::small_vector bindings; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 59230ae46..ab67a52b4 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -81,7 +81,9 @@ public: /// Gets the attributes and bindings for vertex inputs. template void GetVertexInputs(VertexInputs& attributes, VertexInputs& bindings, - VertexInputs& guest_buffers) const; + VertexInputs& divisors, + VertexInputs& guest_buffers, u32 step_rate_0, + u32 step_rate_1) const; private: void BuildDescSetLayout(); diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 237fa202d..85fc993a9 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -248,6 +248,7 @@ bool Instance::CreateDevice() { // Required ASSERT(add_extension(VK_KHR_SWAPCHAIN_EXTENSION_NAME)); ASSERT(add_extension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME)); + ASSERT(add_extension(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME)); // Optional depth_range_unrestricted = add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); @@ -436,6 +437,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT{ .legacyVertexAttributes = true, }, + vk::PhysicalDeviceVertexAttributeDivisorFeatures{ + .vertexAttributeInstanceRateDivisor = true, + }, vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{ .shaderBufferFloat32AtomicMinMax = shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax, diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 7dd468f9a..8d12b74f3 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -94,15 +94,10 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS switch (stage) { case Stage::Local: { BuildCommon(regs.ls_program); - if (regs.stage_enable.IsStageEnabled(static_cast(Stage::Hull))) { - info.ls_info.links_with_tcs = true; - Shader::TessellationDataConstantBuffer tess_constants; - const auto* pgm = regs.ProgramForStage(static_cast(Stage::Hull)); - const auto params = Liverpool::GetParams(*pgm); - const auto& hull_info = program_cache.at(params.hash)->info; - hull_info.ReadTessConstantBuffer(tess_constants); - info.ls_info.ls_stride = tess_constants.ls_stride; - } + Shader::TessellationDataConstantBuffer tess_constants; + const auto* hull_info = infos[u32(Shader::LogicalStage::TessellationControl)]; + hull_info->ReadTessConstantBuffer(tess_constants); + info.ls_info.ls_stride = tess_constants.ls_stride; break; } case Stage::Hull: { @@ -122,6 +117,8 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS case Stage::Vertex: { BuildCommon(regs.vs_program); GatherVertexOutputs(info.vs_info, regs.vs_output_control); + info.vs_info.step_rate_0 = regs.vgt_instance_step_rate_0; + info.vs_info.step_rate_1 = regs.vgt_instance_step_rate_1; info.vs_info.emulate_depth_negative_one_to_one = !instance.IsDepthClipControlSupported() && regs.clipper_control.clip_space == Liverpool::ClipSpace::MinusWToW; @@ -460,10 +457,6 @@ bool PipelineCache::RefreshGraphicsKey() { // Stride will still be handled outside the pipeline using dynamic state. u32 vertex_binding = 0; for (const auto& attrib : fetch_shader->attributes) { - if (attrib.UsesStepRates()) { - // Skip attribute binding as the data will be pulled by shader. - continue; - } const auto& buffer = attrib.GetSharp(*vs_info); ASSERT(vertex_binding < MaxVertexBufferCount); key.vertex_buffer_formats[vertex_binding++] = diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 5d0a14ce3..2a645f338 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -20,12 +20,9 @@ namespace Vulkan { static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) { - Shader::PushData push_data{}; - push_data.step0 = regs.vgt_instance_step_rate_0; - push_data.step1 = regs.vgt_instance_step_rate_1; - // TODO(roamic): Add support for multiple viewports and geometry shaders when ViewportIndex // is encountered and implemented in the recompiler. + Shader::PushData push_data{}; push_data.xoffset = regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f; push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f; push_data.yoffset = regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f; From 00f4eeddaf080ec65c9ac6d63141577c81819449 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 14 Jul 2025 21:23:18 +0300 Subject: [PATCH 21/22] renderer_vulkan: Handle more miscellaneous GPU settings (#3241) * renderer_vulkan: Respect provoking vertex setting * renderer_vulkan: Handle rasterization discard * renderer_vulkan: Implement logic ops * renderer_vulkan: Properly implement depth clamp and clip * renderer_vulkan: Handle line width * Fix build * vk_pipeline_cache: Don't check depth clamp without a depth buffer * liverpool: Fix line control offset * vk_pipeline_cache: Don't run search if depth clamp is disabled * vk_pipeline_cache: Allow using viewport range when it's more restrictive then depth clamp * liverpool: Disable depth clip when near and far planes have different setting * vk_graphics_pipeline: Move warning to pipeline * vk_pipeline_cache: Revert viewport check and remove log * vk_graphics_pipeline: Enable depth clamp when depth clip is disabled and extension is not supported Without the depth clip extension depth clipping is controlled by depth clamping --- src/video_core/amdgpu/liverpool.h | 51 ++++++++--- .../renderer_vulkan/liverpool_to_vk.cpp | 40 +++++++++ .../renderer_vulkan/liverpool_to_vk.h | 2 + .../renderer_vulkan/vk_graphics_pipeline.cpp | 79 +++++++++++++---- .../renderer_vulkan/vk_graphics_pipeline.h | 32 ++++--- .../renderer_vulkan/vk_instance.cpp | 23 +++++ src/video_core/renderer_vulkan/vk_instance.h | 23 +++++ .../renderer_vulkan/vk_pipeline_cache.cpp | 85 +++++++++++++++---- .../renderer_vulkan/vk_pipeline_cache.h | 2 + .../renderer_vulkan/vk_rasterizer.cpp | 16 ++-- .../renderer_vulkan/vk_rasterizer.h | 1 + .../renderer_vulkan/vk_scheduler.cpp | 10 ++- src/video_core/renderer_vulkan/vk_scheduler.h | 26 +++++- 13 files changed, 323 insertions(+), 67 deletions(-) diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index c07e9f63a..c517285fb 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -304,6 +304,14 @@ struct Liverpool { } }; + struct LineControl { + u32 width_fixed_point; + + float Width() const { + return static_cast(width_fixed_point) / 8.0; + } + }; + struct ModeControl { s32 msaa_enable : 1; s32 vport_scissor_enable : 1; @@ -513,9 +521,16 @@ struct Liverpool { BitField<19, 1, ClipSpace> clip_space; BitField<21, 1, PrimKillCond> vtx_kill_or; BitField<22, 1, u32> dx_rasterization_kill; - BitField<23, 1, u32> dx_linear_attr_clip_enable; + BitField<24, 1, u32> dx_linear_attr_clip_enable; BitField<26, 1, u32> zclip_near_disable; - BitField<26, 1, u32> zclip_far_disable; + BitField<27, 1, u32> zclip_far_disable; + + bool ZclipEnable() const { + if (zclip_near_disable != zclip_far_disable) { + return false; + } + return !zclip_near_disable; + } }; enum class PolygonMode : u32 { @@ -738,12 +753,7 @@ struct Liverpool { u32 data_w; }; - struct BlendConstants { - float red; - float green; - float blue; - float alpha; - }; + using BlendConstants = std::array; union BlendControl { enum class BlendFactor : u32 { @@ -796,11 +806,29 @@ struct Liverpool { Err = 4u, FmaskDecompress = 5u, }; + enum class LogicOp : u32 { + Clear = 0x00, + Nor = 0x11, + AndInverted = 0x22, + CopyInverted = 0x33, + AndReverse = 0x44, + Invert = 0x55, + Xor = 0x66, + Nand = 0x77, + And = 0x88, + Equiv = 0x99, + Noop = 0xAA, + OrInverted = 0xBB, + Copy = 0xCC, + OrReverse = 0xDD, + Or = 0xEE, + Set = 0xFF, + }; BitField<0, 1, u32> disable_dual_quad; BitField<3, 1, u32> degamma_enable; BitField<4, 3, OperationMode> mode; - BitField<16, 8, u32> rop3; + BitField<16, 8, LogicOp> rop3; }; struct ColorBuffer { @@ -1369,7 +1397,9 @@ struct Liverpool { PolygonControl polygon_control; ViewportControl viewport_control; VsOutputControl vs_output_control; - INSERT_PADDING_WORDS(0xA287 - 0xA207 - 1); + INSERT_PADDING_WORDS(0xA287 - 0xA207 - 6); + LineControl line_control; + INSERT_PADDING_WORDS(4); HsTessFactorClamp hs_clamp; INSERT_PADDING_WORDS(0xA290 - 0xA287 - 2); GsMode vgt_gs_mode; @@ -1695,6 +1725,7 @@ static_assert(GFX6_3D_REG_INDEX(color_control) == 0xA202); static_assert(GFX6_3D_REG_INDEX(clipper_control) == 0xA204); static_assert(GFX6_3D_REG_INDEX(viewport_control) == 0xA206); static_assert(GFX6_3D_REG_INDEX(vs_output_control) == 0xA207); +static_assert(GFX6_3D_REG_INDEX(line_control) == 0xA282); static_assert(GFX6_3D_REG_INDEX(hs_clamp) == 0xA287); static_assert(GFX6_3D_REG_INDEX(vgt_gs_mode) == 0xA290); static_assert(GFX6_3D_REG_INDEX(mode_control) == 0xA292); diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index 5972296c0..fd1a91260 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -245,6 +245,46 @@ vk::BlendOp BlendOp(Liverpool::BlendControl::BlendFunc func) { } } +vk::LogicOp LogicOp(Liverpool::ColorControl::LogicOp logic_op) { + using LogicOp = Liverpool::ColorControl::LogicOp; + switch (logic_op) { + case LogicOp::Clear: + return vk::LogicOp::eClear; + case LogicOp::Nor: + return vk::LogicOp::eNor; + case LogicOp::AndInverted: + return vk::LogicOp::eAndInverted; + case LogicOp::CopyInverted: + return vk::LogicOp::eCopyInverted; + case LogicOp::AndReverse: + return vk::LogicOp::eAndReverse; + case LogicOp::Invert: + return vk::LogicOp::eInvert; + case LogicOp::Xor: + return vk::LogicOp::eXor; + case LogicOp::Nand: + return vk::LogicOp::eNand; + case LogicOp::And: + return vk::LogicOp::eAnd; + case LogicOp::Equiv: + return vk::LogicOp::eEquivalent; + case LogicOp::Noop: + return vk::LogicOp::eNoOp; + case LogicOp::OrInverted: + return vk::LogicOp::eOrInverted; + case LogicOp::Copy: + return vk::LogicOp::eCopy; + case LogicOp::OrReverse: + return vk::LogicOp::eOrReverse; + case LogicOp::Or: + return vk::LogicOp::eOr; + case LogicOp::Set: + return vk::LogicOp::eSet; + default: + UNREACHABLE_MSG("Unknown logic op {}", u32(logic_op)); + } +} + // https://github.com/chaotic-cx/mesa-mirror/blob/0954afff5/src/amd/vulkan/radv_sampler.c#L21 vk::SamplerAddressMode ClampMode(AmdGpu::ClampMode mode) { switch (mode) { diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.h b/src/video_core/renderer_vulkan/liverpool_to_vk.h index 61fd4a8c1..61b7ea0a9 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.h +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.h @@ -34,6 +34,8 @@ bool IsDualSourceBlendFactor(Liverpool::BlendControl::BlendFactor factor); vk::BlendOp BlendOp(Liverpool::BlendControl::BlendFunc func); +vk::LogicOp LogicOp(Liverpool::ColorControl::LogicOp logic_op); + vk::SamplerAddressMode ClampMode(AmdGpu::ClampMode mode); vk::CompareOp DepthCompare(AmdGpu::DepthCompare comp); diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 3596bb041..10e5bed5f 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -109,28 +109,63 @@ GraphicsPipeline::GraphicsPipeline( .patchControlPoints = is_rect_list ? 3U : (is_quad_list ? 4U : key.patch_control_points), }; - const vk::PipelineRasterizationStateCreateInfo raster_state = { - .depthClampEnable = false, - .rasterizerDiscardEnable = false, - .polygonMode = LiverpoolToVK::PolygonMode(key.polygon_mode), - .lineWidth = 1.0f, + vk::StructureChain raster_chain = { + vk::PipelineRasterizationStateCreateInfo{ + .depthClampEnable = key.depth_clamp_enable || + (!key.depth_clip_enable && !instance.IsDepthClipEnableSupported()), + .rasterizerDiscardEnable = false, + .polygonMode = LiverpoolToVK::PolygonMode(key.polygon_mode), + .lineWidth = 1.0f, + }, + vk::PipelineRasterizationProvokingVertexStateCreateInfoEXT{ + .provokingVertexMode = key.provoking_vtx_last == Liverpool::ProvokingVtxLast::First + ? vk::ProvokingVertexModeEXT::eFirstVertex + : vk::ProvokingVertexModeEXT::eLastVertex, + }, + vk::PipelineRasterizationDepthClipStateCreateInfoEXT{ + .depthClipEnable = key.depth_clip_enable, + }, }; + if (!instance.IsProvokingVertexSupported()) { + raster_chain.unlink(); + } + if (!instance.IsDepthClipEnableSupported()) { + raster_chain.unlink(); + } + const vk::PipelineMultisampleStateCreateInfo multisampling = { .rasterizationSamples = LiverpoolToVK::NumSamples(key.num_samples, instance.GetFramebufferSampleCounts()), .sampleShadingEnable = false, }; - const vk::PipelineViewportDepthClipControlCreateInfoEXT clip_control = { - .negativeOneToOne = key.clip_space == Liverpool::ClipSpace::MinusWToW, + const vk::DepthClampRangeEXT depth_clamp_range = { + .minDepthClamp = key.min_depth_clamp, + .maxDepthClamp = key.max_depth_clamp, }; - const vk::PipelineViewportStateCreateInfo viewport_info = { - .pNext = instance.IsDepthClipControlSupported() ? &clip_control : nullptr, + vk::StructureChain viewport_chain = { + vk::PipelineViewportStateCreateInfo{}, + vk::PipelineViewportDepthClipControlCreateInfoEXT{ + .negativeOneToOne = key.clip_space == Liverpool::ClipSpace::MinusWToW, + }, + vk::PipelineViewportDepthClampControlCreateInfoEXT{ + .depthClampMode = key.depth_clamp_user_defined_range + ? vk::DepthClampModeEXT::eUserDefinedRange + : vk::DepthClampModeEXT::eViewportRange, + .pDepthClampRange = &depth_clamp_range, + }, }; - boost::container::static_vector dynamic_states = { + if (!instance.IsDepthClampControlSupported()) { + viewport_chain.unlink(); + } + if (!instance.IsDepthClipControlSupported()) { + viewport_chain.unlink(); + } + + boost::container::static_vector dynamic_states = { vk::DynamicState::eViewportWithCount, vk::DynamicState::eScissorWithCount, vk::DynamicState::eBlendConstants, vk::DynamicState::eDepthTestEnable, vk::DynamicState::eDepthWriteEnable, vk::DynamicState::eDepthCompareOp, @@ -138,7 +173,8 @@ GraphicsPipeline::GraphicsPipeline( vk::DynamicState::eStencilTestEnable, vk::DynamicState::eStencilReference, vk::DynamicState::eStencilCompareMask, vk::DynamicState::eStencilWriteMask, vk::DynamicState::eStencilOp, vk::DynamicState::eCullMode, - vk::DynamicState::eFrontFace, + vk::DynamicState::eFrontFace, vk::DynamicState::eRasterizerDiscardEnable, + vk::DynamicState::eLineWidth, }; if (instance.IsPrimitiveRestartDisableSupported()) { @@ -221,11 +257,19 @@ GraphicsPipeline::GraphicsPipeline( }); } + const auto depth_format = + instance.GetSupportedFormat(LiverpoolToVK::DepthFormat(key.z_format, key.stencil_format), + vk::FormatFeatureFlagBits2::eDepthStencilAttachment); const vk::PipelineRenderingCreateInfo pipeline_rendering_ci = { .colorAttachmentCount = key.num_color_attachments, .pColorAttachmentFormats = key.color_formats.data(), - .depthAttachmentFormat = key.depth_format, - .stencilAttachmentFormat = key.stencil_format, + .depthAttachmentFormat = key.z_format != Liverpool::DepthBuffer::ZFormat::Invalid + ? depth_format + : vk::Format::eUndefined, + .stencilAttachmentFormat = + key.stencil_format != Liverpool::DepthBuffer::StencilFormat::Invalid + ? depth_format + : vk::Format::eUndefined, }; std::array attachments; @@ -280,8 +324,9 @@ GraphicsPipeline::GraphicsPipeline( } const vk::PipelineColorBlendStateCreateInfo color_blending = { - .logicOpEnable = false, - .logicOp = vk::LogicOp::eCopy, + .logicOpEnable = + instance.IsLogicOpSupported() && key.logic_op != Liverpool::ColorControl::LogicOp::Copy, + .logicOp = LiverpoolToVK::LogicOp(key.logic_op), .attachmentCount = key.num_color_attachments, .pAttachments = attachments.data(), .blendConstants = std::array{1.0f, 1.0f, 1.0f, 1.0f}, @@ -294,8 +339,8 @@ GraphicsPipeline::GraphicsPipeline( .pVertexInputState = !instance.IsVertexInputDynamicState() ? &vertex_input_info : nullptr, .pInputAssemblyState = &input_assembly, .pTessellationState = &tessellation_state, - .pViewportState = &viewport_info, - .pRasterizationState = &raster_state, + .pViewportState = &viewport_chain.get(), + .pRasterizationState = &raster_chain.get(), .pMultisampleState = &multisampling, .pColorBlendState = &color_blending, .pDynamicState = &dynamic_info, diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index ab67a52b4..1ecfa6b42 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -33,22 +33,32 @@ using VertexInputs = boost::container::static_vector; struct GraphicsPipelineKey { std::array stage_hashes; + std::array vertex_buffer_formats; + u32 patch_control_points; u32 num_color_attachments; std::array color_formats; std::array color_buffers; - vk::Format depth_format; - vk::Format stencil_format; - - u32 num_samples; - u32 mrt_mask; - AmdGpu::PrimitiveType prim_type; - Liverpool::PolygonMode polygon_mode; - Liverpool::ClipSpace clip_space; - Liverpool::ColorBufferMask cb_shader_mask; std::array blend_controls; std::array write_masks; - std::array vertex_buffer_formats; - u32 patch_control_points; + Liverpool::ColorBufferMask cb_shader_mask; + Liverpool::ColorControl::LogicOp logic_op; + u32 num_samples; + u32 mrt_mask; + struct { + Liverpool::DepthBuffer::ZFormat z_format : 2; + Liverpool::DepthBuffer::StencilFormat stencil_format : 1; + u32 depth_clamp_enable : 1; + u32 depth_clamp_user_defined_range : 1; + float min_depth_clamp; + float max_depth_clamp; + }; + struct { + AmdGpu::PrimitiveType prim_type : 5; + Liverpool::PolygonMode polygon_mode : 2; + Liverpool::ClipSpace clip_space : 1; + Liverpool::ProvokingVtxLast provoking_vtx_last : 1; + u32 depth_clip_enable : 1; + }; bool operator==(const GraphicsPipelineKey& key) const noexcept { return std::memcmp(this, &key, sizeof(key)) == 0; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 85fc993a9..3a461b321 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -270,10 +270,13 @@ bool Instance::CreateDevice() { } custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); depth_clip_control = add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); + depth_clip_enable = add_extension(VK_EXT_DEPTH_CLIP_ENABLE_EXTENSION_NAME); + depth_clamp_control = add_extension(VK_EXT_DEPTH_CLAMP_CONTROL_EXTENSION_NAME); vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); list_restart = add_extension(VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME); fragment_shader_barycentric = add_extension(VK_KHR_FRAGMENT_SHADER_BARYCENTRIC_EXTENSION_NAME); legacy_vertex_attributes = add_extension(VK_EXT_LEGACY_VERTEX_ATTRIBUTES_EXTENSION_NAME); + provoking_vertex = add_extension(VK_EXT_PROVOKING_VERTEX_EXTENSION_NAME); shader_stencil_export = add_extension(VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME); image_load_store_lod = add_extension(VK_AMD_SHADER_IMAGE_LOAD_STORE_LOD_EXTENSION_NAME); amd_gcn_shader = add_extension(VK_AMD_GCN_SHADER_EXTENSION_NAME); @@ -362,9 +365,11 @@ bool Instance::CreateDevice() { .dualSrcBlend = features.dualSrcBlend, .logicOp = features.logicOp, .multiDrawIndirect = features.multiDrawIndirect, + .depthClamp = features.depthClamp, .depthBiasClamp = features.depthBiasClamp, .fillModeNonSolid = features.fillModeNonSolid, .depthBounds = features.depthBounds, + .wideLines = features.wideLines, .multiViewport = features.multiViewport, .samplerAnisotropy = features.samplerAnisotropy, .vertexPipelineStoresAndAtomics = features.vertexPipelineStoresAndAtomics, @@ -418,6 +423,12 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceDepthClipControlFeaturesEXT{ .depthClipControl = true, }, + vk::PhysicalDeviceDepthClipEnableFeaturesEXT{ + .depthClipEnable = true, + }, + vk::PhysicalDeviceDepthClampControlFeaturesEXT{ + .depthClampControl = true, + }, vk::PhysicalDeviceRobustness2FeaturesEXT{ .robustBufferAccess2 = robustness2_features.robustBufferAccess2, .robustImageAccess2 = robustness2_features.robustImageAccess2, @@ -437,6 +448,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT{ .legacyVertexAttributes = true, }, + vk::PhysicalDeviceProvokingVertexFeaturesEXT{ + .provokingVertexLast = true, + }, vk::PhysicalDeviceVertexAttributeDivisorFeatures{ .vertexAttributeInstanceRateDivisor = true, }, @@ -487,6 +501,12 @@ bool Instance::CreateDevice() { if (!depth_clip_control) { device_chain.unlink(); } + if (!depth_clip_enable) { + device_chain.unlink(); + } + if (!depth_clamp_control) { + device_chain.unlink(); + } if (!robustness2) { device_chain.unlink(); } @@ -502,6 +522,9 @@ bool Instance::CreateDevice() { if (!legacy_vertex_attributes) { device_chain.unlink(); } + if (!provoking_vertex) { + device_chain.unlink(); + } if (!shader_atomic_float2) { device_chain.unlink(); } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 830b1d5c2..67dcc183a 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -109,6 +109,16 @@ public: return depth_clip_control; } + /// Returns true when VK_EXT_depth_clip_enable is supported + bool IsDepthClipEnableSupported() const { + return depth_clip_enable; + } + + /// Returns true when VK_EXT_depth_clamp_control is supported + bool IsDepthClampControlSupported() const { + return depth_clamp_control; + } + /// Returns true when VK_EXT_depth_range_unrestricted is supported bool IsDepthRangeUnrestrictedSupported() const { return depth_range_unrestricted; @@ -150,6 +160,11 @@ public: return legacy_vertex_attributes; } + /// Returns true when VK_EXT_provoking_vertex is supported. + bool IsProvokingVertexSupported() const { + return provoking_vertex; + } + /// Returns true when VK_AMD_shader_image_load_store_lod is supported. bool IsImageLoadStoreLodSupported() const { return image_load_store_lod; @@ -351,6 +366,11 @@ public: return driver_id != vk::DriverId::eMoltenvk; } + /// Returns true if logic ops are supported by the device. + bool IsLogicOpSupported() const { + return features.logicOp; + } + /// Determines if a format is supported for a set of feature flags. [[nodiscard]] bool IsFormatSupported(vk::Format format, vk::FormatFeatureFlags2 flags) const; @@ -399,12 +419,15 @@ private: bool custom_border_color{}; bool fragment_shader_barycentric{}; bool depth_clip_control{}; + bool depth_clip_enable{}; + bool depth_clamp_control{}; bool depth_range_unrestricted{}; bool dynamic_state_3{}; bool vertex_input_dynamic_state{}; bool robustness2{}; bool list_restart{}; bool legacy_vertex_attributes{}; + bool provoking_vertex{}; bool shader_stencil_export{}; bool image_load_store_lod{}; bool amd_gcn_shader{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 8d12b74f3..d9e01091e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -285,26 +285,21 @@ bool PipelineCache::RefreshGraphicsKey() { auto& regs = liverpool->regs; auto& key = graphics_key; - const auto depth_format = instance.GetSupportedFormat( - LiverpoolToVK::DepthFormat(regs.depth_buffer.z_info.format, - regs.depth_buffer.stencil_info.format), - vk::FormatFeatureFlagBits2::eDepthStencilAttachment); - if (regs.depth_buffer.DepthValid()) { - key.depth_format = depth_format; - } else { - key.depth_format = vk::Format::eUndefined; - } - if (regs.depth_buffer.StencilValid()) { - key.stencil_format = depth_format; - } else { - key.stencil_format = vk::Format::eUndefined; - } - + key.z_format = regs.depth_buffer.DepthValid() ? regs.depth_buffer.z_info.format.Value() + : Liverpool::DepthBuffer::ZFormat::Invalid; + key.stencil_format = regs.depth_buffer.StencilValid() + ? regs.depth_buffer.stencil_info.format.Value() + : Liverpool::DepthBuffer::StencilFormat::Invalid; + key.depth_clip_enable = regs.clipper_control.ZclipEnable(); + key.clip_space = regs.clipper_control.clip_space; + key.provoking_vtx_last = regs.polygon_control.provoking_vtx_last; key.prim_type = regs.primitive_type; key.polygon_mode = regs.polygon_control.PolyMode(); - key.clip_space = regs.clipper_control.clip_space; + key.logic_op = regs.color_control.rop3; key.num_samples = regs.NumSamples(); + RefreshDepthClampRange(); + const bool skip_cb_binding = regs.color_control.mode == AmdGpu::Liverpool::ColorControl::OperationMode::Disable; @@ -491,7 +486,63 @@ bool PipelineCache::RefreshGraphicsKey() { } return true; -} // namespace Vulkan +} + +void PipelineCache::RefreshDepthClampRange() { + auto& regs = liverpool->regs; + auto& key = graphics_key; + + key.depth_clamp_enable = !regs.depth_render_override.disable_viewport_clamp; + if (key.z_format == Liverpool::DepthBuffer::ZFormat::Invalid || !key.depth_clamp_enable) { + return; + } + + bool depth_clamp_can_use_viewport_range = true; + bool depth_clamp_is_same_on_all_viewports = true; + float zmin = std::numeric_limits::max(); + float zmax = std::numeric_limits::max(); + const auto& vp_ctl = regs.viewport_control; + for (u32 i = 0; i < Liverpool::NumViewports; i++) { + const auto& vp = regs.viewports[i]; + const auto& vp_d = regs.viewport_depths[i]; + if (vp.xscale == 0) { + continue; + } + const auto zoffset = vp_ctl.zoffset_enable ? vp.zoffset : 0.f; + const auto zscale = vp_ctl.zscale_enable ? vp.zscale : 1.f; + + float min_depth; + float max_depth; + if (regs.clipper_control.clip_space == AmdGpu::Liverpool::ClipSpace::MinusWToW) { + min_depth = zoffset - zscale; + max_depth = zoffset + zscale; + } else { + min_depth = zoffset; + max_depth = zoffset + zscale; + } + if (zmin == std::numeric_limits::max()) { + zmin = vp_d.zmin; + zmax = vp_d.zmax; + } + depth_clamp_is_same_on_all_viewports &= (zmin == vp_d.zmin && zmax == vp_d.zmax); + depth_clamp_can_use_viewport_range &= (min_depth == vp_d.zmin && max_depth == vp_d.zmax); + } + + if (zmin == std::numeric_limits::max()) { + return; + } + + if (!depth_clamp_can_use_viewport_range && !depth_clamp_is_same_on_all_viewports) { + LOG_ERROR(Render_Vulkan, + "Viewport depth clamping configuration cannot be accurately emulated"); + } + + key.depth_clamp_user_defined_range = !depth_clamp_can_use_viewport_range; + if (key.depth_clamp_user_defined_range) { + key.min_depth_clamp = zmin; + key.max_depth_clamp = zmax; + } +} bool PipelineCache::RefreshComputeKey() { Shader::Backend::Bindings binding{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index ba3407b48..405275439 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -76,6 +76,8 @@ private: bool RefreshGraphicsKey(); bool RefreshComputeKey(); + void RefreshDepthClampRange(); + void DumpShader(std::span code, u64 hash, Shader::Stage stage, size_t perm_idx, std::string_view ext); std::optional> GetShaderPatch(u64 hash, Shader::Stage stage, size_t perm_idx, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 2a645f338..b6130e873 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -1014,9 +1014,10 @@ void Rasterizer::UpdateDynamicState(const GraphicsPipeline& pipeline) const { UpdateViewportScissorState(); UpdateDepthStencilState(); UpdatePrimitiveState(); + UpdateRasterizationState(); auto& dynamic_state = scheduler.GetDynamicState(); - dynamic_state.SetBlendConstants(&liverpool->regs.blend_constants.red); + dynamic_state.SetBlendConstants(liverpool->regs.blend_constants); dynamic_state.SetColorWriteMasks(pipeline.GetWriteMasks()); // Commit new dynamic state to the command buffer. @@ -1086,12 +1087,6 @@ void Rasterizer::UpdateViewportScissorState() const { viewport.maxDepth = zoffset + zscale; } - if (!regs.depth_render_override.disable_viewport_clamp) { - // Apply depth clamp. - viewport.minDepth = std::max(viewport.minDepth, vp_d.zmin); - viewport.maxDepth = std::min(viewport.maxDepth, vp_d.zmax); - } - if (!instance.IsDepthRangeUnrestrictedSupported()) { // Unrestricted depth range not supported by device. Restrict to valid range. viewport.minDepth = std::max(viewport.minDepth, 0.f); @@ -1231,10 +1226,17 @@ void Rasterizer::UpdatePrimitiveState() const { const auto front_face = LiverpoolToVK::FrontFace(regs.polygon_control.front_face); dynamic_state.SetPrimitiveRestartEnabled(prim_restart); + dynamic_state.SetRasterizerDiscardEnabled(regs.clipper_control.dx_rasterization_kill); dynamic_state.SetCullMode(cull_mode); dynamic_state.SetFrontFace(front_face); } +void Rasterizer::UpdateRasterizationState() const { + const auto& regs = liverpool->regs; + auto& dynamic_state = scheduler.GetDynamicState(); + dynamic_state.SetLineWidth(regs.line_control.Width()); +} + void Rasterizer::ScopeMarkerBegin(const std::string_view& str, bool from_guest) { if ((from_guest && !Config::getVkGuestMarkersEnabled()) || (!from_guest && !Config::getVkHostMarkersEnabled())) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 1e1680258..79e7722b8 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -94,6 +94,7 @@ private: void UpdateViewportScissorState() const; void UpdateDepthStencilState() const; void UpdatePrimitiveState() const; + void UpdateRasterizationState() const; bool FilterDraw(); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index ac645c9ce..7c3429297 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -308,6 +308,10 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmd cmdbuf.setPrimitiveRestartEnable(primitive_restart_enable); } } + if (dirty_state.rasterizer_discard_enable) { + dirty_state.rasterizer_discard_enable = false; + cmdbuf.setRasterizerDiscardEnable(rasterizer_discard_enable); + } if (dirty_state.cull_mode) { dirty_state.cull_mode = false; cmdbuf.setCullMode(cull_mode); @@ -318,7 +322,7 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmd } if (dirty_state.blend_constants) { dirty_state.blend_constants = false; - cmdbuf.setBlendConstants(blend_constants); + cmdbuf.setBlendConstants(blend_constants.data()); } if (dirty_state.color_write_masks) { dirty_state.color_write_masks = false; @@ -326,6 +330,10 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmd cmdbuf.setColorWriteMaskEXT(0, color_write_masks); } } + if (dirty_state.line_width) { + dirty_state.line_width = false; + cmdbuf.setLineWidth(line_width); + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index b5678edbc..3616d8478 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -96,11 +96,13 @@ struct DynamicState { bool stencil_back_compare_mask : 1; bool primitive_restart_enable : 1; + bool rasterizer_discard_enable : 1; bool cull_mode : 1; bool front_face : 1; bool blend_constants : 1; bool color_write_masks : 1; + bool line_width : 1; } dirty_state{}; Viewports viewports{}; @@ -130,11 +132,13 @@ struct DynamicState { u32 stencil_back_compare_mask{}; bool primitive_restart_enable{}; + bool rasterizer_discard_enable{}; vk::CullModeFlags cull_mode{}; vk::FrontFace front_face{}; - float blend_constants[4]{}; + std::array blend_constants{}; ColorWriteMasks color_write_masks{}; + float line_width{}; /// Commits the dynamic state to the provided command buffer. void Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf); @@ -283,19 +287,33 @@ struct DynamicState { } } - void SetBlendConstants(const float blend_constants_[4]) { - if (!std::equal(blend_constants, std::end(blend_constants), blend_constants_)) { - std::memcpy(blend_constants, blend_constants_, sizeof(blend_constants)); + void SetBlendConstants(const std::array blend_constants_) { + if (blend_constants != blend_constants_) { + blend_constants = blend_constants_; dirty_state.blend_constants = true; } } + void SetRasterizerDiscardEnabled(const bool enabled) { + if (rasterizer_discard_enable != enabled) { + rasterizer_discard_enable = enabled; + dirty_state.rasterizer_discard_enable = true; + } + } + void SetColorWriteMasks(const ColorWriteMasks& color_write_masks_) { if (!std::ranges::equal(color_write_masks, color_write_masks_)) { color_write_masks = color_write_masks_; dirty_state.color_write_masks = true; } } + + void SetLineWidth(const float width) { + if (line_width != width) { + line_width = width; + dirty_state.line_width = true; + } + } }; class Scheduler { From b68ca43166be4bcb985f442db465b3fcc7d5e4ab Mon Sep 17 00:00:00 2001 From: kalaposfos13 <153381648+kalaposfos13@users.noreply.github.com> Date: Mon, 14 Jul 2025 22:44:13 +0200 Subject: [PATCH 22/22] Implement sceKernelGetSystemSwVersion (#3243) * Implement sceKernelGetSystemSwVersion * Set the reported firmware version to that of the game executable --- src/core/libraries/kernel/kernel.cpp | 15 +++++++++++++++ src/core/libraries/kernel/kernel.h | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/src/core/libraries/kernel/kernel.cpp b/src/core/libraries/kernel/kernel.cpp index 61d2e2f2b..a4d3accac 100644 --- a/src/core/libraries/kernel/kernel.cpp +++ b/src/core/libraries/kernel/kernel.cpp @@ -6,6 +6,7 @@ #include "common/assert.h" #include "common/debug.h" +#include "common/elf_info.h" #include "common/logging/log.h" #include "common/polyfill_thread.h" #include "common/thread.h" @@ -243,6 +244,19 @@ s32 PS4_SYSV_ABI sceKernelSetGPO() { return ORBIS_OK; } +s32 PS4_SYSV_ABI sceKernelGetSystemSwVersion(SwVersionStruct* ret) { + if (ret == nullptr) { + return ORBIS_OK; // but why? + } + ASSERT(ret->struct_size == 40); + u32 fake_fw = Common::ElfInfo::Instance().RawFirmwareVer(); + ret->hex_representation = fake_fw; + std::snprintf(ret->text_representation, 28, "%2x.%03x.%03x", fake_fw >> 0x18, + fake_fw >> 0xc & 0xfff, fake_fw & 0xfff); // why %2x? + LOG_INFO(Lib_Kernel, "called, returned sw version: {}", ret->text_representation); + return ORBIS_OK; +} + void RegisterKernel(Core::Loader::SymbolsResolver* sym) { service_thread = std::jthread{KernelServiceThread}; @@ -258,6 +272,7 @@ void RegisterKernel(Core::Loader::SymbolsResolver* sym) { Libraries::Kernel::RegisterDebug(sym); LIB_OBJ("f7uOxY9mM1U", "libkernel", 1, "libkernel", 1, 1, &g_stack_chk_guard); + LIB_FUNCTION("Mv1zUObHvXI", "libkernel", 1, "libkernel", 1, 1, sceKernelGetSystemSwVersion); LIB_FUNCTION("PfccT7qURYE", "libkernel", 1, "libkernel", 1, 1, kernel_ioctl); LIB_FUNCTION("JGfTMBOdUJo", "libkernel", 1, "libkernel", 1, 1, sceKernelGetFsSandboxRandomWord); LIB_FUNCTION("6xVpy0Fdq+I", "libkernel", 1, "libkernel", 1, 1, _sigprocmask); diff --git a/src/core/libraries/kernel/kernel.h b/src/core/libraries/kernel/kernel.h index 0529c06d5..018759e14 100644 --- a/src/core/libraries/kernel/kernel.h +++ b/src/core/libraries/kernel/kernel.h @@ -35,6 +35,12 @@ struct OrbisWrapperImpl { s32* PS4_SYSV_ABI __Error(); +struct SwVersionStruct { + u64 struct_size; + char text_representation[0x1c]; + u32 hex_representation; +}; + void RegisterKernel(Core::Loader::SymbolsResolver* sym); } // namespace Libraries::Kernel