From 3e2d4d6b793fd14e918673fbacd40914a0714ea0 Mon Sep 17 00:00:00 2001 From: psucien <168137814+psucien@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:29:57 +0200 Subject: [PATCH 01/11] Gnmdriver: More functions (#410) * libraries: gnmdriver: added `sceGnmGetGpuCoreClockFrequency` * libraries: gnmdriver: `sceGnmSetVgtControl` added * amdgpu: gpuclock64 in write eop packet --- src/core/libraries/gnmdriver/gnmdriver.cpp | 20 +++++++++++++++----- src/core/libraries/gnmdriver/gnmdriver.h | 5 +++-- src/video_core/amdgpu/pm4_cmds.h | 18 +++++++++++------- src/video_core/amdgpu/pm4_opcodes.h | 1 + 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 650252f95..c2ee6d592 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -956,9 +956,9 @@ int PS4_SYSV_ABI sceGnmGetGpuBlockStatus() { return ORBIS_OK; } -int PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency() { - LOG_DEBUG(Lib_GnmDriver, "(STUBBED) called"); - return ORBIS_OK; +u32 PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency() { + LOG_TRACE(Lib_GnmDriver, "called"); + return Config::isNeoMode() ? 
911'000'000 : 800'000'000; } int PS4_SYSV_ABI sceGnmGetGpuInfoStatus() { @@ -1706,8 +1706,18 @@ int PS4_SYSV_ABI sceGnmSetupMipStatsReport() { return ORBIS_OK; } -int PS4_SYSV_ABI sceGnmSetVgtControl() { - LOG_ERROR(Lib_GnmDriver, "(STUBBED) called"); +s32 PS4_SYSV_ABI sceGnmSetVgtControl(u32* cmdbuf, u32 size, u32 prim_group_sz_minus_one, + u32 partial_vs_wave_mode, u32 wd_switch_only_on_eop_mode) { + LOG_TRACE(Lib_GnmDriver, "called"); + + if (!cmdbuf || size != 3 || (prim_group_sz_minus_one >= 0x100) || + ((wd_switch_only_on_eop_mode | partial_vs_wave_mode) >= 2)) { + return -1; + } + + const u32 reg_value = + ((partial_vs_wave_mode & 1) << 0x10) | (prim_group_sz_minus_one & 0xffffu); + PM4CmdSetData::SetContextReg(cmdbuf, 0x2aau, reg_value); // IA_MULTI_VGT_PARAM return ORBIS_OK; } diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index 8100b1164..84872297e 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -85,7 +85,7 @@ int PS4_SYSV_ABI sceGnmGetDebugTimestamp(); int PS4_SYSV_ABI sceGnmGetEqEventType(); int PS4_SYSV_ABI sceGnmGetEqTimeStamp(); int PS4_SYSV_ABI sceGnmGetGpuBlockStatus(); -int PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency(); +u32 PS4_SYSV_ABI sceGnmGetGpuCoreClockFrequency(); int PS4_SYSV_ABI sceGnmGetGpuInfoStatus(); int PS4_SYSV_ABI sceGnmGetLastWaitedAddress(); int PS4_SYSV_ABI sceGnmGetNumTcaUnits(); @@ -161,7 +161,8 @@ int PS4_SYSV_ABI sceGnmSetResourceUserData(); int PS4_SYSV_ABI sceGnmSetSpiEnableSqCounters(); int PS4_SYSV_ABI sceGnmSetSpiEnableSqCountersForUnitInstance(); int PS4_SYSV_ABI sceGnmSetupMipStatsReport(); -int PS4_SYSV_ABI sceGnmSetVgtControl(); +s32 PS4_SYSV_ABI sceGnmSetVgtControl(u32* cmdbuf, u32 size, u32 prim_group_sz_minus_one, + u32 partial_vs_wave_mode, u32 wd_switch_only_on_eop_mode); s32 PS4_SYSV_ABI sceGnmSetVsShader(u32* cmdbuf, u32 size, const u32* vs_regs, u32 shader_modifier); int PS4_SYSV_ABI 
sceGnmSetWaveLimitMultiplier(); int PS4_SYSV_ABI sceGnmSetWaveLimitMultipliers(); diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index e5f618cc2..5ab233fdc 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -282,6 +282,13 @@ enum class InterruptSelect : u32 { IrqUndocumented = 3, }; +static u64 GetGpuClock64() { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + auto ticks = std::chrono::duration_cast(duration).count(); + return static_cast(ticks); +} + struct PM4CmdEventWriteEop { PM4Type3Header header; union { @@ -325,6 +332,10 @@ struct PM4CmdEventWriteEop { *Address() = DataQWord(); break; } + case DataSelect::GpuClock64: { + *Address() = GetGpuClock64(); + break; + } case DataSelect::PerfCounter: { *Address() = Common::FencedRDTSC(); break; @@ -652,13 +663,6 @@ struct PM4CmdReleaseMem { return data_lo | u64(data_hi) << 32; } - uint64_t GetGpuClock64() const { - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - auto ticks = std::chrono::duration_cast(duration).count(); - return static_cast(ticks); - } - void SignalFence(Platform::InterruptId irq_id) const { switch (data_sel.Value()) { case DataSelect::Data32Low: { diff --git a/src/video_core/amdgpu/pm4_opcodes.h b/src/video_core/amdgpu/pm4_opcodes.h index 8922c4ea3..fba0cbb9f 100644 --- a/src/video_core/amdgpu/pm4_opcodes.h +++ b/src/video_core/amdgpu/pm4_opcodes.h @@ -41,6 +41,7 @@ enum class PM4ItOpcode : u32 { CondIndirectBuffer = 0x3F, CopyData = 0x40, CommandProcessorDma = 0x41, + PfpSyncMe = 0x42, SurfaceSync = 0x43, CondWrite = 0x45, EventWrite = 0x46, From ace39957efa1f8b65902b7b197a9e8983dc99334 Mon Sep 17 00:00:00 2001 From: psucien <168137814+psucien@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:46:45 +0200 Subject: [PATCH 02/11] Video Core: debug tools (#412) * video_core: better use of rdoc markers * renderer_vulkan: added gpu 
assisted validation * renderer_vulkan: make nv_checkpoints operational * video_core: unified Vulkan objects names --- src/common/config.cpp | 14 ++++ src/common/config.h | 2 + src/video_core/amdgpu/liverpool.cpp | 36 +++++++--- src/video_core/buffer_cache/buffer.cpp | 12 ++-- .../renderer_vulkan/vk_instance.cpp | 11 ++++ src/video_core/renderer_vulkan/vk_instance.h | 5 ++ .../renderer_vulkan/vk_platform.cpp | 65 ++++++++++++++++--- .../renderer_vulkan/vk_rasterizer.cpp | 30 ++++++++- .../renderer_vulkan/vk_rasterizer.h | 4 +- .../renderer_vulkan/vk_scheduler.cpp | 7 ++ src/video_core/texture_cache/image.cpp | 4 ++ 11 files changed, 161 insertions(+), 29 deletions(-) diff --git a/src/common/config.cpp b/src/common/config.cpp index ebdd9c320..3cf9af150 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -25,7 +25,9 @@ static bool shouldDumpPM4 = false; static u32 vblankDivider = 1; static bool vkValidation = false; static bool vkValidationSync = false; +static bool vkValidationGpu = false; static bool rdocEnable = false; +static bool rdocMarkersEnable = false; // Gui std::string settings_install_dir = ""; u32 main_window_geometry_x = 400; @@ -102,6 +104,10 @@ bool isRdocEnabled() { return rdocEnable; } +bool isMarkersEnabled() { + return rdocMarkersEnable; +} + u32 vblankDiv() { return vblankDivider; } @@ -114,6 +120,10 @@ bool vkValidationSyncEnabled() { return vkValidationSync; } +bool vkValidationGpuEnabled() { + return vkValidationGpu; +} + void setScreenWidth(u32 width) { screenWidth = width; } @@ -319,7 +329,9 @@ void load(const std::filesystem::path& path) { gpuId = toml::find_or(vk, "gpuId", -1); vkValidation = toml::find_or(vk, "validation", false); vkValidationSync = toml::find_or(vk, "validation_sync", false); + vkValidationGpu = toml::find_or(vk, "validation_gpu", true); rdocEnable = toml::find_or(vk, "rdocEnable", false); + rdocMarkersEnable = toml::find_or(vk, "rdocMarkersEnable", false); } if (data.contains("Debug")) { @@ -394,7 +406,9 @@ 
void save(const std::filesystem::path& path) { data["Vulkan"]["gpuId"] = gpuId; data["Vulkan"]["validation"] = vkValidation; data["Vulkan"]["validation_sync"] = vkValidationSync; + data["Vulkan"]["validation_gpu"] = vkValidationGpu; data["Vulkan"]["rdocEnable"] = rdocEnable; + data["Vulkan"]["rdocMarkersEnable"] = rdocMarkersEnable; data["Debug"]["DebugDump"] = isDebugDump; data["LLE"]["libc"] = isLibc; data["GUI"]["theme"] = mw_themes; diff --git a/src/common/config.h b/src/common/config.h index ad0aad221..37ace79c3 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -27,6 +27,7 @@ bool nullGpu(); bool dumpShaders(); bool dumpPM4(); bool isRdocEnabled(); +bool isMarkersEnabled(); u32 vblankDiv(); void setDebugDump(bool enable); @@ -50,6 +51,7 @@ void setRdocEnabled(bool enable); bool vkValidationEnabled(); bool vkValidationSyncEnabled(); +bool vkValidationGpuEnabled(); // Gui void setMainWindowGeometry(u32 x, u32 y, u32 w, u32 h); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index af1963eec..bd32b5b96 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -180,6 +180,17 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanSignal(Platform::InterruptId::GfxFlip); break; } + case PM4CmdNop::PayloadType::DebugMarkerPush: { + const auto marker_sz = nop->header.count.Value() * 2; + const std::string_view label{reinterpret_cast(&nop->data_block[1]), + marker_sz}; + rasterizer->ScopeMarkerBegin(label); + break; + } + case PM4CmdNop::PayloadType::DebugMarkerPop: { + rasterizer->ScopeMarkerEnd(); + break; + } default: break; } @@ -295,8 +306,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index->draw_initiator; if (rasterizer) { - rasterizer->ScopeMarkerBegin( - fmt::format("dcb:{}:DrawIndex2", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + 
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndex2", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->Draw(true); rasterizer->ScopeMarkerEnd(); } @@ -308,8 +320,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index_off->draw_initiator; if (rasterizer) { - rasterizer->ScopeMarkerBegin(fmt::format( - "dcb:{}:DrawIndexOffset2", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndexOffset2", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->Draw(true, draw_index_off->index_offset); rasterizer->ScopeMarkerEnd(); } @@ -320,8 +333,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanindex_count; regs.draw_initiator = draw_index->draw_initiator; if (rasterizer) { - rasterizer->ScopeMarkerBegin( - fmt::format("dcb:{}:DrawIndexAuto", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndexAuto", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->Draw(false); rasterizer->ScopeMarkerEnd(); } @@ -334,8 +348,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spandim_z; regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { - rasterizer->ScopeMarkerBegin( - fmt::format("dcb:{}:Dispatch", reinterpret_cast(dcb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:Dispatch", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->DispatchDirect(); rasterizer->ScopeMarkerEnd(); } @@ -486,8 +501,9 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { regs.cs_program.dim_z = dispatch_direct->dim_z; regs.cs_program.dispatch_initiator = 
dispatch_direct->dispatch_initiator; if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { - rasterizer->ScopeMarkerBegin(fmt::format( - "acb[{}]:{}:Dispatch", vqid, reinterpret_cast(acb.data()))); + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("acb[{}]:{}:Dispatch", vqid, cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); rasterizer->DispatchDirect(); rasterizer->ScopeMarkerEnd(); } diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp index e9498b352..d112864d5 100644 --- a/src/video_core/buffer_cache/buffer.cpp +++ b/src/video_core/buffer_cache/buffer.cpp @@ -106,10 +106,8 @@ Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_ VmaAllocationInfo alloc_info{}; buffer.Create(buffer_ci, usage, &alloc_info); - if (instance->HasDebuggingToolAttached()) { - const auto device = instance->GetDevice(); - Vulkan::SetObjectName(device, Handle(), "Buffer {:#x} {} KiB", cpu_addr, size_bytes / 1024); - } + const auto device = instance->GetDevice(); + Vulkan::SetObjectName(device, Handle(), "Buffer {:#x}:{:#x}", cpu_addr, size_bytes); // Map it if it is host visible. 
VkMemoryPropertyFlags property_flags{}; @@ -152,10 +150,8 @@ StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); const auto device = instance.GetDevice(); - if (instance.HasDebuggingToolAttached()) { - Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}): {} KiB", BufferTypeName(usage), - size_bytes / 1024); - } + Vulkan::SetObjectName(device, Handle(), "StreamBuffer({}):{:#x}", BufferTypeName(usage), + size_bytes); } std::pair StreamBuffer::Map(u64 size, u64 alignment) { diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 2d396daf0..eedba4c82 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -8,6 +8,7 @@ #include #include "common/assert.h" +#include "common/config.h" #include "sdl_window.h" #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" @@ -213,6 +214,13 @@ bool Instance::CreateDevice() { add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); add_extension(VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME); add_extension(VK_EXT_SHADER_DEMOTE_TO_HELPER_INVOCATION_EXTENSION_NAME); + const bool has_sync2 = add_extension(VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME); + + if (has_sync2) { + has_nv_checkpoints = Config::isMarkersEnabled() + ? 
add_extension(VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME) + : false; + } const auto family_properties = physical_device.getQueueFamilyProperties(); if (family_properties.empty()) { @@ -308,6 +316,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceRobustness2FeaturesEXT{ .nullDescriptor = true, }, + vk::PhysicalDeviceSynchronization2Features{ + .synchronization2 = has_sync2, + }, }; if (!color_write_en) { diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index a8c0dcf45..2f2397d64 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -88,6 +88,10 @@ public: return profiler_context; } + bool HasNvCheckpoints() const { + return has_nv_checkpoints; + } + /// Returns true when a known debugging tool is attached. bool HasDebuggingToolAttached() const { return has_renderdoc || has_nsight_graphics; @@ -259,6 +263,7 @@ private: bool debug_utils_supported{}; bool has_nsight_graphics{}; bool has_renderdoc{}; + bool has_nv_checkpoints{}; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_platform.cpp b/src/video_core/renderer_vulkan/vk_platform.cpp index 0915514b8..33113c58b 100644 --- a/src/video_core/renderer_vulkan/vk_platform.cpp +++ b/src/video_core/renderer_vulkan/vk_platform.cpp @@ -221,12 +221,61 @@ vk::UniqueInstance CreateInstance(vk::DynamicLoader& dl, Frontend::WindowSystemT vk::Bool32 enable_sync = enable_validation && Config::vkValidationSyncEnabled() ? vk::True : vk::False; - vk::LayerSettingEXT layer_set = { - .pLayerName = VALIDATION_LAYER_NAME, - .pSettingName = "validate_sync", - .type = vk::LayerSettingTypeEXT::eBool32, - .valueCount = 1, - .pValues = &enable_sync, + vk::Bool32 enable_gpuav = + enable_validation && Config::vkValidationSyncEnabled() ? vk::True : vk::False; + const char* gpuav_mode = enable_validation && Config::vkValidationGpuEnabled() + ? 
"GPU_BASED_GPU_ASSISTED" + : "GPU_BASED_NONE"; + const std::array layer_setings = { + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "validate_sync", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_sync, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "sync_queue_submit", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_sync, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "validate_gpu_based", + .type = vk::LayerSettingTypeEXT::eString, + .valueCount = 1, + .pValues = &gpuav_mode, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_reserve_binding_slot", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_descriptor_checks", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_validate_indirect_buffer", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, + vk::LayerSettingEXT{ + .pLayerName = VALIDATION_LAYER_NAME, + .pSettingName = "gpuav_buffer_copies", + .type = vk::LayerSettingTypeEXT::eBool32, + .valueCount = 1, + .pValues = &enable_gpuav, + }, }; vk::StructureChain instance_ci_chain = { @@ -238,8 +287,8 @@ vk::UniqueInstance CreateInstance(vk::DynamicLoader& dl, Frontend::WindowSystemT .ppEnabledExtensionNames = extensions.data(), }, vk::LayerSettingsCreateInfoEXT{ - .settingCount = 1, - .pSettings = &layer_set, + .settingCount = layer_setings.size(), + .pSettings = layer_setings.data(), }, }; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 51de09f7a..b6e43a1ad 100644 --- 
a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -230,16 +230,42 @@ void Rasterizer::UpdateDepthStencilState() { cmdbuf.setDepthBoundsTestEnable(depth.depth_bounds_enable); } -void Rasterizer::ScopeMarkerBegin(const std::string& str) { +void Rasterizer::ScopeMarkerBegin(const std::string_view& str) { + if (!Config::isMarkersEnabled()) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.beginDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ - .pLabelName = str.c_str(), + .pLabelName = str.data(), }); } void Rasterizer::ScopeMarkerEnd() { + if (!Config::isMarkersEnabled()) { + return; + } + const auto cmdbuf = scheduler.CommandBuffer(); cmdbuf.endDebugUtilsLabelEXT(); } +void Rasterizer::ScopedMarkerInsert(const std::string_view& str) { + if (!Config::isMarkersEnabled()) { + return; + } + + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.insertDebugUtilsLabelEXT(vk::DebugUtilsLabelEXT{ + .pLabelName = str.data(), + }); +} + +void Rasterizer::Breadcrumb(u64 id) { + if (!instance.HasNvCheckpoints()) { + return; + } + scheduler.CommandBuffer().setCheckpointNV(id); +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 685ba6e00..a151ebc27 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -35,8 +35,10 @@ public: void DispatchDirect(); - void ScopeMarkerBegin(const std::string& str); + void ScopeMarkerBegin(const std::string_view& str); void ScopeMarkerEnd(); + void ScopedMarkerInsert(const std::string_view& str); + void Breadcrumb(u64 id); void InvalidateMemory(VAddr addr, u64 size); void MapMemory(VAddr addr, u64 size); diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index fb64285f1..c74f3d07d 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ 
b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -158,6 +158,13 @@ void Scheduler::SubmitExecution(SubmitInfo& info) { try { instance.GetGraphicsQueue().submit(submit_info, info.fence); } catch (vk::DeviceLostError& err) { + if (instance.HasNvCheckpoints()) { + const auto checkpoint_data = instance.GetGraphicsQueue().getCheckpointData2NV(); + for (const auto& cp : checkpoint_data) { + LOG_CRITICAL(Render_Vulkan, "{}: {:#x}", vk::to_string(cp.stage), + reinterpret_cast(cp.pCheckpointMarker)); + } + } UNREACHABLE_MSG("Device lost during submit: {}", err.what()); } diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index f7aef8471..f1148760e 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" +#include "common/config.h" #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -154,6 +155,9 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, }; image.Create(image_ci); + + Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {:#x}:{:#x}", + info.guest_address, info.guest_size_bytes); } void Image::Transit(vk::ImageLayout dst_layout, vk::Flags dst_mask, From 3d0fdf11f0781ac8b84f5e501d8837137a45e387 Mon Sep 17 00:00:00 2001 From: psucien <168137814+psucien@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:23:01 +0200 Subject: [PATCH 03/11] Build stabilization (#413) * shader_recompiler: fix for float convert and debug asserts * libraries: kernel: correct return code on invalid semaphore * amdgpu: additional case for cb extents retrieval heuristic * removed redundant check in assert * amdgpu: fix for linear tiling mode detection in color buffers * texture_cache: fix for unexpected scheduler flushes by detiler * renderer_vulkan: missing depth 
barrier * texture_cache: missed slices in rt view; + detiler format --- .../libraries/kernel/threads/semaphore.cpp | 12 +++++++ .../spirv/emit_spirv_context_get_set.cpp | 9 +---- .../backend/spirv/emit_spirv_image.cpp | 4 +-- .../backend/spirv/emit_spirv_instructions.h | 2 +- .../ir/passes/resource_tracking_pass.cpp | 6 ++-- src/shader_recompiler/runtime_info.h | 2 +- src/video_core/amdgpu/liverpool.cpp | 2 +- src/video_core/amdgpu/liverpool.h | 2 +- src/video_core/buffer_cache/buffer.h | 4 +++ .../renderer_vulkan/vk_rasterizer.cpp | 3 +- .../renderer_vulkan/vk_scheduler.cpp | 36 ++++++++++++++++--- src/video_core/renderer_vulkan/vk_scheduler.h | 3 +- src/video_core/texture_cache/image_info.cpp | 4 ++- src/video_core/texture_cache/image_view.cpp | 2 ++ src/video_core/texture_cache/tile_manager.cpp | 3 +- 15 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index 370dba445..5441c641a 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ b/src/core/libraries/kernel/threads/semaphore.cpp @@ -174,10 +174,16 @@ s32 PS4_SYSV_ABI sceKernelCreateSema(OrbisKernelSema* sem, const char* pName, u3 } s32 PS4_SYSV_ABI sceKernelWaitSema(OrbisKernelSema sem, s32 needCount, u32* pTimeout) { + if (!sem) { + return ORBIS_KERNEL_ERROR_ESRCH; + } return sem->Wait(true, needCount, pTimeout); } s32 PS4_SYSV_ABI sceKernelSignalSema(OrbisKernelSema sem, s32 signalCount) { + if (!sem) { + return ORBIS_KERNEL_ERROR_ESRCH; + } if (!sem->Signal(signalCount)) { return ORBIS_KERNEL_ERROR_EINVAL; } @@ -185,10 +191,16 @@ s32 PS4_SYSV_ABI sceKernelSignalSema(OrbisKernelSema sem, s32 signalCount) { } s32 PS4_SYSV_ABI sceKernelPollSema(OrbisKernelSema sem, s32 needCount) { + if (!sem) { + return ORBIS_KERNEL_ERROR_ESRCH; + } return sem->Wait(false, needCount, nullptr); } int PS4_SYSV_ABI sceKernelCancelSema(OrbisKernelSema sem, s32 setCount, s32* pNumWaitThreads) { + if (!sem) { + 
return ORBIS_KERNEL_ERROR_ESRCH; + } return sem->Cancel(setCount, pNumWaitThreads); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index e85272e96..bd34ed3db 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -386,19 +386,12 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com if (is_signed) { value = ctx.OpBitFieldSExtract(ctx.S32[1], value, comp_offset, ctx.ConstU32(bit_width)); - value = ctx.OpConvertSToF(ctx.F32[1], value); } else { value = ctx.OpBitFieldUExtract(ctx.U32[1], value, comp_offset, ctx.ConstU32(bit_width)); - value = ctx.OpConvertUToF(ctx.F32[1], value); - } - } else { - if (is_signed) { - value = ctx.OpConvertSToF(ctx.F32[1], value); - } else { - value = ctx.OpConvertUToF(ctx.F32[1], value); } } + value = ctx.OpBitcast(ctx.F32[1], value); return ConvertValue(ctx, value, num_format, bit_width); } break; diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 030d39485..e2d2c1ae0 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -33,14 +33,14 @@ Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id c operands.operands); } -Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, +Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id lod, Id offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); return 
ctx.OpImageSampleExplicitLod(ctx.F32[4], sampled_image, coords, - spv::ImageOperandsMask::Lod, ctx.ConstF32(0.f)); + spv::ImageOperandsMask::Lod, lod); } Id EmitImageSampleDrefImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 51899eb4d..0b2020f18 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -359,7 +359,7 @@ Id EmitConvertU32U16(EmitContext& ctx, Id value); Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, Id offset); -Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, +Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id lod, Id offset); Id EmitImageSampleDrefImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, Id bias_lc, const IR::Value& offset); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 97438f80a..6c96faa3d 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -376,9 +376,11 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, return -1; } // We have found this pattern. Build the sharp. - std::array buffer; + std::array buffer; buffer[0] = info.pgm_base + p0->Arg(0).U32() + p0->Arg(1).U32(); - buffer[1] = handle->Arg(2).U32() | handle->Arg(3).U64() << 32; + buffer[1] = 0; + buffer[2] = handle->Arg(2).U32(); + buffer[3] = handle->Arg(3).U32(); cbuf = std::bit_cast(buffer); // Assign a binding to this sharp. 
return descriptors.Add(BufferResource{ diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 4ab71c3b7..b936e06aa 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -116,7 +116,7 @@ struct PushData { std::array buf_offsets; void AddOffset(u32 binding, u32 offset) { - ASSERT(offset < 64 && binding < 32); + ASSERT(offset < 256 && binding < buf_offsets.size()); buf_offsets[binding] = offset; } }; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index bd32b5b96..517f9d53a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -237,7 +237,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spantype3.count; - if (nop_offset == 0x0e || nop_offset == 0x0d) { + if (nop_offset == 0x0e || nop_offset == 0x0d || nop_offset == 0x0b) { ASSERT_MSG(payload[nop_offset] == 0xc0001000, "NOP hint is missing in CB setup sequence"); last_cb_extent[col_buf_id].raw = payload[nop_offset + 1]; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 3ebd9a971..e28b5680b 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -766,7 +766,7 @@ struct Liverpool { } TilingMode GetTilingMode() const { - return attrib.tile_mode_index; + return info.linear_general ? 
TilingMode::Display_Linear : attrib.tile_mode_index; } bool IsTiled() const { diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index e0d9da08f..d373fbffb 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -146,6 +146,10 @@ public: return offset; } + u64 GetFreeSize() const { + return size_bytes - offset - mapped_size; + } + private: struct Watch { u64 tick{}; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index b6e43a1ad..542624a0e 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -152,7 +152,8 @@ void Rasterizer::BeginRendering() { .stencil = regs.stencil_clear}}, }; texture_cache.TouchMeta(htile_address, false); - state.num_depth_attachments++; + state.has_depth = true; + state.has_stencil = image.info.usage.stencil; } scheduler.BeginRendering(state); } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index c74f3d07d..a6c2536ba 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -38,8 +38,7 @@ void Scheduler::BeginRendering(const RenderState& new_state) { .layerCount = 1, .colorAttachmentCount = render_state.num_color_attachments, .pColorAttachments = render_state.color_attachments.data(), - .pDepthAttachment = - render_state.num_depth_attachments ? &render_state.depth_attachment : nullptr, + .pDepthAttachment = render_state.has_depth ? 
&render_state.depth_attachment : nullptr, }; current_cmdbuf.beginRendering(rendering_info); @@ -50,6 +49,8 @@ void Scheduler::EndRendering() { return; } is_rendering = false; + current_cmdbuf.endRendering(); + boost::container::static_vector barriers; for (size_t i = 0; i < render_state.num_color_attachments; ++i) { barriers.push_back(vk::ImageMemoryBarrier{ @@ -70,10 +71,35 @@ void Scheduler::EndRendering() { }, }); } - current_cmdbuf.endRendering(); + if (render_state.has_depth) { + barriers.push_back(vk::ImageMemoryBarrier{ + .srcAccessMask = vk::AccessFlagBits::eDepthStencilAttachmentWrite, + .dstAccessMask = vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite, + .oldLayout = render_state.depth_attachment.imageLayout, + .newLayout = render_state.depth_attachment.imageLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = render_state.depth_image, + .subresourceRange = + { + .aspectMask = vk::ImageAspectFlagBits::eDepth | + (render_state.has_stencil ? vk::ImageAspectFlagBits::eStencil + : vk::ImageAspectFlagBits::eNone), + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }); + } + if (!barriers.empty()) { - current_cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eColorAttachmentOutput, - vk::PipelineStageFlagBits::eFragmentShader, + const auto src_stages = + vk::PipelineStageFlagBits::eColorAttachmentOutput | + (render_state.has_depth ? 
vk::PipelineStageFlagBits::eLateFragmentTests | + vk::PipelineStageFlagBits::eEarlyFragmentTests + : vk::PipelineStageFlagBits::eNone); + current_cmdbuf.pipelineBarrier(src_stages, vk::PipelineStageFlagBits::eFragmentShader, vk::DependencyFlagBits::eByRegion, {}, {}, barriers); } } diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index b82d558ca..1140bfbc2 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -20,7 +20,8 @@ struct RenderState { vk::RenderingAttachmentInfo depth_attachment{}; vk::Image depth_image{}; u32 num_color_attachments{}; - u32 num_depth_attachments{}; + bool has_depth{}; + bool has_stencil{}; u32 width = std::numeric_limits::max(); u32 height = std::numeric_limits::max(); diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 94917be0a..17b78a6d5 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -189,6 +189,8 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slice resources.layers = num_slices; meta_info.htile_addr = buffer.z_info.tile_surface_en ? 
htile_address : 0; usage.depth_target = true; + usage.stencil = + buffer.stencil_info.format != AmdGpu::Liverpool::DepthBuffer::StencilFormat::Invalid; guest_address = buffer.Address(); const auto depth_slice_sz = buffer.GetDepthSliceSize(); @@ -260,7 +262,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { case AmdGpu::TilingMode::Display_MacroTiled: case AmdGpu::TilingMode::Texture_MacroTiled: case AmdGpu::TilingMode::Depth_MacroTiled: { - // ASSERT(!props.is_cube && !props.is_block); + ASSERT(!props.is_block); ASSERT(num_samples == 1); std::tie(mip_info.pitch, mip_info.size) = ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index); diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index ef6163c4e..cbf77f2d8 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -92,6 +92,8 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer, bool is_vo_surface) noexcept { const auto base_format = Vulkan::LiverpoolToVK::SurfaceFormat(col_buffer.info.format, col_buffer.NumFormat()); + range.base.layer = col_buffer.view.slice_start; + range.extent.layers = col_buffer.NumSlices(); format = Vulkan::LiverpoolToVK::AdjustColorBufferFormat( base_format, col_buffer.info.comp_swap.Value(), is_vo_surface); } diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index d3a7d7960..75fa378cd 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -194,6 +194,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { case vk::Format::eR32G32Sfloat: case vk::Format::eR32G32Uint: case vk::Format::eR16G16B16A16Unorm: + case vk::Format::eR16G16B16A16Sfloat: return vk::Format::eR32G32Uint; case vk::Format::eBc2SrgbBlock: case vk::Format::eBc2UnormBlock: @@ -397,7 +398,7 @@ std::optional TileManager::TryDetile(Image& 
image) { const u32 image_size = image.info.guest_size_bytes; const auto [in_buffer, in_offset] = [&] -> std::pair { // Use stream buffer for smaller textures. - if (image_size <= StreamBufferSize) { + if (image_size <= stream_buffer.GetFreeSize()) { u32 offset = stream_buffer.Copy(image.info.guest_address, image_size); return {stream_buffer.Handle(), offset}; } From 2ba3221fc94df83d79103b957fd8888296c4f305 Mon Sep 17 00:00:00 2001 From: psucien Date: Mon, 12 Aug 2024 20:10:42 +0200 Subject: [PATCH 04/11] fix for Linux compilation (#416) --- src/video_core/amdgpu/liverpool.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index e28b5680b..779e55368 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -766,7 +766,8 @@ struct Liverpool { } TilingMode GetTilingMode() const { - return info.linear_general ? TilingMode::Display_Linear : attrib.tile_mode_index; + return info.linear_general ? 
TilingMode::Display_Linear + : attrib.tile_mode_index.Value(); } bool IsTiled() const { From a15a93997cbd4bcf14008926bcaf0db3994137ed Mon Sep 17 00:00:00 2001 From: psucien Date: Mon, 12 Aug 2024 22:52:21 +0200 Subject: [PATCH 05/11] unlink sync2 if not present (tentative fix for #418) --- src/video_core/renderer_vulkan/vk_instance.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index eedba4c82..a54f2e0c4 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -317,7 +317,7 @@ bool Instance::CreateDevice() { .nullDescriptor = true, }, vk::PhysicalDeviceSynchronization2Features{ - .synchronization2 = has_sync2, + .synchronization2 = true, }, }; @@ -328,12 +328,18 @@ bool Instance::CreateDevice() { if (!robustness) { device_chain.unlink(); } + if (!has_sync2) { + device_chain.unlink(); + } try { device = physical_device.createDeviceUnique(device_chain.get()); } catch (vk::ExtensionNotPresentError& err) { LOG_CRITICAL(Render_Vulkan, "Some required extensions are not available {}", err.what()); return false; + } catch (vk::FeatureNotPresentError& err) { + LOG_CRITICAL(Render_Vulkan, "Some required features are not available {}", err.what()); + return false; } VULKAN_HPP_DEFAULT_DISPATCHER.init(*device); From 284035d3e2089beb5a353101b919b3a0fbb410ce Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:41:26 -0700 Subject: [PATCH 06/11] Enable VK_EXT_robustness2 nullDescriptor only if supported. 
--- src/video_core/renderer_vulkan/vk_instance.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index a54f2e0c4..5beb57c44 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -164,7 +164,7 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceColorWriteEnableFeaturesEXT, vk::PhysicalDeviceVulkan12Features, vk::PhysicalDeviceVulkan13Features, vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR, - vk::PhysicalDeviceDepthClipControlFeaturesEXT>(); + vk::PhysicalDeviceDepthClipControlFeaturesEXT, vk::PhysicalDeviceRobustness2FeaturesEXT>(); const vk::StructureChain properties_chain = physical_device.getProperties2< vk::PhysicalDeviceProperties2, vk::PhysicalDevicePortabilitySubsetPropertiesKHR, vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties>(); @@ -325,7 +325,10 @@ bool Instance::CreateDevice() { device_chain.unlink(); device_chain.unlink(); } - if (!robustness) { + if (robustness) { + device_chain.get().nullDescriptor = + feature_chain.get().nullDescriptor; + } else { device_chain.unlink(); } if (!has_sync2) { From 18f179928007c99dd2f5a739cf04a7b49dbab45f Mon Sep 17 00:00:00 2001 From: Borchev <4501931+Borchev@users.noreply.github.com> Date: Mon, 12 Aug 2024 23:05:30 -0700 Subject: [PATCH 07/11] Add partial unmap support (#322) * Add partial unmap support * undo accidental whitespace removal * Fix assertions * Adjust Reserve and Free functions for partial unmapping --- src/core/address_space.cpp | 24 ++++++++++++++++++++++-- src/core/address_space.h | 3 ++- src/core/memory.cpp | 37 +++++++++++++++++++++---------------- src/core/memory.h | 3 ++- 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/src/core/address_space.cpp b/src/core/address_space.cpp index c3e0d77aa..91c8b7a1f 100644 --- a/src/core/address_space.cpp +++ 
b/src/core/address_space.cpp @@ -454,8 +454,28 @@ void* AddressSpace::MapFile(VAddr virtual_addr, size_t size, size_t offset, u32 #endif } -void AddressSpace::Unmap(VAddr virtual_addr, size_t size, bool has_backing) { - return impl->Unmap(virtual_addr, size, has_backing); +void AddressSpace::Unmap(VAddr virtual_addr, size_t size, VAddr start_in_vma, VAddr end_in_vma, + PAddr phys_base, bool is_exec, bool has_backing) { +#ifdef _WIN32 + // There does not appear to be comparable support for partial unmapping on Windows. + // Unfortunately, a least one title was found to require this. The workaround is to unmap + // the entire allocation and remap the portions outside of the requested unmapping range. + impl->Unmap(virtual_addr, size, has_backing); + + // TODO: Determine if any titles require partial unmapping support for flexible allocations. + ASSERT_MSG(has_backing || (start_in_vma == 0 && end_in_vma == size), + "Partial unmapping of flexible allocations is not supported"); + + if (start_in_vma != 0) { + Map(virtual_addr, start_in_vma, 0, phys_base, is_exec); + } + + if (end_in_vma != size) { + Map(virtual_addr + end_in_vma, size - end_in_vma, 0, phys_base + end_in_vma, is_exec); + } +#else + impl->Unmap(virtual_addr + start_in_vma, end_in_vma - start_in_vma, has_backing); +#endif } void AddressSpace::Protect(VAddr virtual_addr, size_t size, MemoryPermission perms) { diff --git a/src/core/address_space.h b/src/core/address_space.h index 29f74f568..53041bccb 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -91,7 +91,8 @@ public: void* MapFile(VAddr virtual_addr, size_t size, size_t offset, u32 prot, uintptr_t fd); /// Unmaps specified virtual memory area. 
- void Unmap(VAddr virtual_addr, size_t size, bool has_backing); + void Unmap(VAddr virtual_addr, size_t size, VAddr start_in_vma, VAddr end_in_vma, + PAddr phys_base, bool is_exec, bool has_backing); void Protect(VAddr virtual_addr, size_t size, MemoryPermission perms); diff --git a/src/core/memory.cpp b/src/core/memory.cpp index dc5ded41d..eed5126c0 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -54,7 +54,7 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, free_addr = alignment > 0 ? Common::AlignUp(free_addr, alignment) : free_addr; // Add the allocated region to the list and commit its pages. - auto& area = CarveDmemArea(free_addr, size); + auto& area = CarveDmemArea(free_addr, size)->second; area.memory_type = memory_type; area.is_free = false; return free_addr; @@ -63,9 +63,8 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, void MemoryManager::Free(PAddr phys_addr, size_t size) { std::scoped_lock lk{mutex}; - const auto dmem_area = FindDmemArea(phys_addr); - ASSERT(dmem_area != dmem_map.end() && dmem_area->second.base == phys_addr && - dmem_area->second.size == size); + auto dmem_area = CarveDmemArea(phys_addr, size); + ASSERT(dmem_area != dmem_map.end() && dmem_area->second.size >= size); // Release any dmem mappings that reference this physical block. std::vector> remove_list; @@ -74,10 +73,11 @@ void MemoryManager::Free(PAddr phys_addr, size_t size) { continue; } if (mapping.phys_base <= phys_addr && phys_addr < mapping.phys_base + mapping.size) { - LOG_INFO(Kernel_Vmm, "Unmaping direct mapping {:#x} with size {:#x}", addr, - mapping.size); + auto vma_segment_start_addr = phys_addr - mapping.phys_base + addr; + LOG_INFO(Kernel_Vmm, "Unmaping direct mapping {:#x} with size {:#x}", + vma_segment_start_addr, size); // Unmaping might erase from vma_map. We can't do it here. 
- remove_list.emplace_back(addr, mapping.size); + remove_list.emplace_back(vma_segment_start_addr, size); } } for (const auto& [addr, size] : remove_list) { @@ -104,8 +104,6 @@ int MemoryManager::Reserve(void** out_addr, VAddr virtual_addr, size_t size, Mem const auto& vma = FindVMA(mapped_addr)->second; // If the VMA is mapped, unmap the region first. if (vma.IsMapped()) { - ASSERT_MSG(vma.base == mapped_addr && vma.size == size, - "Region must match when reserving a mapped region"); UnmapMemory(mapped_addr, size); } const size_t remaining_size = vma.base + vma.size - mapped_addr; @@ -169,6 +167,7 @@ int MemoryManager::MapMemory(void** out_addr, VAddr virtual_addr, size_t size, M new_vma.prot = prot; new_vma.name = name; new_vma.type = type; + new_vma.is_exec = is_exec; if (type == VMAType::Direct) { new_vma.phys_base = phys_addr; @@ -216,10 +215,16 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) { std::scoped_lock lk{mutex}; const auto it = FindVMA(virtual_addr); - ASSERT_MSG(it->second.Contains(virtual_addr, size), + const auto& vma_base = it->second; + ASSERT_MSG(vma_base.Contains(virtual_addr, size), "Existing mapping does not contain requested unmap range"); - const auto type = it->second.type; + const auto vma_base_addr = vma_base.base; + const auto vma_base_size = vma_base.size; + const auto phys_base = vma_base.phys_base; + const bool is_exec = vma_base.is_exec; + const auto start_in_vma = virtual_addr - vma_base_addr; + const auto type = vma_base.type; const bool has_backing = type == VMAType::Direct || type == VMAType::File; if (type == VMAType::Direct) { rasterizer->UnmapMemory(virtual_addr, size); @@ -239,7 +244,8 @@ void MemoryManager::UnmapMemory(VAddr virtual_addr, size_t size) { MergeAdjacent(vma_map, new_it); // Unmap the memory region. 
- impl.Unmap(virtual_addr, size, has_backing); + impl.Unmap(vma_base_addr, vma_base_size, start_in_vma, start_in_vma + size, phys_base, is_exec, + has_backing); TRACK_FREE(virtual_addr, "VMEM"); } @@ -397,13 +403,12 @@ MemoryManager::VMAHandle MemoryManager::CarveVMA(VAddr virtual_addr, size_t size return vma_handle; } -DirectMemoryArea& MemoryManager::CarveDmemArea(PAddr addr, size_t size) { +MemoryManager::DMemHandle MemoryManager::CarveDmemArea(PAddr addr, size_t size) { auto dmem_handle = FindDmemArea(addr); ASSERT_MSG(dmem_handle != dmem_map.end(), "Physical address not in dmem_map"); const DirectMemoryArea& area = dmem_handle->second; - ASSERT_MSG(area.is_free && area.base <= addr, - "Adding an allocation to already allocated region"); + ASSERT_MSG(area.base <= addr, "Adding an allocation to already allocated region"); const PAddr start_in_area = addr - area.base; const PAddr end_in_vma = start_in_area + size; @@ -418,7 +423,7 @@ DirectMemoryArea& MemoryManager::CarveDmemArea(PAddr addr, size_t size) { dmem_handle = Split(dmem_handle, start_in_area); } - return dmem_handle->second; + return dmem_handle; } MemoryManager::VMAHandle MemoryManager::Split(VMAHandle vma_handle, size_t offset_in_vma) { diff --git a/src/core/memory.h b/src/core/memory.h index 6d0a977fc..d58269678 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -84,6 +84,7 @@ struct VirtualMemoryArea { bool disallow_merge = false; std::string name = ""; uintptr_t fd = 0; + bool is_exec = false; bool Contains(VAddr addr, size_t size) const { return addr >= base && (addr + size) <= (base + this->size); @@ -205,7 +206,7 @@ private: VMAHandle CarveVMA(VAddr virtual_addr, size_t size); - DirectMemoryArea& CarveDmemArea(PAddr addr, size_t size); + DMemHandle CarveDmemArea(PAddr addr, size_t size); VMAHandle Split(VMAHandle vma_handle, size_t offset_in_vma); From 5eecd089ab259d2ae6f7d4bca48ebfaa04736089 Mon Sep 17 00:00:00 2001 From: Lizardy <6063922+lzardy@users.noreply.github.com> Date: Tue, 13 
Aug 2024 02:08:03 -0400 Subject: [PATCH 08/11] thread_management.cpp: Various Mandatory Threading Fixes | Resolve #398 (#394) * Handle empty mutex attribute - scePthreadMutexInit did not return default when the mutex attributes were empty, now it does * fix conditional unsafety * Update thread_management.cpp fix deref * accurate heap api - modified HeapAPI to a struct with preset function fields - utilized the full array parameter passed to _sceKernelRtldSetApplicationHeapAPI * fallback to std malloc * clang format * Declare all HeapAPI replacement functions - calloc, realloc, memalign, reallocalign, malloc_stats, malloc_stats_fast, malloc_usable_size - posix_memalign corrected parameters * resolve suggestions - `using` definition replacement for AppHeapAPI - linux uses heap_malloc, windows uses std::malloc --------- Co-authored-by: microsoftv <6063922+microsoftv@users.noreply.github.com> --- .../libraries/kernel/memory_management.cpp | 4 ++-- src/core/libraries/kernel/memory_management.h | 2 +- .../libraries/kernel/thread_management.cpp | 12 ++++++++-- src/core/linker.cpp | 20 ++++++++++++++--- src/core/linker.h | 22 +++++++++++++++---- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/core/libraries/kernel/memory_management.cpp b/src/core/libraries/kernel/memory_management.cpp index 94762c4a0..54c5860f4 100644 --- a/src/core/libraries/kernel/memory_management.cpp +++ b/src/core/libraries/kernel/memory_management.cpp @@ -212,9 +212,9 @@ s32 PS4_SYSV_ABI sceKernelAvailableFlexibleMemorySize(size_t* out_size) { return ORBIS_OK; } -void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func) { +void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func[]) { auto* linker = Common::Singleton::Instance(); - linker->SetHeapApiFunc(func); + linker->SetHeapAPI(func); } int PS4_SYSV_ABI sceKernelGetDirectMemoryType(u64 addr, int* directMemoryTypeOut, diff --git a/src/core/libraries/kernel/memory_management.h 
b/src/core/libraries/kernel/memory_management.h index 6735ead71..378449cc5 100644 --- a/src/core/libraries/kernel/memory_management.h +++ b/src/core/libraries/kernel/memory_management.h @@ -98,7 +98,7 @@ int PS4_SYSV_ABI sceKernelQueryMemoryProtection(void* addr, void** start, void** int PS4_SYSV_ABI sceKernelDirectMemoryQuery(u64 offset, int flags, OrbisQueryInfo* query_info, size_t infoSize); s32 PS4_SYSV_ABI sceKernelAvailableFlexibleMemorySize(size_t* sizeOut); -void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func); +void PS4_SYSV_ABI _sceKernelRtldSetApplicationHeapAPI(void* func[]); int PS4_SYSV_ABI sceKernelGetDirectMemoryType(u64 addr, int* directMemoryTypeOut, void** directMemoryStartOut, void** directMemoryEndOut); diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 85e2d0e66..cdd729da6 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -421,13 +421,21 @@ ScePthreadMutex* createMutex(ScePthreadMutex* addr) { return addr; } -int PS4_SYSV_ABI scePthreadMutexInit(ScePthreadMutex* mutex, const ScePthreadMutexattr* attr, +int PS4_SYSV_ABI scePthreadMutexInit(ScePthreadMutex* mutex, const ScePthreadMutexattr* mutex_attr, const char* name) { + const ScePthreadMutexattr* attr; + if (mutex == nullptr) { return SCE_KERNEL_ERROR_EINVAL; } - if (attr == nullptr) { + if (mutex_attr == nullptr) { attr = g_pthread_cxt->getDefaultMutexattr(); + } else { + if (*mutex_attr == nullptr) { + attr = g_pthread_cxt->getDefaultMutexattr(); + } else { + attr = mutex_attr; + } } *mutex = new PthreadMutexInternal{}; diff --git a/src/core/linker.cpp b/src/core/linker.cpp index e4cbe5739..d4a15825b 100644 --- a/src/core/linker.cpp +++ b/src/core/linker.cpp @@ -305,7 +305,8 @@ void* Linker::TlsGetAddr(u64 module_index, u64 offset) { // Module was just loaded by above code. Allocate TLS block for it. 
Module* module = m_modules[module_index - 1].get(); const u32 init_image_size = module->tls.init_image_size; - u8* dest = reinterpret_cast(heap_api_func(module->tls.image_size)); + // TODO: Determine if Windows will crash from this + u8* dest = reinterpret_cast(heap_api->heap_malloc(module->tls.image_size)); const u8* src = reinterpret_cast(module->tls.image_virtual_addr); std::memcpy(dest, src, init_image_size); std::memset(dest + init_image_size, 0, module->tls.image_size - init_image_size); @@ -335,10 +336,23 @@ void Linker::InitTlsForThread(bool is_primary) { &addr_out, tls_aligned, 3, 0, "SceKernelPrimaryTcbTls"); ASSERT_MSG(ret == 0, "Unable to allocate TLS+TCB for the primary thread"); } else { - if (heap_api_func) { - addr_out = heap_api_func(total_tls_size); + if (heap_api) { +#ifndef WIN32 + addr_out = heap_api->heap_malloc(total_tls_size); } else { addr_out = std::malloc(total_tls_size); +#else + // TODO: Windows tls malloc replacement, refer to rtld_tls_block_malloc + LOG_ERROR(Core_Linker, "TLS user malloc called, using std::malloc"); + addr_out = std::malloc(total_tls_size); + if (!addr_out) { + auto pth_id = pthread_self(); + auto handle = pthread_gethandle(pth_id); + ASSERT_MSG(addr_out, + "Cannot allocate TLS block defined for handle=%x, index=%d size=%d", + handle, pth_id, total_tls_size); + } +#endif } } diff --git a/src/core/linker.h b/src/core/linker.h index aee8c8fd3..ed1fe400c 100644 --- a/src/core/linker.h +++ b/src/core/linker.h @@ -46,7 +46,21 @@ struct EntryParams { const char* argv[3]; }; -using HeapApiFunc = PS4_SYSV_ABI void* (*)(size_t); +struct HeapAPI { + PS4_SYSV_ABI void* (*heap_malloc)(size_t); + PS4_SYSV_ABI void (*heap_free)(void*); + PS4_SYSV_ABI void* (*heap_calloc)(size_t, size_t); + PS4_SYSV_ABI void* (*heap_realloc)(void*, size_t); + PS4_SYSV_ABI void* (*heap_memalign)(size_t, size_t); + PS4_SYSV_ABI int (*heap_posix_memalign)(void**, size_t, size_t); + // NOTE: Fields below may be inaccurate + PS4_SYSV_ABI int 
(*heap_reallocalign)(void); + PS4_SYSV_ABI void (*heap_malloc_stats)(void); + PS4_SYSV_ABI int (*heap_malloc_stats_fast)(void); + PS4_SYSV_ABI size_t (*heap_malloc_usable_size)(void*); +}; + +using AppHeapAPI = HeapAPI*; class Linker { public: @@ -75,8 +89,8 @@ public: } } - void SetHeapApiFunc(void* func) { - heap_api_func = *reinterpret_cast(func); + void SetHeapAPI(void* func[]) { + heap_api = reinterpret_cast(func); } void AdvanceGenerationCounter() noexcept { @@ -104,7 +118,7 @@ private: size_t static_tls_size{}; u32 max_tls_index{}; u32 num_static_modules{}; - HeapApiFunc heap_api_func{}; + AppHeapAPI heap_api{}; std::vector> m_modules; Loader::SymbolsResolver m_hle_symbols{}; }; From dfcfd62d4f76d392b1754ce13e8a11154749a78d Mon Sep 17 00:00:00 2001 From: Vinicius Rangel Date: Tue, 13 Aug 2024 03:12:38 -0300 Subject: [PATCH 09/11] spirv: fix image sample lod/clamp/offset translation (#402) * spirv: fix image sample lod/clamp translation * spirv: fix image sample offsets * fix ImageSample opcodes & offset emission --- .../backend/spirv/emit_spirv_image.cpp | 45 ++++++++++++---- .../backend/spirv/emit_spirv_instructions.h | 6 +-- .../frontend/translate/vector_memory.cpp | 13 +++-- src/shader_recompiler/ir/ir_emitter.cpp | 44 +++++----------- src/shader_recompiler/ir/ir_emitter.h | 22 ++++---- src/shader_recompiler/ir/opcodes.inc | 8 +-- .../ir/passes/resource_tracking_pass.cpp | 51 ++++++++++++++----- 7 files changed, 112 insertions(+), 77 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index e2d2c1ae0..72a603270 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -21,14 +21,19 @@ struct ImageOperands { boost::container::static_vector operands; }; -Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, +Id 
EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias, Id offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); ImageOperands operands; - operands.Add(spv::ImageOperandsMask::Offset, offset); + if (Sirit::ValidId(bias)) { + operands.Add(spv::ImageOperandsMask::Bias, bias); + } + if (Sirit::ValidId(offset)) { + operands.Add(spv::ImageOperandsMask::Offset, offset); + } return ctx.OpImageSampleImplicitLod(ctx.F32[4], sampled_image, coords, operands.mask, operands.operands); } @@ -39,27 +44,49 @@ Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id c const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); - return ctx.OpImageSampleExplicitLod(ctx.F32[4], sampled_image, coords, - spv::ImageOperandsMask::Lod, lod); + ImageOperands operands; + if (Sirit::ValidId(lod)) { + operands.Add(spv::ImageOperandsMask::Lod, lod); + } + if (Sirit::ValidId(offset)) { + operands.Add(spv::ImageOperandsMask::Offset, offset); + } + return ctx.OpImageSampleExplicitLod(ctx.F32[4], sampled_image, coords, operands.mask, + operands.operands); } Id EmitImageSampleDrefImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, const IR::Value& offset) { + Id bias, Id offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); - return ctx.OpImageSampleDrefImplicitLod(ctx.F32[1], sampled_image, 
coords, dref); + ImageOperands operands; + if (Sirit::ValidId(bias)) { + operands.Add(spv::ImageOperandsMask::Bias, bias); + } + if (Sirit::ValidId(offset)) { + operands.Add(spv::ImageOperandsMask::Offset, offset); + } + return ctx.OpImageSampleDrefImplicitLod(ctx.F32[1], sampled_image, coords, dref, operands.mask, + operands.operands); } Id EmitImageSampleDrefExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, Id offset) { + Id lod, Id offset) { const auto& texture = ctx.images[handle & 0xFFFF]; const Id image = ctx.OpLoad(texture.image_type, texture.id); const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); - return ctx.OpImageSampleDrefExplicitLod(ctx.F32[1], sampled_image, coords, dref, - spv::ImageOperandsMask::Lod, ctx.ConstF32(0.f)); + ImageOperands operands; + if (Sirit::ValidId(lod)) { + operands.Add(spv::ImageOperandsMask::Lod, lod); + } + if (Sirit::ValidId(offset)) { + operands.Add(spv::ImageOperandsMask::Offset, offset); + } + return ctx.OpImageSampleDrefExplicitLod(ctx.F32[1], sampled_image, coords, dref, operands.mask, + operands.operands); } Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id offset2) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 0b2020f18..85c6eaac5 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -357,14 +357,14 @@ Id EmitConvertF64U64(EmitContext& ctx, Id value); Id EmitConvertU16U32(EmitContext& ctx, Id value); Id EmitConvertU32U16(EmitContext& ctx, Id value); -Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias_lc, +Id EmitImageSampleImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id bias, 
Id offset); Id EmitImageSampleExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id lod, Id offset); Id EmitImageSampleDrefImplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, const IR::Value& offset); + Id bias, Id offset); Id EmitImageSampleDrefExplicitLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id dref, - Id bias_lc, Id offset); + Id lod, Id offset); Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id offset2); Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id offset2, Id dref); diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 3c6dfbda4..bb202e426 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -135,8 +135,8 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { // Load first address components as denoted in 8.2.4 VGPR Usage Sea Islands Series Instruction // Set Architecture - const IR::Value offset = - flags.test(MimgModifier::Offset) ? ir.GetVectorReg(addr_reg++) : IR::Value{}; + const IR::U32 offset = + flags.test(MimgModifier::Offset) ? ir.GetVectorReg(addr_reg++) : IR::U32{}; const IR::F32 bias = flags.test(MimgModifier::LodBias) ? ir.GetVectorReg(addr_reg++) : IR::F32{}; const IR::F32 dref = @@ -168,18 +168,17 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { // Issue IR instruction, leaving unknown fields blank to patch later. const IR::Value texel = [&]() -> IR::Value { - const IR::F32 lod = flags.test(MimgModifier::Level0) ? 
ir.Imm32(0.f) : IR::F32{}; if (!flags.test(MimgModifier::Pcf)) { if (explicit_lod) { - return ir.ImageSampleExplicitLod(handle, body, lod, offset, info); + return ir.ImageSampleExplicitLod(handle, body, offset, info); } else { - return ir.ImageSampleImplicitLod(handle, body, bias, offset, {}, info); + return ir.ImageSampleImplicitLod(handle, body, bias, offset, info); } } if (explicit_lod) { - return ir.ImageSampleDrefExplicitLod(handle, body, dref, lod, offset, info); + return ir.ImageSampleDrefExplicitLod(handle, body, dref, offset, info); } - return ir.ImageSampleDrefImplicitLod(handle, body, dref, bias, offset, {}, info); + return ir.ImageSampleDrefImplicitLod(handle, body, dref, bias, offset, info); }(); for (u32 i = 0; i < 4; i++) { diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 03404aca0..08b7fbbc0 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -16,18 +16,6 @@ namespace { UNREACHABLE_MSG("Invalid type = {}, functionName = {}, line = {}", u32(type), functionName, lineNumber); } - -Value MakeLodClampPair(IREmitter& ir, const F32& bias_lod, const F32& lod_clamp) { - if (!bias_lod.IsEmpty() && !lod_clamp.IsEmpty()) { - return ir.CompositeConstruct(bias_lod, lod_clamp); - } else if (!bias_lod.IsEmpty()) { - return bias_lod; - } else if (!lod_clamp.IsEmpty()) { - return lod_clamp; - } else { - return Value{}; - } -} } // Anonymous namespace U1 IREmitter::Imm1(bool value) const { @@ -1386,30 +1374,26 @@ Value IREmitter::ImageAtomicExchange(const Value& handle, const Value& coords, c return Inst(Opcode::ImageAtomicExchange32, Flags{info}, handle, coords, value); } -Value IREmitter::ImageSampleImplicitLod(const Value& handle, const Value& coords, const F32& bias, - const Value& offset, const F32& lod_clamp, +Value IREmitter::ImageSampleImplicitLod(const Value& handle, const Value& body, const F32& bias, + const U32& offset, TextureInstInfo info) { + return 
Inst(Opcode::ImageSampleImplicitLod, Flags{info}, handle, body, bias, offset); +} + +Value IREmitter::ImageSampleExplicitLod(const Value& handle, const Value& body, const U32& offset, TextureInstInfo info) { - const Value bias_lc{MakeLodClampPair(*this, bias, lod_clamp)}; - return Inst(Opcode::ImageSampleImplicitLod, Flags{info}, handle, coords, bias_lc, offset); + return Inst(Opcode::ImageSampleExplicitLod, Flags{info}, handle, body, IR::F32{}, offset); } -Value IREmitter::ImageSampleExplicitLod(const Value& handle, const Value& coords, const F32& lod, - const Value& offset, TextureInstInfo info) { - return Inst(Opcode::ImageSampleExplicitLod, Flags{info}, handle, coords, lod, offset); -} - -F32 IREmitter::ImageSampleDrefImplicitLod(const Value& handle, const Value& coords, const F32& dref, - const F32& bias, const Value& offset, - const F32& lod_clamp, TextureInstInfo info) { - const Value bias_lc{MakeLodClampPair(*this, bias, lod_clamp)}; - return Inst(Opcode::ImageSampleDrefImplicitLod, Flags{info}, handle, coords, dref, bias_lc, +F32 IREmitter::ImageSampleDrefImplicitLod(const Value& handle, const Value& body, const F32& dref, + const F32& bias, const U32& offset, + TextureInstInfo info) { + return Inst(Opcode::ImageSampleDrefImplicitLod, Flags{info}, handle, body, dref, bias, offset); } -F32 IREmitter::ImageSampleDrefExplicitLod(const Value& handle, const Value& coords, const F32& dref, - const F32& lod, const Value& offset, - TextureInstInfo info) { - return Inst(Opcode::ImageSampleDrefExplicitLod, Flags{info}, handle, coords, dref, lod, +F32 IREmitter::ImageSampleDrefExplicitLod(const Value& handle, const Value& body, const F32& dref, + const U32& offset, TextureInstInfo info) { + return Inst(Opcode::ImageSampleDrefExplicitLod, Flags{info}, handle, body, dref, IR::F32{}, offset); } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index a65e46136..fda206394 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ 
b/src/shader_recompiler/ir/ir_emitter.h @@ -241,19 +241,21 @@ public: [[nodiscard]] Value ImageAtomicExchange(const Value& handle, const Value& coords, const Value& value, TextureInstInfo info); - [[nodiscard]] Value ImageSampleImplicitLod(const Value& handle, const Value& coords, - const F32& bias, const Value& offset, - const F32& lod_clamp, TextureInstInfo info); - [[nodiscard]] Value ImageSampleExplicitLod(const Value& handle, const Value& coords, - const F32& lod, const Value& offset, + [[nodiscard]] Value ImageSampleImplicitLod(const Value& handle, const Value& body, + const F32& bias, const U32& offset, TextureInstInfo info); - [[nodiscard]] F32 ImageSampleDrefImplicitLod(const Value& handle, const Value& coords, + + [[nodiscard]] Value ImageSampleExplicitLod(const Value& handle, const Value& body, + const U32& offset, TextureInstInfo info); + + [[nodiscard]] F32 ImageSampleDrefImplicitLod(const Value& handle, const Value& body, const F32& dref, const F32& bias, - const Value& offset, const F32& lod_clamp, + const U32& offset, TextureInstInfo info); + + [[nodiscard]] F32 ImageSampleDrefExplicitLod(const Value& handle, const Value& body, + const F32& dref, const U32& offset, TextureInstInfo info); - [[nodiscard]] F32 ImageSampleDrefExplicitLod(const Value& handle, const Value& coords, - const F32& dref, const F32& lod, - const Value& offset, TextureInstInfo info); + [[nodiscard]] Value ImageQueryDimension(const Value& handle, const IR::U32& lod, const IR::U1& skip_mips); [[nodiscard]] Value ImageQueryDimension(const Value& handle, const IR::U32& lod, diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index aa2fd3f82..46918bc39 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -298,10 +298,10 @@ OPCODE(ConvertU16U32, U16, U32, OPCODE(ConvertU32U16, U32, U16, ) // Image operations -OPCODE(ImageSampleImplicitLod, F32x4, Opaque, Opaque, Opaque, Opaque, ) 
-OPCODE(ImageSampleExplicitLod, F32x4, Opaque, Opaque, Opaque, Opaque, ) -OPCODE(ImageSampleDrefImplicitLod, F32, Opaque, Opaque, F32, Opaque, Opaque, ) -OPCODE(ImageSampleDrefExplicitLod, F32, Opaque, Opaque, F32, Opaque, Opaque, ) +OPCODE(ImageSampleImplicitLod, F32x4, Opaque, Opaque, F32, U32, ) +OPCODE(ImageSampleExplicitLod, F32x4, Opaque, Opaque, U32, U32, ) +OPCODE(ImageSampleDrefImplicitLod, F32, Opaque, Opaque, Opaque, F32, U32, ) +OPCODE(ImageSampleDrefExplicitLod, F32, Opaque, Opaque, Opaque, U32, U32, ) OPCODE(ImageGather, F32x4, Opaque, Opaque, Opaque, Opaque, ) OPCODE(ImageGatherDref, F32x4, Opaque, Opaque, Opaque, Opaque, F32, ) OPCODE(ImageFetch, F32x4, Opaque, Opaque, Opaque, U32, Opaque, ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 6c96faa3d..bacbac72a 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -567,25 +567,47 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip if (inst_info.has_offset) { // The offsets are six-bit signed integers: X=[5:0], Y=[13:8], and Z=[21:16]. - const bool is_gather = inst.GetOpcode() == IR::Opcode::ImageGather || - inst.GetOpcode() == IR::Opcode::ImageGatherDref; - const u32 arg_pos = is_gather ? 2 : (inst_info.is_depth ? 4 : 3); + const u32 arg_pos = [&]() -> u32 { + switch (inst.GetOpcode()) { + case IR::Opcode::ImageGather: + case IR::Opcode::ImageGatherDref: + return 2; + case IR::Opcode::ImageSampleExplicitLod: + case IR::Opcode::ImageSampleImplicitLod: + return 3; + case IR::Opcode::ImageSampleDrefExplicitLod: + case IR::Opcode::ImageSampleDrefImplicitLod: + return 4; + default: + break; + } + return inst_info.is_depth ? 
4 : 3; + }(); const IR::Value arg = inst.Arg(arg_pos); ASSERT_MSG(arg.Type() == IR::Type::U32, "Unexpected offset type"); - const auto sign_ext = [&](u32 value) { return ir.Imm32(s32(value << 24) >> 24); }; - union { - u32 raw; - BitField<0, 6, u32> x; - BitField<8, 6, u32> y; - BitField<16, 6, u32> z; - } offset{arg.U32()}; - const IR::Value value = ir.CompositeConstruct(sign_ext(offset.x), sign_ext(offset.y)); + const auto f = [&](IR::Value value, u32 offset) -> auto { + return ir.BitFieldExtract(IR::U32{arg}, ir.Imm32(offset), ir.Imm32(6), true); + }; + + const auto x = f(arg, 0); + const auto y = f(arg, 8); + const auto z = f(arg, 16); + const IR::Value value = ir.CompositeConstruct(x, y, z); inst.SetArg(arg_pos, value); } if (inst_info.has_lod_clamp) { - // Final argument contains lod_clamp - const u32 arg_pos = inst_info.is_depth ? 5 : 4; + const u32 arg_pos = [&]() -> u32 { + switch (inst.GetOpcode()) { + case IR::Opcode::ImageSampleImplicitLod: + return 2; + case IR::Opcode::ImageSampleDrefImplicitLod: + return 3; + default: + break; + } + return inst_info.is_depth ? 5 : 4; + }(); inst.SetArg(arg_pos, arg); } if (inst_info.explicit_lod) { @@ -593,7 +615,8 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip inst.GetOpcode() == IR::Opcode::ImageSampleExplicitLod || inst.GetOpcode() == IR::Opcode::ImageSampleDrefExplicitLod); const u32 pos = inst.GetOpcode() == IR::Opcode::ImageSampleExplicitLod ? 2 : 3; - inst.SetArg(pos, arg); + const IR::Value value = inst_info.force_level0 ? 
ir.Imm32(0.f) : arg; + inst.SetArg(pos, value); } } From 1fb0da9b897b9a2b9d31a42898b660ef7d01a848 Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Tue, 13 Aug 2024 09:21:48 +0300 Subject: [PATCH 10/11] video_core: Crucial buffer cache fixes + proper GPU clears (#414) * translator: Use templates for stronger type guarantees * spirv: Define buffer offsets upfront * Saves a lot of shader instructions * buffer_cache: Use dynamic vertex input when available * Fixes issues when games like dark souls rebind vertex buffers with different stride * externals: Update boost * spirv: Use runtime array for ssbos * ssbos can be large and typically their size will vary, especially in generic copy/clear cs shaders * fs: Lock when doing case insensitive search * Dark Souls does fs lookups from different threads * texture_cache: More precise invalidation from compute * Fixes unrelated render targets being cleared * texture_cache: Use hashes for protect gpu modified images from reupload * translator: Treat V_CNDMASK as float * Sometimes it can have input modifiers. Worst this will cause is some extra calls to uintBitsToFloat and opposite. 
But most often this is used as float anyway * translator: Small optimization for V_SAD_U32 * Fix review * clang format --- externals/ext-boost | 2 +- src/core/file_sys/fs.cpp | 1 + .../libraries/kernel/threads/semaphore.cpp | 3 - .../spirv/emit_spirv_context_get_set.cpp | 15 +- .../backend/spirv/emit_spirv_special.cpp | 4 +- .../backend/spirv/spirv_emit_context.cpp | 26 +- .../backend/spirv/spirv_emit_context.h | 5 +- .../frontend/translate/translate.cpp | 329 ++++++++---------- .../frontend/translate/translate.h | 8 +- .../frontend/translate/vector_alu.cpp | 147 ++++---- src/video_core/buffer_cache/buffer_cache.cpp | 41 +++ src/video_core/buffer_cache/buffer_cache.h | 6 + .../renderer_vulkan/renderer_vulkan.h | 4 +- .../renderer_vulkan/vk_compute_pipeline.cpp | 5 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 3 + .../renderer_vulkan/vk_instance.cpp | 9 +- src/video_core/renderer_vulkan/vk_instance.h | 6 + .../renderer_vulkan/vk_pipeline_cache.cpp | 4 + src/video_core/texture_cache/image.cpp | 1 + src/video_core/texture_cache/image.h | 1 + .../texture_cache/texture_cache.cpp | 82 +++-- src/video_core/texture_cache/texture_cache.h | 15 +- src/video_core/texture_cache/tile_manager.cpp | 1 - 23 files changed, 372 insertions(+), 346 deletions(-) diff --git a/externals/ext-boost b/externals/ext-boost index 147b2de77..a04136add 160000 --- a/externals/ext-boost +++ b/externals/ext-boost @@ -1 +1 @@ -Subproject commit 147b2de7734f5dc3b9aeb1f4135ae15fcd44b9d7 +Subproject commit a04136add1e469f46d8ae8d3e8307779240a5c53 diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp index 2bcff191c..a6d5c3eac 100644 --- a/src/core/file_sys/fs.cpp +++ b/src/core/file_sys/fs.cpp @@ -54,6 +54,7 @@ std::filesystem::path MntPoints::GetHostPath(const std::string& guest_directory) // If the path does not exist attempt to verify this. // Retrieve parent path until we find one that exists. 
+ std::scoped_lock lk{m_mutex}; path_parts.clear(); auto current_path = host_path; while (!std::filesystem::exists(current_path)) { diff --git a/src/core/libraries/kernel/threads/semaphore.cpp b/src/core/libraries/kernel/threads/semaphore.cpp index 5441c641a..5304dc57b 100644 --- a/src/core/libraries/kernel/threads/semaphore.cpp +++ b/src/core/libraries/kernel/threads/semaphore.cpp @@ -9,7 +9,6 @@ #include "common/assert.h" #include "common/logging/log.h" #include "core/libraries/error_codes.h" -#include "core/libraries/kernel/thread_management.h" #include "core/libraries/libs.h" namespace Libraries::Kernel { @@ -82,7 +81,6 @@ public: public: struct WaitingThread : public ListBaseHook { - std::string name; std::condition_variable cv; u32 priority; s32 need_count; @@ -90,7 +88,6 @@ public: bool was_cancled{}; explicit WaitingThread(s32 need_count, bool is_fifo) : need_count{need_count} { - name = scePthreadSelf()->name; if (is_fifo) { return; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index bd34ed3db..5eae058ac 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -128,11 +128,7 @@ Id EmitReadConst(EmitContext& ctx) { Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } - const Id offset_dwords{ctx.OpShiftRightLogical(ctx.U32[1], buffer.offset, ctx.ConstU32(2U))}; - index = ctx.OpIAdd(ctx.U32[1], index, offset_dwords); + index = ctx.OpIAdd(ctx.U32[1], index, buffer.offset_dwords); const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; return ctx.OpLoad(buffer.data_types->Get(1), ptr); } @@ -229,9 +225,6 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, 
Id address) { template static Id EmitLoadBufferF32xN(EmitContext& ctx, u32 handle, Id address) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { @@ -404,9 +397,6 @@ static Id GetBufferFormatValue(EmitContext& ctx, u32 handle, Id address, u32 com template static Id EmitLoadBufferFormatF32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); if constexpr (N == 1) { return GetBufferFormatValue(ctx, handle, address, 0); @@ -438,9 +428,6 @@ Id EmitLoadBufferFormatF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id ad template static void EmitStoreBufferF32xN(EmitContext& ctx, u32 handle, Id address, Id value) { auto& buffer = ctx.buffers[handle]; - if (!Sirit::ValidId(buffer.offset)) { - buffer.offset = ctx.GetBufferOffset(buffer.global_binding); - } address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); if constexpr (N == 1) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp index 891e41df7..3ed89692b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_special.cpp @@ -6,7 +6,9 @@ namespace Shader::Backend::SPIRV { -void EmitPrologue(EmitContext& ctx) {} +void EmitPrologue(EmitContext& ctx) { + ctx.DefineBufferOffsets(); +} void EmitEpilogue(EmitContext& ctx) {} diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp 
b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 61b55437d..55754d455 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -165,14 +165,18 @@ EmitContext::SpirvAttribute EmitContext::GetAttributeInfo(AmdGpu::NumberFormat f throw InvalidArgument("Invalid attribute type {}", fmt); } -Id EmitContext::GetBufferOffset(u32 binding) { - const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); - const u32 comp = (binding & 0xf) >> 2; - const u32 offset = (binding & 0x3) << 3; - const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), - push_data_block, ConstU32(half), ConstU32(comp))}; - const Id value{OpLoad(U32[1], ptr)}; - return OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); +void EmitContext::DefineBufferOffsets() { + for (auto& buffer : buffers) { + const u32 binding = buffer.binding; + const u32 half = Shader::PushData::BufOffsetIndex + (binding >> 4); + const u32 comp = (binding & 0xf) >> 2; + const u32 offset = (binding & 0x3) << 3; + const Id ptr{OpAccessChain(TypePointer(spv::StorageClass::PushConstant, U32[1]), + push_data_block, ConstU32(half), ConstU32(comp))}; + const Id value{OpLoad(U32[1], ptr)}; + buffer.offset = OpBitFieldUExtract(U32[1], value, ConstU32(offset), ConstU32(8U)); + buffer.offset_dwords = OpShiftRightLogical(U32[1], buffer.offset, ConstU32(2U)); + } } Id MakeDefaultValue(EmitContext& ctx, u32 default_value) { @@ -327,7 +331,9 @@ void EmitContext::DefineBuffers() { for (u32 i = 0; const auto& buffer : info.buffers) { const auto* data_types = True(buffer.used_types & IR::Type::F32) ? &F32 : &U32; const Id data_type = (*data_types)[1]; - const Id record_array_type{TypeArray(data_type, ConstU32(buffer.length))}; + const Id record_array_type{buffer.is_storage + ? 
TypeRuntimeArray(data_type) + : TypeArray(data_type, ConstU32(buffer.length))}; const Id struct_type{TypeStruct(record_array_type)}; if (std::ranges::find(type_ids, record_array_type.value, &Id::value) == type_ids.end()) { Decorate(record_array_type, spv::Decoration::ArrayStride, 4); @@ -354,7 +360,7 @@ void EmitContext::DefineBuffers() { buffers.push_back({ .id = id, - .global_binding = binding++, + .binding = binding++, .data_types = data_types, .pointer_type = pointer_type, .buffer = buffer.GetVsharp(info), diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 0d090eb31..81237a9a3 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -40,7 +40,7 @@ public: ~EmitContext(); Id Def(const IR::Value& value); - Id GetBufferOffset(u32 binding); + void DefineBufferOffsets(); [[nodiscard]] Id DefineInput(Id type, u32 location) { const Id input_id{DefineVar(type, spv::StorageClass::Input)}; @@ -203,7 +203,8 @@ public: struct BufferDefinition { Id id; Id offset; - u32 global_binding; + Id offset_dwords; + u32 binding; const VectorIds* data_types; Id pointer_type; AmdGpu::Buffer buffer; diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index b295c1bef..8ffde7fb3 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -73,101 +73,190 @@ void Translator::EmitPrologue() { } } -template <> -IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - IR::U32F32 value{}; +template +T Translator::GetSrc(const InstOperand& operand) { + constexpr bool is_float = std::is_same_v; - const bool is_float = operand.type == ScalarType::Float32 || force_flt; + const auto get_imm = [&](auto value) -> T { + if constexpr (is_float) { + return 
ir.Imm32(std::bit_cast(value)); + } else { + return ir.Imm32(std::bit_cast(value)); + } + }; + + T value{}; switch (operand.field) { case OperandField::ScalarGPR: - if (is_float) { - value = ir.GetScalarReg(IR::ScalarReg(operand.code)); - } else { - value = ir.GetScalarReg(IR::ScalarReg(operand.code)); - } + value = ir.GetScalarReg(IR::ScalarReg(operand.code)); break; case OperandField::VectorGPR: - if (is_float) { - value = ir.GetVectorReg(IR::VectorReg(operand.code)); - } else { - value = ir.GetVectorReg(IR::VectorReg(operand.code)); - } + value = ir.GetVectorReg(IR::VectorReg(operand.code)); break; case OperandField::ConstZero: - if (is_float) { - value = ir.Imm32(0.f); - } else { - value = ir.Imm32(0U); - } + value = get_imm(0U); break; case OperandField::SignedConstIntPos: - ASSERT(!force_flt); - value = ir.Imm32(operand.code - SignedConstIntPosMin + 1); + value = get_imm(operand.code - SignedConstIntPosMin + 1); break; case OperandField::SignedConstIntNeg: - ASSERT(!force_flt); - value = ir.Imm32(-s32(operand.code) + SignedConstIntNegMin - 1); + value = get_imm(-s32(operand.code) + SignedConstIntNegMin - 1); break; case OperandField::LiteralConst: - if (is_float) { - value = ir.Imm32(std::bit_cast(operand.code)); - } else { - value = ir.Imm32(operand.code); - } + value = get_imm(operand.code); break; case OperandField::ConstFloatPos_1_0: - if (is_float) { - value = ir.Imm32(1.f); - } else { - value = ir.Imm32(std::bit_cast(1.f)); - } + value = get_imm(1.f); break; case OperandField::ConstFloatPos_0_5: - value = ir.Imm32(0.5f); + value = get_imm(0.5f); break; case OperandField::ConstFloatPos_2_0: - value = ir.Imm32(2.0f); + value = get_imm(2.0f); break; case OperandField::ConstFloatPos_4_0: - value = ir.Imm32(4.0f); + value = get_imm(4.0f); break; case OperandField::ConstFloatNeg_0_5: - value = ir.Imm32(-0.5f); + value = get_imm(-0.5f); break; case OperandField::ConstFloatNeg_1_0: - if (is_float) { - value = ir.Imm32(-1.0f); - } else { - value = 
ir.Imm32(std::bit_cast(-1.0f)); - } + value = get_imm(-1.0f); break; case OperandField::ConstFloatNeg_2_0: - value = ir.Imm32(-2.0f); + value = get_imm(-2.0f); break; case OperandField::ConstFloatNeg_4_0: - value = ir.Imm32(-4.0f); + value = get_imm(-4.0f); break; case OperandField::VccLo: - if (force_flt) { + if constexpr (is_float) { value = ir.BitCast(ir.GetVccLo()); } else { value = ir.GetVccLo(); } break; case OperandField::VccHi: - if (force_flt) { + if constexpr (is_float) { value = ir.BitCast(ir.GetVccHi()); } else { value = ir.GetVccHi(); } break; case OperandField::M0: - return m0_value; + if constexpr (is_float) { + UNREACHABLE(); + } else { + return m0_value; + } default: UNREACHABLE(); } - if (is_float) { + if constexpr (is_float) { + if (operand.input_modifier.abs) { + value = ir.FPAbs(value); + } + if (operand.input_modifier.neg) { + value = ir.FPNeg(value); + } + } else { + if (operand.input_modifier.abs) { + UNREACHABLE(); + } + if (operand.input_modifier.neg) { + UNREACHABLE(); + } + } + return value; +} + +template IR::U32 Translator::GetSrc(const InstOperand&); +template IR::F32 Translator::GetSrc(const InstOperand&); + +template +T Translator::GetSrc64(const InstOperand& operand) { + constexpr bool is_float = std::is_same_v; + + const auto get_imm = [&](auto value) -> T { + if constexpr (is_float) { + return ir.Imm64(std::bit_cast(value)); + } else { + return ir.Imm64(std::bit_cast(value)); + } + }; + + T value{}; + switch (operand.field) { + case OperandField::ScalarGPR: { + const auto value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); + const auto value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); + if constexpr (is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(value_lo, value_hi)); + } + break; + } + case OperandField::VectorGPR: { + const auto value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); + const auto value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); + if constexpr 
(is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(value_lo, value_hi)); + } + break; + } + case OperandField::ConstZero: + value = get_imm(0ULL); + break; + case OperandField::SignedConstIntPos: + value = get_imm(s64(operand.code) - SignedConstIntPosMin + 1); + break; + case OperandField::SignedConstIntNeg: + value = get_imm(-s64(operand.code) + SignedConstIntNegMin - 1); + break; + case OperandField::LiteralConst: + value = get_imm(u64(operand.code)); + break; + case OperandField::ConstFloatPos_1_0: + value = get_imm(1.0); + break; + case OperandField::ConstFloatPos_0_5: + value = get_imm(0.5); + break; + case OperandField::ConstFloatPos_2_0: + value = get_imm(2.0); + break; + case OperandField::ConstFloatPos_4_0: + value = get_imm(4.0); + break; + case OperandField::ConstFloatNeg_0_5: + value = get_imm(-0.5); + break; + case OperandField::ConstFloatNeg_1_0: + value = get_imm(-1.0); + break; + case OperandField::ConstFloatNeg_2_0: + value = get_imm(-2.0); + break; + case OperandField::ConstFloatNeg_4_0: + value = get_imm(-4.0); + break; + case OperandField::VccLo: + if constexpr (is_float) { + UNREACHABLE(); + } else { + value = ir.PackUint2x32(ir.CompositeConstruct(ir.GetVccLo(), ir.GetVccHi())); + } + break; + case OperandField::VccHi: + default: + UNREACHABLE(); + } + + if constexpr (is_float) { if (operand.input_modifier.abs) { value = ir.FPAbs(value); } @@ -178,148 +267,8 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { return value; } -template <> -IR::U32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - return GetSrc(operand, force_flt); -} - -template <> -IR::F32 Translator::GetSrc(const InstOperand& operand, bool) { - return GetSrc(operand, true); -} - -template <> -IR::U64F64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { - IR::Value value_hi{}; - IR::Value value_lo{}; - - bool immediate = false; - const bool is_float = operand.type == 
ScalarType::Float64 || force_flt; - switch (operand.field) { - case OperandField::ScalarGPR: - if (is_float) { - value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); - value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); - } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { - value_lo = ir.GetScalarReg(IR::ScalarReg(operand.code)); - value_hi = ir.GetScalarReg(IR::ScalarReg(operand.code + 1)); - } else { - UNREACHABLE(); - } - break; - case OperandField::VectorGPR: - if (is_float) { - value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); - value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); - } else if (operand.type == ScalarType::Uint64 || operand.type == ScalarType::Sint64) { - value_lo = ir.GetVectorReg(IR::VectorReg(operand.code)); - value_hi = ir.GetVectorReg(IR::VectorReg(operand.code + 1)); - } else { - UNREACHABLE(); - } - break; - case OperandField::ConstZero: - immediate = true; - if (force_flt) { - value_lo = ir.Imm64(0.0); - } else { - value_lo = ir.Imm64(u64(0U)); - } - break; - case OperandField::SignedConstIntPos: - ASSERT(!force_flt); - immediate = true; - value_lo = ir.Imm64(s64(operand.code) - SignedConstIntPosMin + 1); - break; - case OperandField::SignedConstIntNeg: - ASSERT(!force_flt); - immediate = true; - value_lo = ir.Imm64(-s64(operand.code) + SignedConstIntNegMin - 1); - break; - case OperandField::LiteralConst: - immediate = true; - if (force_flt) { - UNREACHABLE(); // There is a literal double? 
- } else { - value_lo = ir.Imm64(u64(operand.code)); - } - break; - case OperandField::ConstFloatPos_1_0: - immediate = true; - if (force_flt) { - value_lo = ir.Imm64(1.0); - } else { - value_lo = ir.Imm64(std::bit_cast(f64(1.0))); - } - break; - case OperandField::ConstFloatPos_0_5: - immediate = true; - value_lo = ir.Imm64(0.5); - break; - case OperandField::ConstFloatPos_2_0: - immediate = true; - value_lo = ir.Imm64(2.0); - break; - case OperandField::ConstFloatPos_4_0: - immediate = true; - value_lo = ir.Imm64(4.0); - break; - case OperandField::ConstFloatNeg_0_5: - immediate = true; - value_lo = ir.Imm64(-0.5); - break; - case OperandField::ConstFloatNeg_1_0: - immediate = true; - value_lo = ir.Imm64(-1.0); - break; - case OperandField::ConstFloatNeg_2_0: - immediate = true; - value_lo = ir.Imm64(-2.0); - break; - case OperandField::ConstFloatNeg_4_0: - immediate = true; - value_lo = ir.Imm64(-4.0); - break; - case OperandField::VccLo: { - value_lo = ir.GetVccLo(); - value_hi = ir.GetVccHi(); - } break; - case OperandField::VccHi: - UNREACHABLE(); - default: - UNREACHABLE(); - } - - IR::Value value; - - if (immediate) { - value = value_lo; - } else if (is_float) { - throw NotImplementedException("required OpPackDouble2x32 implementation"); - } else { - IR::Value packed = ir.CompositeConstruct(value_lo, value_hi); - value = ir.PackUint2x32(packed); - } - - if (is_float) { - if (operand.input_modifier.abs) { - value = ir.FPAbs(IR::F32F64(value)); - } - if (operand.input_modifier.neg) { - value = ir.FPNeg(IR::F32F64(value)); - } - } - return IR::U64F64(value); -} - -template <> -IR::U64 Translator::GetSrc64(const InstOperand& operand, bool force_flt) { - return GetSrc64(operand, force_flt); -} -template <> -IR::F64 Translator::GetSrc64(const InstOperand& operand, bool) { - return GetSrc64(operand, true); -} +template IR::U64 Translator::GetSrc64(const InstOperand&); +template IR::F64 Translator::GetSrc64(const InstOperand&); void Translator::SetDst(const 
InstOperand& operand, const IR::U32F32& value) { IR::U32F32 result = value; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index fe4457d27..2e12209dc 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -211,10 +211,10 @@ public: void IMAGE_ATOMIC(AtomicOp op, const GcnInst& inst); private: - template - [[nodiscard]] T GetSrc(const InstOperand& operand, bool flt_zero = false); - template - [[nodiscard]] T GetSrc64(const InstOperand& operand, bool flt_zero = false); + template + [[nodiscard]] T GetSrc(const InstOperand& operand); + template + [[nodiscard]] T GetSrc64(const InstOperand& operand); void SetDst(const InstOperand& operand, const IR::U32F32& value); void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 89428c44f..1bbc3c162 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "shader_recompiler/frontend/translate/translate.h" -#include "shader_recompiler/profile.h" namespace Shader::Gcn { @@ -312,7 +311,7 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { } void Translator::V_MOV(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[0])); + SetDst(inst.dst[0], GetSrc(inst.src[0])); } void Translator::V_SAD(const GcnInst& inst) { @@ -321,14 +320,14 @@ void Translator::V_SAD(const GcnInst& inst) { } void Translator::V_MAC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], ir.FPFma(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true), - GetSrc(inst.dst[0], true))); + SetDst(inst.dst[0], ir.FPFma(GetSrc(inst.src[0]), GetSrc(inst.src[1]), + GetSrc(inst.dst[0]))); } void Translator::V_CVT_PKRTZ_F16_F32(const 
GcnInst& inst) { const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::Value vec_f32 = - ir.CompositeConstruct(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true)); + ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); ir.SetVectorReg(dst_reg, ir.PackHalf2x16(vec_f32)); } @@ -339,13 +338,13 @@ void Translator::V_CVT_F32_F16(const GcnInst& inst) { } void Translator::V_CVT_F16_F32(const GcnInst& inst) { - const IR::F32 src0 = GetSrc(inst.src[0], true); + const IR::F32 src0 = GetSrc(inst.src[0]); const IR::F16 src0fp16 = ir.FPConvert(16, src0); SetDst(inst.dst[0], ir.UConvert(32, ir.BitCast(src0fp16))); } void Translator::V_MUL_F32(const GcnInst& inst) { - SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true))); + SetDst(inst.dst[0], ir.FPMul(GetSrc(inst.src[0]), GetSrc(inst.src[1]))); } void Translator::V_CNDMASK_B32(const GcnInst& inst) { @@ -354,24 +353,8 @@ void Translator::V_CNDMASK_B32(const GcnInst& inst) { const IR::U1 flag = inst.src[2].field == OperandField::ScalarGPR ? ir.GetThreadBitScalarReg(flag_reg) : ir.GetVcc(); - - // We can treat the instruction as integer most of the time, but when a source is - // a floating point constant we will force the other as float for better readability - // The other operand is also higly likely to be float as well. 
- const auto is_float_const = [](OperandField field) { - return field >= OperandField::ConstFloatPos_0_5 && field <= OperandField::ConstFloatNeg_4_0; - }; - const bool has_flt_source = - is_float_const(inst.src[0].field) || is_float_const(inst.src[1].field); - IR::U32F32 src0 = GetSrc(inst.src[0], has_flt_source); - IR::U32F32 src1 = GetSrc(inst.src[1], has_flt_source); - if (src0.Type() == IR::Type::F32 && src1.Type() == IR::Type::U32) { - src1 = ir.BitCast(src1); - } - if (src1.Type() == IR::Type::F32 && src0.Type() == IR::Type::U32) { - src0 = ir.BitCast(src0); - } - const IR::Value result = ir.Select(flag, src1, src0); + const IR::Value result = + ir.Select(flag, GetSrc(inst.src[1]), GetSrc(inst.src[0])); ir.SetVectorReg(dst_reg, IR::U32F32{result}); } @@ -448,21 +431,21 @@ void Translator::V_CVT_F32_U32(const GcnInst& inst) { } void Translator::V_MAD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_FRACT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.Fract(src0)); } void Translator::V_ADD_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPAdd(src0, src1)); } @@ -476,9 +459,9 @@ void Translator::V_CVT_OFF_F32_I4(const GcnInst& inst) { } void Translator::V_MED3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 
src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; const IR::F32 mmx = ir.FPMin(ir.FPMax(src0, src1), src2); SetDst(inst.dst[0], ir.FPMax(ir.FPMin(src0, src1), mmx)); } @@ -492,32 +475,32 @@ void Translator::V_MED3_I32(const GcnInst& inst) { } void Translator::V_FLOOR_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::VectorReg dst_reg{inst.dst[0].code}; ir.SetVectorReg(dst_reg, ir.FPFloor(src0)); } void Translator::V_SUB_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPSub(src0, src1)); } void Translator::V_RCP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPRecip(src0)); } void Translator::V_FMA_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; const IR::U1 result = [&] { switch (op) { case ConditionOp::F: @@ -557,8 +540,8 @@ void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { } void Translator::V_MAX_F32(const GcnInst& inst, bool is_legacy) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 
src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPMax(src0, src1, is_legacy)); } @@ -569,40 +552,40 @@ void Translator::V_MAX_U32(bool is_signed, const GcnInst& inst) { } void Translator::V_RSQ_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPRecipSqrt(src0)); } void Translator::V_SIN_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPSin(src0)); } void Translator::V_LOG_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPLog2(src0)); } void Translator::V_EXP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPExp2(src0)); } void Translator::V_SQRT_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPSqrt(src0)); } void Translator::V_MIN_F32(const GcnInst& inst, bool is_legacy) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPMin(src0, src1, is_legacy)); } void Translator::V_MIN3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2))); } @@ -614,9 +597,9 @@ void Translator::V_MIN3_I32(const GcnInst& inst) { } void Translator::V_MADMK_F32(const GcnInst& inst) { 
- const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 k{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 k{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPFma(src0, k, src1)); } @@ -625,25 +608,25 @@ void Translator::V_CUBEMA_F32(const GcnInst& inst) { } void Translator::V_CUBESC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[0], true)); + SetDst(inst.dst[0], GetSrc(inst.src[0])); } void Translator::V_CUBETC_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[1], true)); + SetDst(inst.dst[0], GetSrc(inst.src[1])); } void Translator::V_CUBEID_F32(const GcnInst& inst) { - SetDst(inst.dst[0], GetSrc(inst.src[2], true)); + SetDst(inst.dst[0], GetSrc(inst.src[2])); } void Translator::V_CVT_U32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToU(32, src0)); } void Translator::V_SUBREV_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], ir.FPSub(src1, src0)); } @@ -727,9 +710,17 @@ void Translator::V_SAD_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 src2{GetSrc(inst.src[2])}; - const IR::U32 max{ir.IMax(src0, src1, false)}; - const IR::U32 min{ir.IMin(src0, src1, false)}; - SetDst(inst.dst[0], ir.IAdd(ir.ISub(max, min), src2)); + IR::U32 result; + if (src0.IsImmediate() && src0.U32() == 0U) { + result = src1; + } else if (src1.IsImmediate() && src1.U32() == 0U) { + result = src0; + } else { + const IR::U32 max{ir.IMax(src0, src1, false)}; + const IR::U32 min{ir.IMin(src0, src1, false)}; + result = ir.ISub(max, min); + } + SetDst(inst.dst[0], 
ir.IAdd(result, src2)); } void Translator::V_BFE_U32(bool is_signed, const GcnInst& inst) { @@ -783,7 +774,7 @@ void Translator::V_MAD_U32_U24(const GcnInst& inst) { } void Translator::V_RNDNE_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPRoundEven(src0)); } @@ -794,14 +785,14 @@ void Translator::V_BCNT_U32_B32(const GcnInst& inst) { } void Translator::V_COS_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPCos(src0)); } void Translator::V_MAX3_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; - const IR::F32 src1{GetSrc(inst.src[1], true)}; - const IR::F32 src2{GetSrc(inst.src[2], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; + const IR::F32 src1{GetSrc(inst.src[1])}; + const IR::F32 src2{GetSrc(inst.src[2])}; SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2))); } @@ -813,7 +804,7 @@ void Translator::V_MAX3_U32(const GcnInst& inst) { } void Translator::V_CVT_I32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToS(32, src0)); } @@ -830,12 +821,12 @@ void Translator::V_MUL_LO_U32(const GcnInst& inst) { } void Translator::V_TRUNC_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPTrunc(src0)); } void Translator::V_CEIL_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.FPCeil(src0)); } @@ -899,18 +890,18 @@ void Translator::V_BFREV_B32(const GcnInst& inst) { } void Translator::V_LDEXP_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; SetDst(inst.dst[0], 
ir.FPLdexp(src0, src1)); } void Translator::V_CVT_FLR_I32_F32(const GcnInst& inst) { - const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src0{GetSrc(inst.src[0])}; SetDst(inst.dst[0], ir.ConvertFToI(32, true, ir.FPFloor(src0))); } void Translator::V_CMP_CLASS_F32(const GcnInst& inst) { - const IR::F32F64 src0{GetSrc(inst.src[0])}; + const IR::F32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; IR::U1 value; if (src1.IsImmediate()) { diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 7ab0d8171..2246807ad 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -87,6 +87,15 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si } bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { + boost::container::small_vector attributes; + boost::container::small_vector bindings; + SCOPE_EXIT { + if (instance.IsVertexInputDynamicState()) { + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.setVertexInputEXT(bindings, attributes); + } + }; + if (vs_info.vs_inputs.empty()) { return false; } @@ -122,6 +131,21 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { } guest_buffers.emplace_back(buffer); ranges.emplace_back(buffer.base_address, buffer.base_address + buffer.GetSize()); + attributes.push_back({ + .location = input.binding, + .binding = input.binding, + .format = + Vulkan::LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()), + .offset = 0, + }); + bindings.push_back({ + .binding = input.binding, + .stride = buffer.GetStride(), + .inputRate = input.instance_step_rate == Shader::Info::VsInput::None + ? 
vk::VertexInputRate::eVertex + : vk::VertexInputRate::eInstance, + .divisor = 1, + }); } std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { @@ -224,6 +248,19 @@ std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b return {&buffer, buffer.Offset(device_addr)}; } +std::pair BufferCache::ObtainTempBuffer(VAddr gpu_addr, u32 size) { + const u64 page = gpu_addr >> CACHING_PAGEBITS; + const BufferId buffer_id = page_table[page]; + if (buffer_id) { + const Buffer& buffer = slot_buffers[buffer_id]; + if (buffer.IsInBounds(gpu_addr, size)) { + return {&buffer, buffer.Offset(gpu_addr)}; + } + } + const u32 offset = staging_buffer.Copy(gpu_addr, size, 16); + return {&staging_buffer, offset}; +} + bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); @@ -248,6 +285,10 @@ bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) { return memory_tracker.IsRegionCpuModified(addr, size); } +bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) { + return memory_tracker.IsRegionGpuModified(addr, size); +} + BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) { if (device_addr == 0) { return NULL_BUFFER_ID; diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 0dee87cf5..33ea3f86b 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -69,12 +69,18 @@ public: /// Obtains a buffer for the specified region. [[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written); + /// Obtains a temporary buffer for usage in texture cache. 
+ [[nodiscard]] std::pair ObtainTempBuffer(VAddr gpu_addr, u32 size); + /// Return true when a region is registered on the cache [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); /// Return true when a CPU region is modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + private: template void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) { diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 8178c88de..113b380eb 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -47,7 +47,7 @@ public: Frame* PrepareFrame(const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address, bool is_eop) { const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, false); + const auto image_id = texture_cache.FindImage(info); auto& image = texture_cache.GetImage(image_id); return PrepareFrameInternal(image, is_eop); } @@ -61,7 +61,7 @@ public: const Libraries::VideoOut::BufferAttributeGroup& attribute, VAddr cpu_address) { vo_buffers_addr.emplace_back(cpu_address); const auto info = VideoCore::ImageInfo{attribute, cpu_address}; - const auto image_id = texture_cache.FindImage(info, false); + const auto image_id = texture_cache.FindImage(info); return texture_cache.GetImage(image_id); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 21710a76a..62b50eeb1 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -96,7 +96,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, Shader::PushData push_data{}; u32 binding{}; 
- for (u32 i = 0; const auto& buffer : info.buffers) { + for (const auto& buffer : info.buffers) { const auto vsharp = buffer.GetVsharp(info); const VAddr address = vsharp.base_address; // Most of the time when a metadata is updated with a shader it gets cleared. It means we @@ -115,7 +115,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, } const u32 size = vsharp.GetSize(); if (buffer.is_written) { - texture_cache.InvalidateMemory(address, size); + texture_cache.InvalidateMemory(address, size, true); } const u32 alignment = buffer.is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); @@ -137,7 +137,6 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, : vk::DescriptorType::eUniformBuffer, .pBufferInfo = &buffer_infos.back(), }); - i++; } for (const auto& image_desc : info.images) { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 5d87a1caf..cf23ade26 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -145,6 +145,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul dynamic_states.push_back(vk::DynamicState::eColorWriteEnableEXT); dynamic_states.push_back(vk::DynamicState::eColorWriteMaskEXT); } + if (instance.IsVertexInputDynamicState()) { + dynamic_states.push_back(vk::DynamicState::eVertexInputEXT); + } const vk::PipelineDynamicStateCreateInfo dynamic_info = { .dynamicStateCount = static_cast(dynamic_states.size()), diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 5beb57c44..b60b78e13 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -202,6 +202,8 @@ bool Instance::CreateDevice() { add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); 
workgroup_memory_explicit_layout = add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); + vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); + // The next two extensions are required to be available together in order to support write masks color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME); color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); @@ -319,6 +321,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceSynchronization2Features{ .synchronization2 = true, }, + vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT{ + .vertexInputDynamicState = true, + }, }; if (!color_write_en) { @@ -331,8 +336,8 @@ bool Instance::CreateDevice() { } else { device_chain.unlink(); } - if (!has_sync2) { - device_chain.unlink(); + if (!vertex_input_dynamic_state) { + device_chain.unlink(); } try { diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 2f2397d64..4cb4741a5 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -132,6 +132,11 @@ public: return color_write_en; } + /// Returns true when VK_EXT_vertex_input_dynamic_state is supported. 
+ bool IsVertexInputDynamicState() const { + return vertex_input_dynamic_state; + } + /// Returns the vendor ID of the physical device u32 GetVendorID() const { return properties.vendorID; @@ -257,6 +262,7 @@ private: bool external_memory_host{}; bool workgroup_memory_explicit_layout{}; bool color_write_en{}; + bool vertex_input_dynamic_state{}; u64 min_imported_host_pointer_alignment{}; u32 subgroup_size{}; bool tooling_info{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 8d27d252c..8a22b9256 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -209,6 +209,10 @@ void PipelineCache::RefreshGraphicsKey() { continue; } const auto* bininfo = Liverpool::GetBinaryInfo(*pgm); + if (!bininfo->Valid()) { + key.stage_hashes[i] = 0; + continue; + } key.stage_hashes[i] = bininfo->shader_hash; } } diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index f1148760e..bae4b89d4 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -117,6 +117,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, : instance{&instance_}, scheduler{&scheduler_}, info{info_}, image{instance->GetDevice(), instance->GetAllocator()}, cpu_addr{info.guest_address}, cpu_addr_end{cpu_addr + info.guest_size_bytes} { + mip_hashes.resize(info.resources.levels); ASSERT(info.pixel_format != vk::Format::eUndefined); // Here we force `eExtendedUsage` as don't know all image usage cases beforehand. 
In normal case // the texture cache should re-create the resource with the usage requested diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index b18f1002b..5a888346f 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -111,6 +111,7 @@ struct Image { vk::Flags pl_stage = vk::PipelineStageFlagBits::eAllCommands; vk::Flags access_mask = vk::AccessFlagBits::eNone; vk::ImageLayout layout = vk::ImageLayout::eUndefined; + boost::container::small_vector mip_hashes; }; } // namespace VideoCore diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 53596f8e3..6b14faac4 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -3,6 +3,7 @@ #include #include "common/assert.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/page_manager.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -11,13 +12,11 @@ namespace VideoCore { -static constexpr u64 StreamBufferSize = 512_MB; static constexpr u64 PageShift = 12; TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, BufferCache& buffer_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, - staging{instance, scheduler, MemoryUsage::Upload, StreamBufferSize}, tile_manager{instance, scheduler} { ImageInfo info; info.pixel_format = vk::Format::eR8G8B8A8Unorm; @@ -31,9 +30,12 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& TextureCache::~TextureCache() = default; -void TextureCache::InvalidateMemory(VAddr address, size_t size) { +void TextureCache::InvalidateMemory(VAddr address, size_t size, bool from_compute) { std::unique_lock lock{mutex}; ForEachImageInRegion(address, size, [&](ImageId image_id, Image& 
image) { + if (from_compute && !image.Overlaps(address, size)) { + return; + } // Ensure image is reuploaded when accessed again. image.flags |= ImageFlagBits::CpuModified; // Untrack image, so the range is unprotected and the guest can write freely. @@ -57,7 +59,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) { } } -ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { +ImageId TextureCache::FindImage(const ImageInfo& info) { if (info.guest_address == 0) [[unlikely]] { return NULL_IMAGE_VIEW_ID; } @@ -87,12 +89,6 @@ ImageId TextureCache::FindImage(const ImageInfo& info, bool refresh_on_create) { image_id = image_ids[image_ids.size() > 1 ? 1 : 0]; } - Image& image = slot_images[image_id]; - if (True(image.flags & ImageFlagBits::CpuModified) && refresh_on_create) { - RefreshImage(image); - TrackImage(image, image_id); - } - return image_id; } @@ -119,6 +115,7 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) { const ImageId image_id = FindImage(info); + UpdateImage(image_id); Image& image = slot_images[image_id]; auto& usage = image.info.usage; @@ -165,7 +162,8 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info, const ImageViewInfo& view_info) { const ImageId image_id = FindImage(image_info); Image& image = slot_images[image_id]; - image.flags &= ~ImageFlagBits::CpuModified; + image.flags |= ImageFlagBits::GpuModified; + UpdateImage(image_id); image.Transit(vk::ImageLayout::eColorAttachmentOptimal, vk::AccessFlagBits::eColorAttachmentWrite | @@ -198,8 +196,9 @@ ImageView& TextureCache::FindRenderTarget(const ImageInfo& image_info, ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info, const ImageViewInfo& view_info) { - const ImageId image_id = FindImage(image_info, false); + const ImageId image_id = FindImage(image_info); Image& image = slot_images[image_id]; + 
image.flags |= ImageFlagBits::GpuModified; image.flags &= ~ImageFlagBits::CpuModified; const auto new_layout = view_info.is_storage ? vk::ImageLayout::eDepthStencilAttachmentOptimal @@ -228,22 +227,6 @@ void TextureCache::RefreshImage(Image& image) { // Mark image as validated. image.flags &= ~ImageFlagBits::CpuModified; - scheduler.EndRendering(); - - const auto cmdbuf = scheduler.CommandBuffer(); - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - - vk::Buffer buffer{staging.Handle()}; - u32 offset{0}; - - auto upload_buffer = tile_manager.TryDetile(image); - if (upload_buffer) { - buffer = *upload_buffer; - } else { - // Upload data to the staging buffer. - offset = staging.Copy(image.info.guest_address, image.info.guest_size_bytes, 16); - } - const auto& num_layers = image.info.resources.layers; const auto& num_mips = image.info.resources.levels; ASSERT(num_mips == image.info.mips_layout.size()); @@ -254,12 +237,23 @@ void TextureCache::RefreshImage(Image& image) { const u32 height = std::max(image.info.size.height >> m, 1u); const u32 depth = image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u; - const auto& [_, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; + const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; + + // Protect GPU modified resources from accidental reuploads. 
+ if (True(image.flags & ImageFlagBits::GpuModified) && + !buffer_cache.IsRegionGpuModified(image.info.guest_address + mip_ofs, mip_size)) { + const u8* addr = std::bit_cast(image.info.guest_address); + const u64 hash = XXH3_64bits(addr + mip_ofs, mip_size); + if (image.mip_hashes[m] == hash) { + continue; + } + image.mip_hashes[m] = hash; + } image_copy.push_back({ - .bufferOffset = offset + mip_ofs * num_layers, - .bufferRowLength = static_cast(mip_pitch), - .bufferImageHeight = static_cast(mip_height), + .bufferOffset = mip_ofs * num_layers, + .bufferRowLength = static_cast(mip_pitch), + .bufferImageHeight = static_cast(mip_height), .imageSubresource{ .aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = m, @@ -271,6 +265,30 @@ void TextureCache::RefreshImage(Image& image) { }); } + if (image_copy.empty()) { + return; + } + + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite, cmdbuf); + + const VAddr image_addr = image.info.guest_address; + const size_t image_size = image.info.guest_size_bytes; + vk::Buffer buffer{}; + u32 offset{}; + if (auto upload_buffer = tile_manager.TryDetile(image); upload_buffer) { + buffer = *upload_buffer; + } else { + const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size); + buffer = vk_buffer->Handle(); + offset = buf_offset; + } + + for (auto& copy : image_copy) { + copy.bufferOffset += offset; + } + cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy); } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 17a09898d..b3af0ff11 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -38,13 +38,13 @@ public: ~TextureCache(); /// Invalidates any image in the logical page range. 
- void InvalidateMemory(VAddr address, size_t size); + void InvalidateMemory(VAddr address, size_t size, bool from_compute = false); /// Evicts any images that overlap the unmapped range. void UnmapMemory(VAddr cpu_addr, size_t size); /// Retrieves the image handle of the image with the provided attributes. - [[nodiscard]] ImageId FindImage(const ImageInfo& info, bool refresh_on_create = true); + [[nodiscard]] ImageId FindImage(const ImageInfo& info); /// Retrieves an image view with the properties of the specified image descriptor. [[nodiscard]] ImageView& FindTexture(const ImageInfo& image_info, @@ -58,6 +58,16 @@ public: [[nodiscard]] ImageView& FindDepthTarget(const ImageInfo& image_info, const ImageViewInfo& view_info); + /// Updates image contents if it was modified by CPU. + void UpdateImage(ImageId image_id) { + Image& image = slot_images[image_id]; + if (False(image.flags & ImageFlagBits::CpuModified)) { + return; + } + RefreshImage(image); + TrackImage(image, image_id); + } + /// Reuploads image contents. 
void RefreshImage(Image& image); @@ -170,7 +180,6 @@ private: Vulkan::Scheduler& scheduler; BufferCache& buffer_cache; PageManager& tracker; - StreamBuffer staging; TileManager tile_manager; Common::SlotVector slot_images; Common::SlotVector slot_image_views; diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 75fa378cd..6447fde17 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -5,7 +5,6 @@ #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/texture_cache/image_view.h" -#include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/tile_manager.h" #include "video_core/host_shaders/detile_m32x1_comp.h" From d1a033b6afd93d2d36a176c6d0a91c0e85147e3e Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Tue, 13 Aug 2024 00:30:47 -0700 Subject: [PATCH 11/11] Fix some Vulkan validation errors on macOS. 
(#420) --- .../renderer_vulkan/vk_instance.cpp | 27 ++++++++++++++++--- .../renderer_vulkan/vk_platform.cpp | 7 +++++ .../renderer_vulkan/vk_swapchain.cpp | 12 ++++++++- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index b60b78e13..66da030f1 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -164,7 +164,8 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceColorWriteEnableFeaturesEXT, vk::PhysicalDeviceVulkan12Features, vk::PhysicalDeviceVulkan13Features, vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR, - vk::PhysicalDeviceDepthClipControlFeaturesEXT, vk::PhysicalDeviceRobustness2FeaturesEXT>(); + vk::PhysicalDeviceDepthClipControlFeaturesEXT, vk::PhysicalDeviceRobustness2FeaturesEXT, + vk::PhysicalDevicePortabilitySubsetFeaturesKHR>(); const vk::StructureChain properties_chain = physical_device.getProperties2< vk::PhysicalDeviceProperties2, vk::PhysicalDevicePortabilitySubsetPropertiesKHR, vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties>(); @@ -198,7 +199,7 @@ bool Instance::CreateDevice() { external_memory_host = add_extension(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME); custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME); add_extension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME); - add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); + const bool depth_clip_control = add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME); add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); workgroup_memory_explicit_layout = add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); @@ -213,7 +214,7 @@ bool Instance::CreateDevice() { // These extensions are promoted by Vulkan 1.3, but for greater compatibility we use Vulkan 1.2 // with extensions. 
tooling_info = add_extension(VK_EXT_TOOLING_INFO_EXTENSION_NAME); - add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); + const bool maintenance4 = add_extension(VK_KHR_MAINTENANCE_4_EXTENSION_NAME); add_extension(VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME); add_extension(VK_EXT_SHADER_DEMOTE_TO_HELPER_INVOCATION_EXTENSION_NAME); const bool has_sync2 = add_extension(VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME); @@ -224,6 +225,11 @@ bool Instance::CreateDevice() { : false; } +#ifdef __APPLE__ + // Required by Vulkan spec if supported. + add_extension(VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME); +#endif + const auto family_properties = physical_device.getQueueFamilyProperties(); if (family_properties.empty()) { LOG_CRITICAL(Render_Vulkan, "Physical device reported no queues."); @@ -324,12 +330,27 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT{ .vertexInputDynamicState = true, }, +#ifdef __APPLE__ + feature_chain.get(), +#endif }; + if (!maintenance4) { + device_chain.unlink(); + } + if (!custom_border_color) { + device_chain.unlink(); + } if (!color_write_en) { device_chain.unlink(); device_chain.unlink(); } + if (!depth_clip_control) { + device_chain.unlink(); + } + if (!workgroup_memory_explicit_layout) { + device_chain.unlink(); + } if (robustness) { device_chain.get().nullDescriptor = feature_chain.get().nullDescriptor; diff --git a/src/video_core/renderer_vulkan/vk_platform.cpp b/src/video_core/renderer_vulkan/vk_platform.cpp index 33113c58b..c73a8139d 100644 --- a/src/video_core/renderer_vulkan/vk_platform.cpp +++ b/src/video_core/renderer_vulkan/vk_platform.cpp @@ -157,6 +157,10 @@ std::vector GetInstanceExtensions(Frontend::WindowSystemType window break; } +#ifdef __APPLE__ + extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); +#endif + if (window_type != Frontend::WindowSystemType::Headless) { extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); } @@ -285,6 +289,9 @@ vk::UniqueInstance 
CreateInstance(vk::DynamicLoader& dl, Frontend::WindowSystemT .ppEnabledLayerNames = layers.data(), .enabledExtensionCount = static_cast(extensions.size()), .ppEnabledExtensionNames = extensions.data(), +#ifdef __APPLE__ + .flags = vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR, +#endif }, vk::LayerSettingsCreateInfoEXT{ .settingCount = layer_setings.size(), diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index 20c99e302..16d5c2372 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -37,6 +37,16 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) { instance.GetPresentQueueFamilyIndex(), }; + const auto modes = instance.GetPhysicalDevice().getSurfacePresentModesKHR(surface); + const auto find_mode = [&modes](vk::PresentModeKHR requested) { + const auto it = + std::find_if(modes.begin(), modes.end(), + [&requested](vk::PresentModeKHR mode) { return mode == requested; }); + + return it != modes.end(); + }; + const bool has_mailbox = find_mode(vk::PresentModeKHR::eMailbox); + const bool exclusive = queue_family_indices[0] == queue_family_indices[1]; const u32 queue_family_indices_count = exclusive ? 1u : 2u; const vk::SharingMode sharing_mode = @@ -55,7 +65,7 @@ void Swapchain::Create(u32 width_, u32 height_, vk::SurfaceKHR surface_) { .pQueueFamilyIndices = queue_family_indices.data(), .preTransform = transform, .compositeAlpha = composite_alpha, - .presentMode = vk::PresentModeKHR::eMailbox, + .presentMode = has_mailbox ? vk::PresentModeKHR::eMailbox : vk::PresentModeKHR::eImmediate, .clipped = true, .oldSwapchain = nullptr, };