diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30ae5960f..771b0e009 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -870,6 +870,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
     src/shader_recompiler/ir/passes/ring_access_elimination.cpp
     src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+    src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
     src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
     src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp
     src/shader_recompiler/ir/abstract_syntax_list.cpp
@@ -1123,6 +1124,10 @@ if (APPLE)
         set(MVK_BUNDLE_PATH "Resources/vulkan/icd.d")
         set_property(TARGET shadps4 APPEND PROPERTY BUILD_RPATH "@executable_path/../${MVK_BUNDLE_PATH}")
         set(MVK_DST ${CMAKE_CURRENT_BINARY_DIR}/shadps4.app/Contents/${MVK_BUNDLE_PATH})
+
+        add_custom_command(
+            OUTPUT ${MVK_DST}
+            COMMAND ${CMAKE_COMMAND} -E make_directory ${MVK_DST})
     else()
         set_property(TARGET shadps4 APPEND PROPERTY BUILD_RPATH "@executable_path")
         set(MVK_DST ${CMAKE_CURRENT_BINARY_DIR})
@@ -1133,9 +1138,6 @@ if (APPLE)
     set(MVK_ICD_SRC ${CMAKE_CURRENT_SOURCE_DIR}/externals/MoltenVK/MoltenVK/MoltenVK/icd/MoltenVK_icd.json)
     set(MVK_ICD_DST ${MVK_DST}/MoltenVK_icd.json)
 
-    add_custom_command(
-        OUTPUT ${MVK_DST}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${MVK_DST})
     add_custom_command(
         OUTPUT ${MVK_ICD_DST}
         DEPENDS ${MVK_ICD_SRC} ${MVK_DST}
@@ -1150,17 +1152,13 @@ if (APPLE)
 
     if (ARCHITECTURE STREQUAL "x86_64")
         # Reserve system-managed memory space.
-        target_link_options(shadps4 PRIVATE -Wl,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x4000,-segaddr,TCB_SPACE,0x4000,-segaddr,SYSTEM_MANAGED,0x400000,-segaddr,SYSTEM_RESERVED,0x7FFFFC000,-image_base,0x20000000000)
+        target_link_options(shadps4 PRIVATE -Wl,-ld_classic,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x4000,-segaddr,TCB_SPACE,0x4000,-segaddr,SYSTEM_MANAGED,0x400000,-segaddr,SYSTEM_RESERVED,0x7FFFFC000,-image_base,0x20000000000)
     endif()
 
     # Replacement for std::chrono::time_zone
     target_link_libraries(shadps4 PRIVATE date::date-tz)
 endif()
 
-if (NOT ENABLE_QT_GUI)
-    target_link_libraries(shadps4 PRIVATE SDL3::SDL3)
-endif()
-
 if (ENABLE_QT_GUI)
     target_link_libraries(shadps4 PRIVATE Qt6::Widgets Qt6::Concurrent Qt6::Network Qt6::Multimedia)
     add_definitions(-DENABLE_QT_GUI)
diff --git a/README.md b/README.md
index 985bba586..9079ead73 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ SPDX-License-Identifier: GPL-2.0-or-later
 
 **shadPS4** is an early **PlayStation 4** emulator for **Windows**, **Linux** and **macOS** written in C++.
 
-If you encounter problems or have doubts, do not hesitate to look at the [**Quickstart**](https://github.com/shadps4-emu/shadPS4/blob/main/documents/Quickstart/Quickstart.md).\
+If you encounter problems or have doubts, do not hesitate to look at the [**Quickstart**](https://github.com/shadps4-emu/shadPS4/wiki/I.-Quick-start-%5BUsers%5D).\
 To verify that a game works, you can look at [**shadPS4 Game Compatibility**](https://github.com/shadps4-emu/shadps4-game-compatibility).\
 To discuss shadPS4 development, suggest ideas or to ask for help, join our [**Discord server**](https://discord.gg/bFJxfftGW6).\
 To get the latest news, go to our [**X (Twitter)**](https://x.com/shadps4) or our [**website**](https://shadps4.net/).\
@@ -124,8 +124,8 @@ Keyboard and mouse inputs can be customized in the settings menu by clicking the
 
 # Firmware files
 
-shadPS4 can load some PlayStation 4 firmware files, these must be dumped from your legally owned PlayStation 4 console.\
-The following firmware modules are supported and must be placed in shadPS4's `user/sys_modules` folder.
+shadPS4 can load some PlayStation 4 firmware files; these must be dumped from your legally owned PlayStation 4 console.
+The following firmware modules are supported and must be placed in shadPS4's `sys_modules` folder.
@@ -139,7 +139,6 @@ The following firmware modules are supported and must be placed in shadPS4's `us
 
 > [!Caution]
 > The above modules are required to run the games properly and must be extracted from your PlayStation 4.\
-> **We do not provide any information or support on how to do this**.
 
@@ -148,7 +147,7 @@ The following firmware modules are supported and must be placed in shadPS4's `us
 - [**georgemoralis**](https://github.com/georgemoralis)
 - [**psucien**](https://github.com/psucien)
 - [**viniciuslrangel**](https://github.com/viniciuslrangel)
-- [**roamic**](https://github.com/vladmikhalin)
+- [**roamic**](https://github.com/roamic)
 - [**squidbus**](https://github.com/squidbus)
 - [**frodo**](https://github.com/baggins183)
 - [**Stephen Miller**](https://github.com/StevenMiller123)
@@ -158,7 +157,7 @@ Logo is done by [**Xphalnos**](https://github.com/Xphalnos)
 
 # Contributing
 
-If you want to contribute, please look the [**CONTRIBUTING.md**](https://github.com/shadps4-emu/shadPS4/blob/main/CONTRIBUTING.md) file.\
+If you want to contribute, please read the [**CONTRIBUTING.md**](https://github.com/shadps4-emu/shadPS4/blob/main/CONTRIBUTING.md) file.\
 Open a PR and we'll check it :)
 
 # Translations
diff --git a/documents/building-linux.md b/documents/building-linux.md
index bd07b2eff..61d067881 100644
--- a/documents/building-linux.md
+++ b/documents/building-linux.md
@@ -25,7 +25,7 @@ sudo apt install build-essential clang git cmake libasound2-dev \
 ```bash
 sudo dnf install clang git cmake libatomic alsa-lib-devel \
-    pipewire-jack-audio-connection-kit-devel openal-devel \
+    pipewire-jack-audio-connection-kit-devel openal-soft-devel \
     openssl-devel libevdev-devel libudev-devel libXext-devel \
     qt6-qtbase-devel qt6-qtbase-private-devel \
     qt6-qtmultimedia-devel qt6-qtsvg-devel qt6-qttools-devel \
diff --git a/externals/MoltenVK/MoltenVK b/externals/MoltenVK/MoltenVK
index 3a0b07a24..00abd384c 160000
--- a/externals/MoltenVK/MoltenVK
+++ b/externals/MoltenVK/MoltenVK
@@ -1 +1 @@
-Subproject commit 3a0b07a24a4a681ffe70b461b1f4333b2729e2ef
+Subproject commit 00abd384ce01cbd439045905d2fa6cf799dfa2f6
diff --git a/externals/MoltenVK/SPIRV-Cross b/externals/MoltenVK/SPIRV-Cross
index 969e75f7c..1a69a919f 160000
--- a/externals/MoltenVK/SPIRV-Cross
+++ b/externals/MoltenVK/SPIRV-Cross
@@ -1 +1 @@
-Subproject commit 969e75f7cc0718774231d029f9d52fa87d4ae1b2
+Subproject commit 1a69a919fa302e92b337594bd0a8aaea61037d91
diff --git a/src/core/file_sys/fs.cpp b/src/core/file_sys/fs.cpp
index 4dad44874..b237ab7d9 100644
--- a/src/core/file_sys/fs.cpp
+++ b/src/core/file_sys/fs.cpp
@@ -10,6 +10,8 @@
 
 namespace Core::FileSys {
 
+bool MntPoints::ignore_game_patches = false;
+
 std::string RemoveTrailingSlashes(const std::string& path) {
     // Remove trailing slashes to make comparisons simpler.
     std::string path_sanitized = path;
@@ -77,7 +79,7 @@ std::filesystem::path MntPoints::GetHostPath(std::string_view path, bool* is_rea
         patch_path /= rel_path;
 
         if ((corrected_path.starts_with("/app0") || corrected_path.starts_with("/hostapp")) &&
-            !force_base_path && std::filesystem::exists(patch_path)) {
+            !force_base_path && !ignore_game_patches && std::filesystem::exists(patch_path)) {
             return patch_path;
         }
 
@@ -137,7 +139,7 @@ std::filesystem::path MntPoints::GetHostPath(std::string_view path, bool* is_rea
         return std::optional(current_path);
     };
 
-    if (!force_base_path) {
+    if (!force_base_path && !ignore_game_patches) {
         if (const auto path = search(patch_path)) {
             return *path;
         }
diff --git a/src/core/file_sys/fs.h b/src/core/file_sys/fs.h
index 6638b48e8..4a2aa56c1 100644
--- a/src/core/file_sys/fs.h
+++ b/src/core/file_sys/fs.h
@@ -21,6 +21,7 @@ class MntPoints {
     static constexpr bool NeedsCaseInsensitiveSearch = true;
 #endif
 public:
+    static bool ignore_game_patches;
     struct MntPair {
         std::filesystem::path host_path;
         std::string mount; // e.g /app0
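Editor's note: the new `MntPoints::ignore_game_patches` flag gates patch-path resolution in both places it is consulted. A minimal standalone sketch of the precedence, with a hypothetical `ResolvePath` helper standing in for the real logic inside `GetHostPath`:

```cpp
#include <filesystem>

// Hypothetical distillation of the gating added to MntPoints::GetHostPath:
// a patch path wins only when the caller did not force the base path, patches
// are not globally ignored, and the patched file actually exists on disk.
inline bool ignore_game_patches = false; // set by -i / --ignore-game-patch

std::filesystem::path ResolvePath(const std::filesystem::path& base_path,
                                  const std::filesystem::path& patch_path,
                                  bool force_base_path) {
    if (!force_base_path && !ignore_game_patches && std::filesystem::exists(patch_path)) {
        return patch_path;
    }
    return base_path;
}
```

Because the flag is a static on `MntPoints`, the CLI handlers further down can flip it before any mount point is constructed.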
diff --git a/src/core/libraries/kernel/file_system.cpp b/src/core/libraries/kernel/file_system.cpp
index ad372325c..fecc606fd 100644
--- a/src/core/libraries/kernel/file_system.cpp
+++ b/src/core/libraries/kernel/file_system.cpp
@@ -1050,6 +1050,7 @@ void RegisterFileSystem(Core::Loader::SymbolsResolver* sym) {
     LIB_FUNCTION("4wSze92BhLI", "libkernel", 1, "libkernel", 1, 1, sceKernelWrite);
     LIB_FUNCTION("+WRlkKjZvag", "libkernel", 1, "libkernel", 1, 1, readv);
     LIB_FUNCTION("YSHRBRLn2pI", "libkernel", 1, "libkernel", 1, 1, writev);
+    LIB_FUNCTION("kAt6VDbHmro", "libkernel", 1, "libkernel", 1, 1, sceKernelWritev);
     LIB_FUNCTION("Oy6IpwgtYOk", "libScePosix", 1, "libkernel", 1, 1, posix_lseek);
     LIB_FUNCTION("Oy6IpwgtYOk", "libkernel", 1, "libkernel", 1, 1, posix_lseek);
     LIB_FUNCTION("oib76F-12fk", "libkernel", 1, "libkernel", 1, 1, sceKernelLseek);
diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp
index f02ddafdc..ea3998ddd 100644
--- a/src/core/libraries/kernel/memory.cpp
+++ b/src/core/libraries/kernel/memory.cpp
@@ -99,8 +99,8 @@ s32 PS4_SYSV_ABI sceKernelReleaseDirectMemory(u64 start, size_t len) {
 
 s32 PS4_SYSV_ABI sceKernelAvailableDirectMemorySize(u64 searchStart, u64 searchEnd,
                                                     size_t alignment, u64* physAddrOut,
                                                     size_t* sizeOut) {
-    LOG_WARNING(Kernel_Vmm, "called searchStart = {:#x}, searchEnd = {:#x}, alignment = {:#x}",
-                searchStart, searchEnd, alignment);
+    LOG_INFO(Kernel_Vmm, "called searchStart = {:#x}, searchEnd = {:#x}, alignment = {:#x}",
+             searchStart, searchEnd, alignment);
 
     if (physAddrOut == nullptr || sizeOut == nullptr) {
         return ORBIS_KERNEL_ERROR_EINVAL;
@@ -287,7 +287,7 @@ s32 PS4_SYSV_ABI sceKernelMtypeprotect(const void* addr, u64 size, s32 mtype, s3
 
 int PS4_SYSV_ABI sceKernelDirectMemoryQuery(u64 offset, int flags, OrbisQueryInfo* query_info,
                                             size_t infoSize) {
-    LOG_WARNING(Kernel_Vmm, "called offset = {:#x}, flags = {:#x}", offset, flags);
+    LOG_INFO(Kernel_Vmm, "called offset = {:#x}, flags = {:#x}", offset, flags);
     auto* memory = Core::Memory::Instance();
     return memory->DirectMemoryQuery(offset, flags == 1, query_info);
 }
diff --git a/src/core/libraries/np_trophy/np_trophy.cpp b/src/core/libraries/np_trophy/np_trophy.cpp
index 6de84bd93..e3c5ce35e 100644
--- a/src/core/libraries/np_trophy/np_trophy.cpp
+++ b/src/core/libraries/np_trophy/np_trophy.cpp
@@ -164,10 +164,12 @@ s32 PS4_SYSV_ABI sceNpTrophyCreateContext(OrbisNpTrophyContext* context, int32_t
     }
 
     const auto ctx_id = trophy_contexts.insert(user_id, service_label);
-    contexts_internal[key].context_id = ctx_id.index;
-    LOG_INFO(Lib_NpTrophy, "New context = {}, user_id = {} service label = {}", ctx_id.index,
-             user_id, service_label);
-    *context = ctx_id.index;
+
+    *context = ctx_id.index + 1;
+    contexts_internal[key].context_id = *context;
+    LOG_INFO(Lib_NpTrophy, "New context = {}, user_id = {} service label = {}", *context, user_id,
+             service_label);
+
     return ORBIS_OK;
 }
 
@@ -179,21 +181,23 @@ s32 PS4_SYSV_ABI sceNpTrophyCreateHandle(OrbisNpTrophyHandle* handle) {
     if (trophy_handles.size() >= MaxTrophyHandles) {
         return ORBIS_NP_TROPHY_ERROR_HANDLE_EXCEEDS_MAX;
     }
-    const auto handle_id = trophy_handles.insert();
-    LOG_INFO(Lib_NpTrophy, "New handle = {}", handle_id.index);
-    *handle = handle_id.index;
+    const auto handle_id = trophy_handles.insert();
+
+    *handle = handle_id.index + 1;
+    LOG_INFO(Lib_NpTrophy, "New handle = {}", *handle);
     return ORBIS_OK;
 }
 
 int PS4_SYSV_ABI sceNpTrophyDestroyContext(OrbisNpTrophyContext context) {
     LOG_INFO(Lib_NpTrophy, "Destroyed Context {}", context);
 
-    if (context == ORBIS_NP_TROPHY_INVALID_CONTEXT)
+    if (context == ORBIS_NP_TROPHY_INVALID_CONTEXT) {
         return ORBIS_NP_TROPHY_ERROR_INVALID_CONTEXT;
+    }
 
     Common::SlotId contextId;
-    contextId.index = context;
+    contextId.index = context - 1;
 
     ContextKey contextkey = trophy_contexts[contextId];
     trophy_contexts.erase(contextId);
@@ -206,15 +210,17 @@ s32 PS4_SYSV_ABI sceNpTrophyDestroyHandle(OrbisNpTrophyHandle handle) {
     if (handle == ORBIS_NP_TROPHY_INVALID_HANDLE)
         return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
 
-    if (handle >= trophy_handles.size()) {
+    s32 handle_index = handle - 1;
+    if (handle_index >= trophy_handles.size()) {
         LOG_ERROR(Lib_NpTrophy, "Invalid handle {}", handle);
         return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
     }
-    if (!trophy_handles.is_allocated({static_cast<u32>(handle)})) {
+
+    if (!trophy_handles.is_allocated({static_cast<u32>(handle_index)})) {
         return ORBIS_NP_TROPHY_ERROR_INVALID_HANDLE;
     }
 
-    trophy_handles.erase({static_cast<u32>(handle)});
+    trophy_handles.erase({static_cast<u32>(handle_index)});
     LOG_INFO(Lib_NpTrophy, "Handle {} destroyed", handle);
     return ORBIS_OK;
 }
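Editor's note on the trophy changes above: public context and handle IDs are now the slot index plus one, presumably so a freshly created object never hands the game the value 0, and every consumer must undo that offset before indexing the pools (as `sceNpTrophyDestroyContext`/`DestroyHandle` now do with `- 1`). A hedged sketch of the round trip; the helper names are illustrative, not from the PR:

```cpp
#include <cstdint>

using u32 = std::uint32_t;
using s32 = std::int32_t;

// Slot index <-> public id mapping implied by the diff above.
constexpr s32 ToPublicId(u32 slot_index) {
    return static_cast<s32>(slot_index) + 1; // first slot becomes id 1, not 0
}

constexpr u32 ToSlotIndex(s32 public_id) {
    return static_cast<u32>(public_id - 1); // must be undone before pool access
}

static_assert(ToSlotIndex(ToPublicId(0)) == 0); // round-trips for every slot
```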
diff --git a/src/core/libraries/videodec/videodec2.cpp b/src/core/libraries/videodec/videodec2.cpp
index 4f9379151..1c6044fe2 100644
--- a/src/core/libraries/videodec/videodec2.cpp
+++ b/src/core/libraries/videodec/videodec2.cpp
@@ -140,7 +140,7 @@ s32 PS4_SYSV_ABI sceVideodec2Flush(OrbisVideodec2Decoder decoder,
         return ORBIS_VIDEODEC2_ERROR_ARGUMENT_POINTER;
     }
     if (frameBuffer->thisSize != sizeof(OrbisVideodec2FrameBuffer) ||
-        outputInfo->thisSize != sizeof(OrbisVideodec2OutputInfo)) {
+        (outputInfo->thisSize | 8) != sizeof(OrbisVideodec2OutputInfo)) {
         LOG_ERROR(Lib_Vdec2, "Invalid struct size");
         return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE;
     }
@@ -167,7 +167,7 @@ s32 PS4_SYSV_ABI sceVideodec2GetPictureInfo(const OrbisVideodec2OutputInfo* outp
         LOG_ERROR(Lib_Vdec2, "Invalid arguments");
         return ORBIS_VIDEODEC2_ERROR_ARGUMENT_POINTER;
     }
-    if (outputInfo->thisSize != sizeof(OrbisVideodec2OutputInfo)) {
+    if ((outputInfo->thisSize | 8) != sizeof(OrbisVideodec2OutputInfo)) {
         LOG_ERROR(Lib_Vdec2, "Invalid struct size");
         return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE;
     }
@@ -179,7 +179,7 @@ s32 PS4_SYSV_ABI sceVideodec2GetPictureInfo(const OrbisVideodec2OutputInfo* outp
     if (p1stPictureInfoOut) {
         OrbisVideodec2AvcPictureInfo* picInfo =
             static_cast<OrbisVideodec2AvcPictureInfo*>(p1stPictureInfoOut);
-        if (picInfo->thisSize != sizeof(OrbisVideodec2AvcPictureInfo)) {
+        if ((picInfo->thisSize | 16) != sizeof(OrbisVideodec2AvcPictureInfo)) {
             LOG_ERROR(Lib_Vdec2, "Invalid struct size");
             return ORBIS_VIDEODEC2_ERROR_STRUCT_SIZE;
         }
diff --git a/src/core/libraries/videodec/videodec2.h b/src/core/libraries/videodec/videodec2.h
index abc8f8ab5..410ee8ea6 100644
--- a/src/core/libraries/videodec/videodec2.h
+++ b/src/core/libraries/videodec/videodec2.h
@@ -73,8 +73,10 @@ struct OrbisVideodec2OutputInfo {
     u32 frameHeight;
     void* frameBuffer;
     u64 frameBufferSize;
+    u32 frameFormat;
+    u32 framePitchInBytes;
 };
-static_assert(sizeof(OrbisVideodec2OutputInfo) == 0x30);
+static_assert(sizeof(OrbisVideodec2OutputInfo) == 0x38);
 
 struct OrbisVideodec2FrameBuffer {
     u64 thisSize;
diff --git a/src/core/libraries/videodec/videodec2_avc.h b/src/core/libraries/videodec/videodec2_avc.h
index 22293ee93..1975209cb 100644
--- a/src/core/libraries/videodec/videodec2_avc.h
+++ b/src/core/libraries/videodec/videodec2_avc.h
@@ -55,6 +55,23 @@ struct OrbisVideodec2AvcPictureInfo {
     u8 pic_struct;
     u8 field_pic_flag;
     u8 bottom_field_flag;
+
+    u8 sequenceParameterSetPresentFlag;
+    u8 pictureParameterSetPresentFlag;
+    u8 auDelimiterPresentFlag;
+    u8 endOfSequencePresentFlag;
+    u8 endOfStreamPresentFlag;
+    u8 fillerDataPresentFlag;
+    u8 pictureTimingSeiPresentFlag;
+    u8 bufferingPeriodSeiPresentFlag;
+
+    u8 constraint_set0_flag;
+    u8 constraint_set1_flag;
+    u8 constraint_set2_flag;
+    u8 constraint_set3_flag;
+    u8 constraint_set4_flag;
+    u8 constraint_set5_flag;
 };
+static_assert(sizeof(OrbisVideodec2AvcPictureInfo) == 0x78);
 
 } // namespace Libraries::Vdec2
\ No newline at end of file
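Editor's note on the `thisSize` checks above: `OrbisVideodec2OutputInfo` grew from 0x30 to 0x38 bytes, and `(thisSize | 8) != sizeof(...)` accepts exactly the two known revisions, since OR-ing bit 3 maps 0x30 onto 0x38 and leaves 0x38 unchanged. The `| 16` variant plays the same trick for the picture-info struct (0x78 new size; the old size is presumably 0x68, which the diff does not show). A self-contained check of the arithmetic:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    // The old output-info struct is 0x30 bytes, the new one 0x38; OR-ing bit 3
    // maps both accepted sizes onto 0x38 and rejects everything else.
    constexpr std::uint64_t kOldSize = 0x30, kNewSize = 0x38;
    for (std::uint64_t this_size : {kOldSize, kNewSize}) {
        assert((this_size | 8) == kNewSize); // both revisions pass the check
    }
    assert((0x28 | 8) != kNewSize); // anything else still fails
    assert((0x40 | 8) != kNewSize);
}
```

The implementation side (next file) then uses `thisSize == sizeof(...)` to decide whether the two new fields may be written at all.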
diff --git a/src/core/libraries/videodec/videodec2_impl.cpp b/src/core/libraries/videodec/videodec2_impl.cpp
index 22b17c86c..373809c14 100644
--- a/src/core/libraries/videodec/videodec2_impl.cpp
+++ b/src/core/libraries/videodec/videodec2_impl.cpp
@@ -44,11 +44,15 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData,
                         OrbisVideodec2FrameBuffer& frameBuffer,
                         OrbisVideodec2OutputInfo& outputInfo) {
     frameBuffer.isAccepted = false;
-    outputInfo.thisSize = sizeof(OrbisVideodec2OutputInfo);
     outputInfo.isValid = false;
     outputInfo.isErrorFrame = true;
     outputInfo.pictureCount = 0;
 
+    // Only set frameFormat if the game uses the newer struct version.
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.frameFormat = 0;
+    }
+
     if (!inputData.auData) {
         return ORBIS_VIDEODEC2_ERROR_ACCESS_UNIT_POINTER;
     }
@@ -113,6 +117,11 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData,
     outputInfo.isErrorFrame = false;
     outputInfo.pictureCount = 1; // TODO: 2 pictures for interlaced video
 
+    // Only set framePitchInBytes if the game uses the newer struct version.
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.framePitchInBytes = frame->linesize[0];
+    }
+
     if (outputInfo.isValid) {
         OrbisVideodec2AvcPictureInfo pictureInfo = {};
 
@@ -140,11 +149,15 @@ s32 VdecDecoder::Decode(const OrbisVideodec2InputData& inputData,
 s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer,
                       OrbisVideodec2OutputInfo& outputInfo) {
     frameBuffer.isAccepted = false;
-    outputInfo.thisSize = sizeof(OrbisVideodec2OutputInfo);
     outputInfo.isValid = false;
     outputInfo.isErrorFrame = true;
     outputInfo.pictureCount = 0;
 
+    // Only set frameFormat if the game uses the newer struct version.
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.frameFormat = 0;
+    }
+
     AVFrame* frame = av_frame_alloc();
     if (!frame) {
         LOG_ERROR(Lib_Vdec2, "Failed to allocate frame");
@@ -182,6 +195,11 @@ s32 VdecDecoder::Flush(OrbisVideodec2FrameBuffer& frameBuffer,
     outputInfo.isErrorFrame = false;
     outputInfo.pictureCount = 1; // TODO: 2 pictures for interlaced video
 
+    // Only set framePitchInBytes if the game uses the newer struct version.
+    if (outputInfo.thisSize == sizeof(OrbisVideodec2OutputInfo)) {
+        outputInfo.framePitchInBytes = frame->linesize[0];
+    }
+
     // FIXME: Should we add picture info here too?
 }
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index e738f85a1..dad42347a 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -222,6 +222,7 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size,
     auto& area = CarveDmemArea(mapping_start, size)->second;
     area.memory_type = memory_type;
     area.is_free = false;
+    MergeAdjacent(dmem_map, dmem_area);
     return mapping_start;
 }
diff --git a/src/core/memory.h b/src/core/memory.h
index 68f9c26c4..6a9b29382 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -75,6 +75,9 @@ struct DirectMemoryArea {
         if (base + size != next.base) {
             return false;
         }
+        if (memory_type != next.memory_type) {
+            return false;
+        }
         if (is_free != next.is_free) {
             return false;
         }
diff --git a/src/core/signals.cpp b/src/core/signals.cpp
index e47a78cd2..4099ac237 100644
--- a/src/core/signals.cpp
+++ b/src/core/signals.cpp
@@ -11,6 +11,7 @@
 #include
 #else
 #include
+#include
 #ifdef ARCH_X86_64
 #include
 #endif
diff --git a/src/core/tls.cpp b/src/core/tls.cpp
index e13c683e1..0d1d514cf 100644
--- a/src/core/tls.cpp
+++ b/src/core/tls.cpp
@@ -51,7 +51,7 @@ Tcb* GetTcbBase() {
 
 // Apple x86_64
 // Reserve space in the 32-bit address range for allocating TCB pages.
-asm(".zerofill TCB_SPACE,TCB_SPACE,__guest_system,0x3FC000");
+asm(".zerofill TCB_SPACE,TCB_SPACE,__tcb_space,0x3FC000");
 
 struct LdtPage {
     void* tcb;
diff --git a/src/emulator.cpp b/src/emulator.cpp
index bb50b8686..f50147818 100644
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@@ -75,7 +75,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector<std::string> ar
         game_folder_name.ends_with("-UPDATE") || game_folder_name.ends_with("-patch")) {
         // If an executable was launched from a separate update directory,
         // use the base game directory as the game folder.
-        const auto base_name = game_folder_name.substr(0, game_folder_name.size() - 7);
+        const std::string base_name = game_folder_name.substr(0, game_folder_name.rfind('-'));
         const auto base_path = game_folder.parent_path() / base_name;
         if (std::filesystem::is_directory(base_path)) {
             game_folder = base_path;
diff --git a/src/main.cpp b/src/main.cpp
index 85581774b..8a251c55a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -35,17 +35,19 @@ int main(int argc, char* argv[]) {
     std::unordered_map<std::string, std::function<void(int&)>> arg_map = {
         {"-h",
          [&](int&) {
-             std::cout << "Usage: shadps4 [options] \n"
-                          "Options:\n"
-                          " -g, --game Specify game path to launch\n"
-                          " -- ... Parameters passed to the game ELF. "
-                          "Needs to be at the end of the line, and everything after \"--\" is a "
-                          "game argument.\n"
-                          " -p, --patch Apply specified patch file\n"
-                          " -f, --fullscreen Specify window initial fullscreen "
-                          "state. Does not overwrite the config file.\n"
-                          " --add-game-folder Adds a new game folder to the config.\n"
-                          " -h, --help Display this help message\n";
+             std::cout
+                 << "Usage: shadps4 [options] \n"
+                    "Options:\n"
+                    " -g, --game Specify game path to launch\n"
+                    " -- ... Parameters passed to the game ELF. "
+                    "Needs to be at the end of the line, and everything after \"--\" is a "
+                    "game argument.\n"
+                    " -p, --patch Apply specified patch file\n"
+                    " -i, --ignore-game-patch Disable automatic loading of game patch\n"
+                    " -f, --fullscreen Specify window initial fullscreen "
+                    "state. Does not overwrite the config file.\n"
+                    " --add-game-folder Adds a new game folder to the config.\n"
+                    " -h, --help Display this help message\n";
              exit(0);
          }},
         {"--help", [&](int& i) { arg_map["-h"](i); }},
@@ -72,6 +74,8 @@ int main(int argc, char* argv[]) {
             }
         }},
        {"--patch", [&](int& i) { arg_map["-p"](i); }},
+        {"-i", [&](int&) { Core::FileSys::MntPoints::ignore_game_patches = true; }},
+        {"--ignore-game-patch", [&](int& i) { arg_map["-i"](i); }},
         {"-f",
          [&](int& i) {
              if (++i >= argc) {
diff --git a/src/qt_gui/main.cpp b/src/qt_gui/main.cpp
index bd9dca6ce..b7de517e8 100644
--- a/src/qt_gui/main.cpp
+++ b/src/qt_gui/main.cpp
@@ -41,20 +41,22 @@ int main(int argc, char* argv[]) {
     std::unordered_map<std::string, std::function<void(int&)>> arg_map = {
         {"-h",
          [&](int&) {
-             std::cout << "Usage: shadps4 [options]\n"
-                          "Options:\n"
-                          " No arguments: Opens the GUI.\n"
-                          " -g, --game Specify or "
-                          " to launch\n"
-                          " -- ... Parameters passed to the game ELF. "
-                          "Needs to be at the end of the line, and everything after \"--\" is a "
-                          "game argument.\n"
-                          " -p, --patch Apply specified patch file\n"
-                          " -s, --show-gui Show the GUI\n"
-                          " -f, --fullscreen Specify window initial fullscreen "
-                          "state. Does not overwrite the config file.\n"
-                          " --add-game-folder Adds a new game folder to the config.\n"
-                          " -h, --help Display this help message\n";
+             std::cout
+                 << "Usage: shadps4 [options]\n"
+                    "Options:\n"
+                    " No arguments: Opens the GUI.\n"
+                    " -g, --game Specify or "
+                    " to launch\n"
+                    " -- ... Parameters passed to the game ELF. "
+                    "Needs to be at the end of the line, and everything after \"--\" is a "
+                    "game argument.\n"
+                    " -p, --patch Apply specified patch file\n"
+                    " -i, --ignore-game-patch Disable automatic loading of game patch\n"
+                    " -s, --show-gui Show the GUI\n"
+                    " -f, --fullscreen Specify window initial fullscreen "
+                    "state. Does not overwrite the config file.\n"
+                    " --add-game-folder Adds a new game folder to the config.\n"
+                    " -h, --help Display this help message\n";
              exit(0);
          }},
         {"--help", [&](int& i) { arg_map["-h"](i); }}, // Redirect --help to -h
@@ -84,6 +86,8 @@ int main(int argc, char* argv[]) {
             }
         }},
         {"--patch", [&](int& i) { arg_map["-p"](i); }},
+        {"-i", [&](int&) { Core::FileSys::MntPoints::ignore_game_patches = true; }},
+        {"--ignore-game-patch", [&](int& i) { arg_map["-i"](i); }},
         {"-f",
          [&](int& i) {
              if (++i >= argc) {
diff --git a/src/qt_gui/translations/ar_SA.ts b/src/qt_gui/translations/ar_SA.ts
index 26e768720..7d0c15e6b 100644
--- a/src/qt_gui/translations/ar_SA.ts
+++ b/src/qt_gui/translations/ar_SA.ts
@@ -2049,7 +2049,7 @@ Nightly: نُسخ تحتوي على أحدث الميزات، لكنها أقل
 
         <source>* Unsupported Vulkan Version</source>
-        <translation>* Unsupported Vulkan Version</translation>
+        <translation>نسخ Vulkan غير مدعومة</translation>
 
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index 37d7eea35..93fb81df4 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -303,7 +303,8 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct
         ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses);
         ctx.AddExtension("SPV_KHR_physical_storage_buffer");
     }
-    if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) {
+    const auto shared_type_count = std::popcount(static_cast<u32>(info.shared_types));
+    if (shared_type_count > 1 && profile.supports_workgroup_explicit_memory_layout) {
         ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout");
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR);
         ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR);
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index 13fd8e180..47290e7e8 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -27,6 +27,19 @@ Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value,
     });
 }
 
+Id SharedAtomicU32IncDec(EmitContext& ctx, Id offset,
+                         Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) {
+    const Id shift_id{ctx.ConstU32(2U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
+    const Id pointer{
+        ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)};
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics);
+    });
+}
+
 Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value,
                    Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const Id shift_id{ctx.ConstU32(3U)};
@@ -40,19 +53,6 @@ Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value,
     });
 }
 
-Id SharedAtomicU32_IncDec(EmitContext& ctx, Id offset,
-                          Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) {
-    const Id shift_id{ctx.ConstU32(2U)};
-    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
-    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)};
-    const Id pointer{
-        ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)};
-    const auto [scope, semantics]{AtomicArgs(ctx)};
-    return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] {
-        return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics);
-    });
-}
-
 Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
                    Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const auto& buffer = ctx.buffers[handle];
@@ -68,6 +68,21 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id
     });
 }
 
+Id BufferAtomicU32IncDec(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
+                         Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) {
+    const auto& buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset);
+    }
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u));
+    const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32];
+    const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index);
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] {
+        return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics);
+    });
+}
+
 Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
                           Id cmp_value,
                           Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id, Id, Id)) {
@@ -156,12 +171,12 @@ Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicISub);
 }
 
-Id EmitSharedAtomicIIncrement32(EmitContext& ctx, Id offset) {
-    return SharedAtomicU32_IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement);
+Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset) {
+    return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement);
 }
 
-Id EmitSharedAtomicIDecrement32(EmitContext& ctx, Id offset) {
-    return SharedAtomicU32_IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement);
+Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset) {
+    return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement);
 }
 
 Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
@@ -172,6 +187,10 @@ Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd);
 }
 
+Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
+    return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicISub);
+}
+
 Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin);
 }
@@ -188,14 +207,12 @@ Id EmitBufferAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicUMax);
 }
 
-Id EmitBufferAtomicInc32(EmitContext&, IR::Inst*, u32, Id, Id) {
-    // TODO
-    UNREACHABLE_MSG("Unsupported BUFFER_ATOMIC opcode: ", IR::Opcode::BufferAtomicInc32);
+Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    return BufferAtomicU32IncDec(ctx, inst, handle, address, &Sirit::Module::OpAtomicIIncrement);
 }
 
-Id EmitBufferAtomicDec32(EmitContext&, IR::Inst*, u32, Id, Id) {
-    // TODO
-    UNREACHABLE_MSG("Unsupported BUFFER_ATOMIC opcode: ", IR::Opcode::BufferAtomicDec32);
+Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    return BufferAtomicU32IncDec(ctx, inst, handle, address, &Sirit::Module::OpAtomicIDecrement);
 }
 
 Id EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
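Editor's note on the inc/dec helpers above: SPIR-V's `OpAtomicIIncrement`/`OpAtomicIDecrement` take no value operand (only pointer, scope, and semantics), unlike `OpAtomicIAdd`/`OpAtomicISub`, which is why the helpers select the atomic through a four-argument member-function pointer and why the `value` parameter disappears from the emit signatures. A self-contained mock of the two call shapes; `Module` here is a stand-in, not Sirit:

```cpp
#include <cstdint>
using Id = std::uint32_t;

// Mock of the two atomic call shapes the emitter distinguishes. In SPIR-V,
// OpAtomicIIncrement/OpAtomicIDecrement carry no data operand.
struct Module {
    Id OpAtomicIAdd(Id type, Id ptr, Id scope, Id semantics, Id value) { return value; }
    Id OpAtomicIIncrement(Id type, Id ptr, Id scope, Id semantics) { return ptr; }
};

// The helpers select the overload family via member-function pointers:
using WithValue = Id (Module::*)(Id, Id, Id, Id, Id);
using IncDec = Id (Module::*)(Id, Id, Id, Id);

int main() {
    Module m;
    WithValue add = &Module::OpAtomicIAdd;
    IncDec inc = &Module::OpAtomicIIncrement;
    (m.*add)(0, 1, 2, 3, 42);
    (m.*inc)(0, 1, 2, 3); // no data operand to pass
}
```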
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h
index 41e70c8c3..e66467c6b 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h
@@ -1,31 +1,54 @@
 // SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
-#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
+#pragma once
+
 #include "shader_recompiler/backend/spirv/spirv_emit_context.h"
 
 namespace Shader::Backend::SPIRV {
 
-template <u32 bit_size>
-auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) {
-    Id zero_value{};
+template <u32 bit_size, u32 num_components = 1, bool is_float = false>
+std::tuple<Id, Id> ResolveTypeAndZero(EmitContext& ctx) {
     Id result_type{};
-    if constexpr (bit_size == 64) {
-        zero_value = ctx.u64_zero_value;
+    Id zero_value{};
+    if constexpr (bit_size == 64 && num_components == 1 && !is_float) {
         result_type = ctx.U64;
+        zero_value = ctx.u64_zero_value;
     } else if constexpr (bit_size == 32) {
-        zero_value = ctx.u32_zero_value;
-        result_type = ctx.U32[1];
-    } else if constexpr (bit_size == 16) {
-        zero_value = ctx.u16_zero_value;
+        if (is_float) {
+            result_type = ctx.F32[num_components];
+            zero_value = ctx.f32_zero_value;
+        } else {
+            result_type = ctx.U32[num_components];
+            zero_value = ctx.u32_zero_value;
+        }
+    } else if constexpr (bit_size == 16 && num_components == 1 && !is_float) {
         result_type = ctx.U16;
+        zero_value = ctx.u16_zero_value;
+    } else if constexpr (bit_size == 8 && num_components == 1 && !is_float) {
+        result_type = ctx.U8;
+        zero_value = ctx.u8_zero_value;
     } else {
-        static_assert(false, "type not supported");
+        static_assert(false, "Type not supported.");
     }
+    if (num_components > 1) {
+        std::array<Id, num_components> zero_ids;
+        zero_ids.fill(zero_value);
+        zero_value = ctx.ConstantComposite(result_type, zero_ids);
+    }
+    return {result_type, zero_value};
+}
+
+template <u32 bit_size, u32 num_components = 1, bool is_float = false>
+auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) {
     if (Sirit::ValidId(buffer_size)) {
         // Bounds checking enabled, wrap in a conditional branch to make sure that
         // the atomic is not mistakenly executed when the index is out of bounds.
-        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size);
+        auto compare_index = index;
+        if (num_components > 1) {
+            compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1));
+        }
+        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size);
         const Id ib_label = ctx.OpLabel();
         const Id end_label = ctx.OpLabel();
         ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone);
@@ -36,6 +59,8 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun
         ctx.OpBranch(end_label);
         ctx.AddLabel(end_label);
         if (Sirit::ValidId(ib_result)) {
+            const auto [result_type, zero_value] =
+                ResolveTypeAndZero<bit_size, num_components, is_float>(ctx);
             return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, last_label);
         } else {
             return Id{0};
@@ -45,4 +70,21 @@ auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_fun
     return emit_func();
 }
 
+template <u32 bit_size, u32 num_components = 1, bool is_float = false>
+static Id LoadAccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result) {
+    if (Sirit::ValidId(buffer_size)) {
+        // Bounds checking enabled, wrap in a select.
+        auto compare_index = index;
+        if (num_components > 1) {
+            compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(num_components - 1));
+        }
+        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size);
+        const auto [result_type, zero_value] =
+            ResolveTypeAndZero<bit_size, num_components, is_float>(ctx);
+        return ctx.OpSelect(result_type, in_bounds, result, zero_value);
+    }
+    // Bounds checking not enabled, just return the plain value.
+    return result;
+}
+
 } // namespace Shader::Backend::SPIRV
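Editor's note: the header above now parameterizes the guard on bit width, component count, and float-ness, so loads (an `OpSelect` against zero), stores, and atomics (a conditional branch plus `OpPhi`) all share one bounds-check family. A self-contained C++ analogue of `ResolveTypeAndZero`'s compile-time dispatch, with `std::` types standing in for the SPIR-V `Id`s:

```cpp
#include <array>
#include <cstdint>

// Analogue of ResolveTypeAndZero: pick a scalar type from <bit_size, is_float>,
// splatting a zero-filled composite when num_components > 1.
template <unsigned bit_size, unsigned num_components = 1, bool is_float = false>
auto ResolveZero() {
    if constexpr (bit_size == 32 && is_float) {
        return std::array<float, num_components>{}; // zero-filled composite
    } else if constexpr (bit_size == 32) {
        return std::array<std::uint32_t, num_components>{};
    } else if constexpr (bit_size == 64 && num_components == 1 && !is_float) {
        return std::uint64_t{0};
    } else if constexpr (bit_size == 16 && num_components == 1 && !is_float) {
        return std::uint16_t{0};
    } else if constexpr (bit_size == 8 && num_components == 1 && !is_float) {
        return std::uint8_t{0};
    } else {
        static_assert(bit_size == 0, "type not supported");
    }
}

// Mirrors the call sites in this PR: AccessBoundsCheck<8>, <16>, <64>, and
// <32, N, alias == PointerType::F32> all funnel through one resolver.
auto z1 = ResolveZero<64>();          // u64 scalar zero
auto z2 = ResolveZero<32, 4, true>(); // vec4 of f32 zeros
```

When the index is out of range, the zero value is what the guarded load or atomic ultimately yields.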
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 658d4759f..ccbe54d0a 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -11,6 +11,8 @@
 
 #include
 
+#include "emit_spirv_bounds.h"
+
 namespace Shader::Backend::SPIRV {
 namespace {
 
@@ -239,8 +241,8 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) {
     }
 
     if (IR::IsParam(attr)) {
-        const u32 index{u32(attr) - u32(IR::Attribute::Param0)};
-        const auto& param{ctx.input_params.at(index)};
+        const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)};
+        const auto& param{ctx.input_params.at(param_index)};
         if (param.buffer_handle >= 0) {
             const auto step_rate = EmitReadStepRate(ctx, param.id.value);
             const auto offset = ctx.OpIAdd(
@@ -415,27 +417,6 @@ void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value) {
     ctx.OpStore(pointer, value);
 }
 
-template <u32 N>
-static Id EmitLoadBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, Id result,
-                                    bool is_float) {
-    if (Sirit::ValidId(buffer_size)) {
-        // Bounds checking enabled, wrap in a select.
-        const auto result_type = is_float ? ctx.F32[N] : ctx.U32[N];
-        auto compare_index = index;
-        auto zero_value = is_float ? ctx.f32_zero_value : ctx.u32_zero_value;
-        if (N > 1) {
-            compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1));
-            std::array<Id, N> zero_ids;
-            zero_ids.fill(zero_value);
-            zero_value = ctx.ConstantComposite(result_type, zero_ids);
-        }
-        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size);
-        return ctx.OpSelect(result_type, in_bounds, result, zero_value);
-    }
-    // Bounds checking not enabled, just return the plain value.
-    return result;
-}
-
 template <u32 N, PointerType alias>
 static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     const auto flags = inst->Flags<IR::BufferInstInfo>();
@@ -454,8 +435,9 @@ static Id EmitLoadBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a
         const Id result_i = ctx.OpLoad(data_types[1], ptr_i);
         if (!flags.typed) {
             // Untyped loads have bounds checking per-component.
-            ids.push_back(EmitLoadBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords,
-                                                       result_i, alias == PointerType::F32));
+            ids.push_back(LoadAccessBoundsCheck < 32, 1,
+                          alias ==
+                              PointerType::F32 > (ctx, index_i, spv_buffer.size_dwords, result_i));
         } else {
             ids.push_back(result_i);
         }
     }
     const Id result = N == 1 ? ids[0] : ctx.OpCompositeConstruct(data_types[N], ids);
     if (flags.typed) {
         // Typed loads have single bounds check for the whole load.
-        return EmitLoadBufferBoundsCheck<N>(ctx, index, spv_buffer.size_dwords, result,
-                                            alias == PointerType::F32);
+        return LoadAccessBoundsCheck < 32, N,
+               alias == PointerType::F32 > (ctx, index, spv_buffer.size_dwords, result);
     }
     return result;
 }
@@ -477,8 +459,8 @@ Id EmitLoadBufferU8(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     }
     const auto [id, pointer_type] = spv_buffer[PointerType::U8];
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
-    const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U8, ptr))};
-    return EmitLoadBufferBoundsCheck<1>(ctx, address, spv_buffer.size, result, false);
+    const Id result{ctx.OpLoad(ctx.U8, ptr)};
+    return LoadAccessBoundsCheck<8>(ctx, address, spv_buffer.size, result);
 }
 
 Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
@@ -489,8 +471,8 @@ Id EmitLoadBufferU16(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     const auto [id, pointer_type] = spv_buffer[PointerType::U16];
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
-    const Id result{ctx.OpUConvert(ctx.U32[1], ctx.OpLoad(ctx.U16, ptr))};
-    return EmitLoadBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts, result, false);
+    const Id result{ctx.OpLoad(ctx.U16, ptr)};
+    return LoadAccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, result);
 }
 
 Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
@@ -509,6 +491,18 @@ Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address)
     return EmitLoadBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address);
 }
 
+Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
+    const auto [id, pointer_type] = spv_buffer[PointerType::U64];
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u));
+    const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)};
+    const Id result{ctx.OpLoad(ctx.U64, ptr)};
+    return LoadAccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, result);
+}
+
 Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) {
     return EmitLoadBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address);
 }
@@ -529,29 +523,6 @@ Id EmitLoadBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addr
     UNREACHABLE_MSG("SPIR-V instruction");
 }
 
-template <u32 N>
-void EmitStoreBufferBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) {
-    if (Sirit::ValidId(buffer_size)) {
-        // Bounds checking enabled, wrap in a conditional branch.
-        auto compare_index = index;
-        if (N > 1) {
-            compare_index = ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(N - 1));
-        }
-        const Id in_bounds = ctx.OpULessThan(ctx.U1[1], compare_index, buffer_size);
-        const Id in_bounds_label = ctx.OpLabel();
-        const Id merge_label = ctx.OpLabel();
-        ctx.OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
-        ctx.OpBranchConditional(in_bounds, in_bounds_label, merge_label);
-        ctx.AddLabel(in_bounds_label);
-        emit_func();
-        ctx.OpBranch(merge_label);
-        ctx.AddLabel(merge_label);
-        return;
-    }
-    // Bounds checking not enabled, just perform the store.
-    emit_func();
-}
-
 template <u32 N, PointerType alias>
 static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
                                  Id value) {
@@ -569,19 +540,25 @@ static void EmitStoreBufferB32xN(EmitContext& ctx, IR::Inst* inst, u32 handle, I
             const Id index_i = i == 0 ? index : ctx.OpIAdd(ctx.U32[1], index, ctx.ConstU32(i));
             const Id ptr_i = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index_i);
             const Id value_i = N == 1 ? value : ctx.OpCompositeExtract(data_types[1], value, i);
-            auto store_i = [&]() { ctx.OpStore(ptr_i, value_i); };
+            auto store_i = [&] {
+                ctx.OpStore(ptr_i, value_i);
+                return Id{};
+            };
             if (!flags.typed) {
                 // Untyped stores have bounds checking per-component.
-                EmitStoreBufferBoundsCheck<1>(ctx, index_i, spv_buffer.size_dwords, store_i);
+                AccessBoundsCheck<32, 1, alias == PointerType::F32>(
+                    ctx, index_i, spv_buffer.size_dwords, store_i);
             } else {
                 store_i();
             }
         }
+        return Id{};
     };
 
     if (flags.typed) {
         // Typed stores have single bounds check for the whole store.
-        EmitStoreBufferBoundsCheck<N>(ctx, index, spv_buffer.size_dwords, store);
+        AccessBoundsCheck<32, N, alias == PointerType::F32>(ctx, index, spv_buffer.size_dwords,
+                                                            store);
     } else {
         store();
     }
@@ -594,8 +571,10 @@ void EmitStoreBufferU8(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id v
     }
     const auto [id, pointer_type] = spv_buffer[PointerType::U8];
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, address)};
-    const Id result{ctx.OpUConvert(ctx.U8, value)};
-    EmitStoreBufferBoundsCheck<1>(ctx, address, spv_buffer.size, [&] { ctx.OpStore(ptr, result); });
+    AccessBoundsCheck<8>(ctx, address, spv_buffer.size, [&] {
+        ctx.OpStore(ptr, value);
+        return Id{};
+    });
 }
 
 void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
@@ -606,9 +585,10 @@ void EmitStoreBufferU16(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id
     const auto [id, pointer_type] = spv_buffer[PointerType::U16];
     const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(1u));
     const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)};
-    const Id result{ctx.OpUConvert(ctx.U16, value)};
-    EmitStoreBufferBoundsCheck<1>(ctx, index, spv_buffer.size_shorts,
-                                  [&] { ctx.OpStore(ptr, result); });
+    AccessBoundsCheck<16>(ctx, index, spv_buffer.size_shorts, [&] {
+        ctx.OpStore(ptr, value);
+        return Id{};
+    });
 }
 
 void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
@@ -627,6 +607,20 @@ void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
     EmitStoreBufferB32xN<4, PointerType::U32>(ctx, inst, handle, address, value);
 }
 
+void EmitStoreBufferU64(EmitContext& ctx, IR::Inst*, u32 handle, Id address, Id value) {
+    const auto& spv_buffer = ctx.buffers[handle];
+    if (Sirit::ValidId(spv_buffer.offset)) {
+        address = ctx.OpIAdd(ctx.U32[1], address, spv_buffer.offset);
+    }
+    const auto [id, pointer_type] = spv_buffer[PointerType::U64];
+    const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u));
+    const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u64_zero_value, index)};
+    AccessBoundsCheck<64>(ctx, index, spv_buffer.size_qwords, [&] {
+        ctx.OpStore(ptr, value);
+        return Id{};
+    });
+}
+
 void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
     EmitStoreBufferB32xN<1, PointerType::F32>(ctx, inst, handle, address, value);
 }
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
index 945fa6877..c75f43393 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
@@ -263,4 +263,12 @@ Id EmitConvertU32U16(EmitContext& ctx, Id value) {
     return ctx.OpUConvert(ctx.U32[1], value);
 }
 
+Id EmitConvertU8U32(EmitContext& ctx, Id value) {
+    return ctx.OpUConvert(ctx.U8, value);
+}
+
+Id EmitConvertU32U8(EmitContext& ctx, Id value) {
+    return ctx.OpUConvert(ctx.U32[1], value);
+}
+
 } // namespace Shader::Backend::SPIRV
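Editor's note on the two changes above: buffer loads of 8/16-bit values now produce genuinely narrow results instead of being widened to `U32` inside the load emitter, and the widening/truncation becomes an explicit conversion instruction that later IR passes can see and fold. A self-contained model of what the new conversion pair computes (plain C++ stand-ins, not emulator API):

```cpp
#include <cassert>
#include <cstdint>

// LoadBufferU8 now yields a u8; a separate ConvertU32U8 zero-extends it where a
// 32-bit consumer needs it (previously the OpUConvert was fused into the load).
std::uint32_t ConvertU32U8(std::uint8_t v) { return v; } // zero-extend
std::uint8_t ConvertU8U32(std::uint32_t v) { return static_cast<std::uint8_t>(v); } // truncate

int main() {
    std::uint8_t loaded = 0xAB;             // value produced by LoadBufferU8
    assert(ConvertU32U8(loaded) == 0xABu);  // widened only where actually used
    assert(ConvertU8U32(0x1234u) == 0x34);  // stores narrow back to 8 bits
}
```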
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
index 3441c5a23..daf1b973e 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h
@@ -69,6 +69,7 @@ Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+Id EmitLoadBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
@@ -80,6 +81,7 @@ void EmitStoreBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address
 void EmitStoreBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferU32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferU32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+void EmitStoreBufferU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
@@ -87,12 +89,13 @@ void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre
 void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+Id EmitBufferAtomicISub32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicUMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
-Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
-Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
+Id EmitBufferAtomicInc32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
+Id EmitBufferAtomicDec32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
 Id EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
 Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value);
@@ -136,8 +139,8 @@ Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value);
 Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value);
-Id EmitSharedAtomicIIncrement32(EmitContext& ctx, Id offset);
-Id EmitSharedAtomicIDecrement32(EmitContext& ctx, Id offset);
+Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset);
+Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset);
 Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value);
 
 Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2);
@@ -461,6 +464,8 @@ Id EmitConvertF64U32(EmitContext& ctx, Id value);
 Id EmitConvertF64U64(EmitContext& ctx, Id value);
 Id EmitConvertU16U32(EmitContext& ctx, Id value);
 Id EmitConvertU32U16(EmitContext& ctx, Id value);
+Id EmitConvertU8U32(EmitContext& ctx, Id value);
+Id EmitConvertU32U8(EmitContext& ctx, Id value);
 
 Id EmitImageSampleRaw(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address1, Id address2,
                       Id address3, Id address4);
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 672856397..0a8f78f72 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -299,8 +299,7 @@ void EmitContext::DefineInterpolatedAttribs() {
     // Iterate all input attributes, load them and manually interpolate.
     for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) {
         const auto& input = runtime_info.fs_info.inputs[i];
-        const u32 semantic = input.param_index;
-        auto& params = input_params[semantic];
+        auto& params = input_params[i];
         if (input.is_flat || params.is_loaded) {
             continue;
         }
@@ -318,7 +317,7 @@ void EmitContext::DefineInterpolatedAttribs() {
         const Id p10_y{OpVectorTimesScalar(F32[4], p10, bary_coord_y)};
         const Id p20_z{OpVectorTimesScalar(F32[4], p20, bary_coord_z)};
         params.id = OpFAdd(F32[4], p0, OpFAdd(F32[4], p10_y, p20_z));
-        Name(params.id, fmt::format("fs_in_attr{}", semantic));
+        Name(params.id, fmt::format("fs_in_attr{}", i));
         params.is_loaded = true;
     }
 }
@@ -427,25 +426,28 @@ void EmitContext::DefineInputs() {
         }
         for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) {
             const auto& input = runtime_info.fs_info.inputs[i];
-            const u32 semantic = input.param_index;
-            ASSERT(semantic < IR::NumParams);
             if (input.IsDefault()) {
-                input_params[semantic] = {
-                    MakeDefaultValue(*this, input.default_value), input_f32, F32[1], 4, false, true,
+                input_params[i] = {
+                    .id = MakeDefaultValue(*this, input.default_value),
+                    .pointer_type = input_f32,
+                    .component_type = F32[1],
+                    .num_components = 4,
+                    .is_integer = false,
+                    .is_loaded = true,
                 };
                 continue;
             }
-            const IR::Attribute param{IR::Attribute::Param0 + input.param_index};
+            const IR::Attribute param{IR::Attribute::Param0 + i};
             const u32 num_components = info.loads.NumComponents(param);
             const Id type{F32[num_components]};
             Id attr_id{};
             if (profile.needs_manual_interpolation && !input.is_flat) {
-                attr_id = DefineInput(TypeArray(type, ConstU32(3U)), semantic);
+                attr_id = DefineInput(TypeArray(type, ConstU32(3U)), input.param_index);
                 Decorate(attr_id, spv::Decoration::PerVertexKHR);
-                Name(attr_id, fmt::format("fs_in_attr{}_p", semantic));
+                Name(attr_id, fmt::format("fs_in_attr{}_p", i));
             } else {
-                attr_id = DefineInput(type, semantic);
-                Name(attr_id, fmt::format("fs_in_attr{}", semantic));
+                attr_id = DefineInput(type, input.param_index);
+                Name(attr_id, fmt::format("fs_in_attr{}", i));
                 if (input.is_flat) {
                     Decorate(attr_id, spv::Decoration::Flat);
@@ -453,7 +455,7 @@ void EmitContext::DefineInputs() {
                     Decorate(attr_id, spv::Decoration::NoPerspective);
                 }
             }
-            input_params[semantic] =
+            input_params[i] =
                 GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components, false);
         }
         break;
@@ -977,32 +979,46 @@ void EmitContext::DefineImagesAndSamplers() {
 }
 
 void EmitContext::DefineSharedMemory() {
-    if (!info.uses_shared) {
+    const auto num_types = std::popcount(static_cast<u32>(info.shared_types));
+    if (num_types == 0) {
         return;
     }
     ASSERT(info.stage == Stage::Compute);
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    const auto make_type = [&](Id element_type, u32 element_size) {
+    const auto make_type = [&](IR::Type type, Id element_type, u32 element_size,
+                               std::string_view name) {
+        if (False(info.shared_types & type)) {
+            // Skip unused shared memory types.
+            return std::make_tuple(Id{}, Id{}, Id{});
+        }
+
         const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)};
         const Id array_type{TypeArray(element_type, ConstU32(num_elements))};
         Decorate(array_type, spv::Decoration::ArrayStride, element_size);
 
         const Id struct_type{TypeStruct(array_type)};
         MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u);
-        Decorate(struct_type, spv::Decoration::Block);
 
         const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type);
         const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type);
         const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup);
-        Decorate(variable, spv::Decoration::Aliased);
+        Name(variable, name);
         interfaces.push_back(variable);
 
+        if (num_types > 1) {
+            Decorate(struct_type, spv::Decoration::Block);
+            Decorate(variable, spv::Decoration::Aliased);
+        }
+
         return std::make_tuple(variable, element_pointer, pointer);
     };
-    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u);
-    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u);
-    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u);
+    std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) =
+        make_type(IR::Type::U16, U16, 2u, "shared_mem_u16");
+    std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) =
+        make_type(IR::Type::U32, U32[1], 4u, "shared_mem_u32");
+    std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) =
+        make_type(IR::Type::U64, U64, 8u, "shared_mem_u64");
 }
 
 Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) {
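Editor's note on `DefineSharedMemory` above: the boolean `Info::uses_shared` becomes `IR::Type shared_types`, a flag set of the element widths a shader actually touches. One workgroup array is materialized per used width, and only when more than one width coexists do the arrays need `Block`/`Aliased` decorations and the `SPV_KHR_workgroup_memory_explicit_layout` extension (the `popcount > 1` test in `emit_spirv.cpp` earlier). A self-contained sketch of the accounting, assuming a bitmask-style flag enum as a model of `IR::Type`:

```cpp
#include <bit>
#include <cstdint>

// Assumed flag-enum model of IR::Type for illustration; the real enum lives in
// the shader recompiler and is accumulated during info collection.
enum class Type : std::uint32_t { U16 = 1 << 0, U32 = 1 << 1, U64 = 1 << 2 };
constexpr Type operator|(Type a, Type b) {
    return static_cast<Type>(static_cast<std::uint32_t>(a) | static_cast<std::uint32_t>(b));
}

int main() {
    Type shared_types = Type::U32 | Type::U64; // collected from DS_* instructions
    const int num_types = std::popcount(static_cast<std::uint32_t>(shared_types));
    // One width -> plain Workgroup array; several widths -> aliased arrays that
    // require WorkgroupMemoryExplicitLayoutKHR, exactly the popcount > 1 test.
    return num_types > 1 ? 0 : 1;
}
```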
diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index 4b6a58fd0..8ead93f78 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -216,34 +216,38 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
     if (is_pair) {
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
-        if (bit_size == 32) {
-            ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
-        } else {
+        if (bit_size == 64) {
             ir.WriteShared(64,
                            ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0),
                                                                  ir.GetVectorReg(data0 + 1))),
                            addr0);
+        } else if (bit_size == 32) {
+            ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
+        } else if (bit_size == 16) {
+            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0);
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
-        if (bit_size == 32) {
-            ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
-        } else {
+        if (bit_size == 64) {
             ir.WriteShared(64,
                            ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1),
                                                                  ir.GetVectorReg(data1 + 1))),
                            addr1);
+        } else if (bit_size == 32) {
+            ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
+        } else if (bit_size == 16) {
+            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1);
         }
-    } else if (bit_size == 64) {
-        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
-        const IR::Value data =
-            ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
-        ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0);
-    } else if (bit_size == 16) {
-        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
-        ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
     } else {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
-        ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
+        if (bit_size == 64) {
+            const IR::Value data =
+                ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
+            ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0);
+        } else if (bit_size == 32) {
+            ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
+        } else if (bit_size == 16) {
+            ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0);
+        }
     }
 }
@@ -264,7 +268,7 @@ void Translator::DS_INC_U32(const GcnInst& inst, bool rtn) {
     const IR::U32 offset =
         ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
     const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIIncrement(addr_offset);
+    const IR::Value original_val = ir.SharedAtomicInc(addr_offset);
     if (rtn) {
         SetDst(inst.dst[0], IR::U32{original_val});
     }
@@ -275,7 +279,7 @@ void Translator::DS_DEC_U32(const GcnInst& inst, bool rtn) {
     const IR::U32 offset =
         ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
     const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIDecrement(addr_offset);
+    const IR::Value original_val = ir.SharedAtomicDec(addr_offset);
     if (rtn) {
         SetDst(inst.dst[0], IR::U32{original_val});
     }
@@ -309,36 +313,38 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride
         const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
         const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0);
-        if (bit_size == 32) {
-            ir.SetVectorReg(dst_reg++, IR::U32{data0});
-        } else {
+        if (bit_size == 64) {
             const auto vector = ir.UnpackUint2x32(IR::U64{data0});
             ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
             ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg++, IR::U32{data0});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})});
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
         const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1);
-        if (bit_size == 32) {
-            ir.SetVectorReg(dst_reg++, IR::U32{data1});
-        } else {
+        if (bit_size == 64) {
             const auto vector = ir.UnpackUint2x32(IR::U64{data1});
             ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
             ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg++, IR::U32{data1});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})});
         }
-    } else if (bit_size == 64) {
-        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
-        const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
-        const auto vector = ir.UnpackUint2x32(IR::U64{data});
-        ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
-        ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
-    } else if (bit_size == 16) {
-        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
-        const IR::U16 data = IR::U16{ir.LoadShared(bit_size, is_signed, addr0)};
-        ir.SetVectorReg(dst_reg, ir.UConvert(32, data));
     } else {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
-        const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)};
-        ir.SetVectorReg(dst_reg, data);
+        const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0);
+        if (bit_size == 64) {
+            const auto vector = ir.UnpackUint2x32(IR::U64{data});
+            ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg, IR::U32{data});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})});
+        }
     }
 }
Translator::V_INTERP_MOV_F32(const GcnInst& inst) {
-    const auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr);
-    const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index};
+    const u32 attr_index = inst.control.vintrp.attr;
+    const auto& attr = runtime_info.fs_info.inputs.at(attr_index);
+    const IR::Attribute attrib{IR::Attribute::Param0 + attr_index};
     SetDst(inst.dst[0], ir.GetAttribute(attrib, inst.control.vintrp.chan));
 }
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index 5eb2079a4..54e8b8ee8 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -354,9 +354,9 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
     case AtomicOp::Xor:
         return ir.BufferAtomicXor(handle, address, vdata_val, buffer_info);
     case AtomicOp::Inc:
-        return ir.BufferAtomicInc(handle, address, vdata_val, buffer_info);
+        return ir.BufferAtomicInc(handle, address, buffer_info);
     case AtomicOp::Dec:
-        return ir.BufferAtomicDec(handle, address, vdata_val, buffer_info);
+        return ir.BufferAtomicDec(handle, address, buffer_info);
     default:
         UNREACHABLE();
     }
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index e14c7988d..f25111350 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -214,7 +214,7 @@ struct Info {
     bool uses_lane_id{};
     bool uses_group_quad{};
     bool uses_group_ballot{};
-    bool uses_shared{};
+    IR::Type shared_types{};
     bool uses_fp16{};
     bool uses_fp64{};
     bool uses_pack_10_11_11{};
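Note: `Info::shared_types` replaces the old `uses_shared` boolean so later passes can see exactly which LDS access widths a shader uses, and the Inc/Dec atomics above lose their data operand because an atomic increment/decrement only needs an address. A minimal sketch of how a pass might query the new mask (illustrative only; `True` is the emulator's existing enum-flag helper):

    // Hypothetical consumer of Info::shared_types:
    const bool needs_u16 = True(info.shared_types & IR::Type::U16);
    const bool u32_only = info.shared_types == IR::Type::U32;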
diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp
index 2c37c8099..3d7cf71dc 100644
--- a/src/shader_recompiler/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/ir/ir_emitter.cpp
@@ -353,12 +353,12 @@ U32 IREmitter::SharedAtomicXor(const U32& address, const U32& data) {
     return Inst<U32>(Opcode::SharedAtomicXor32, address, data);
 }
 
-U32 IREmitter::SharedAtomicIIncrement(const U32& address) {
-    return Inst<U32>(Opcode::SharedAtomicIIncrement32, address);
+U32 IREmitter::SharedAtomicInc(const U32& address) {
+    return Inst<U32>(Opcode::SharedAtomicInc32, address);
 }
 
-U32 IREmitter::SharedAtomicIDecrement(const U32& address) {
-    return Inst<U32>(Opcode::SharedAtomicIDecrement32, address);
+U32 IREmitter::SharedAtomicDec(const U32& address) {
+    return Inst<U32>(Opcode::SharedAtomicDec32, address);
 }
 
 U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) {
@@ -373,12 +373,12 @@ U32 IREmitter::ReadConstBuffer(const Value& handle, const U32& index) {
     return Inst<U32>(Opcode::ReadConstBuffer, handle, index);
 }
 
-U32 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) {
-    return Inst<U32>(Opcode::LoadBufferU8, Flags{info}, handle, address);
+U8 IREmitter::LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info) {
+    return Inst<U8>(Opcode::LoadBufferU8, Flags{info}, handle, address);
 }
 
-U32 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) {
-    return Inst<U32>(Opcode::LoadBufferU16, Flags{info}, handle, address);
+U16 IREmitter::LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info) {
+    return Inst<U16>(Opcode::LoadBufferU16, Flags{info}, handle, address);
 }
 
 Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value& address,
@@ -397,6 +397,10 @@ Value IREmitter::LoadBufferU32(int num_dwords, const Value& handle, const Value&
     }
 }
 
+U64 IREmitter::LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info) {
+    return Inst<U64>(Opcode::LoadBufferU64, Flags{info}, handle, address);
+}
+
 Value IREmitter::LoadBufferF32(int num_dwords, const Value& handle, const Value& address,
                                BufferInstInfo info) {
     switch (num_dwords) {
@@ -417,12 +421,12 @@ Value IREmitter::LoadBufferFormat(const Value& handle, const Value& address, Buf
     return Inst(Opcode::LoadBufferFormatF32, Flags{info}, handle, address);
 }
 
-void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U32& data,
+void IREmitter::StoreBufferU8(const Value& handle, const Value& address, const U8& data,
                               BufferInstInfo info) {
     Inst(Opcode::StoreBufferU8, Flags{info}, handle, address, data);
 }
 
-void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U32& data,
+void IREmitter::StoreBufferU16(const Value& handle, const Value& address, const U16& data,
                                BufferInstInfo info) {
     Inst(Opcode::StoreBufferU16, Flags{info}, handle, address, data);
 }
@@ -447,6 +451,11 @@ void IREmitter::StoreBufferU32(int num_dwords, const Value& handle, const Value&
     }
 }
 
+void IREmitter::StoreBufferU64(const Value& handle, const Value& address, const U64& data,
+                               BufferInstInfo info) {
+    Inst(Opcode::StoreBufferU64, Flags{info}, handle, address, data);
+}
+
 void IREmitter::StoreBufferF32(int num_dwords, const Value& handle, const Value& address,
                                const Value& data, BufferInstInfo info) {
     switch (num_dwords) {
@@ -474,7 +483,19 @@ void IREmitter::StoreBufferFormat(const Value& handle, const Value& address, con
 
 Value IREmitter::BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value,
                                   BufferInstInfo info) {
-    return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value);
+    switch (value.Type()) {
+    case Type::U32:
+        return Inst(Opcode::BufferAtomicIAdd32, Flags{info}, handle, address, value);
+    case Type::U64:
+        return Inst(Opcode::BufferAtomicIAdd64, Flags{info}, handle, address, value);
+    default:
+        ThrowInvalidType(value.Type());
+    }
+}
+
+Value IREmitter::BufferAtomicISub(const Value& handle, const Value& address, const Value& value,
+                                  BufferInstInfo info) {
+    return Inst(Opcode::BufferAtomicISub32, Flags{info}, handle, address, value);
 }
 
 Value IREmitter::BufferAtomicIMin(const Value& handle, const Value& address, const Value& value,
@@ -489,14 +510,12 @@ Value IREmitter::BufferAtomicIMax(const Value& handle, const Value& address, con
                      : Inst(Opcode::BufferAtomicUMax32, Flags{info}, handle, address, value);
 }
 
-Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, const Value& value,
-                                 BufferInstInfo info) {
-    return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address, value);
+Value IREmitter::BufferAtomicInc(const Value& handle, const Value& address, BufferInstInfo info) {
+    return Inst(Opcode::BufferAtomicInc32, Flags{info}, handle, address);
 }
 
-Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, const Value& value,
-                                 BufferInstInfo info) {
-    return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address, value);
+Value IREmitter::BufferAtomicDec(const Value& handle, const Value& address, BufferInstInfo info) {
+    return Inst(Opcode::BufferAtomicDec32, Flags{info}, handle, address);
 }
 
 Value IREmitter::BufferAtomicAnd(const Value& handle, const Value& address, const Value& value,
@@ -1804,8 +1823,15 @@ F32F64 IREmitter::ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_s
                      : ConvertUToF(dest_bitsize, src_bitsize, value);
 }
 
-U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) {
+U8U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U8U16U32U64& value) {
     switch (result_bitsize) {
+    case 8:
+        switch (value.Type()) {
+        case Type::U32:
+            return Inst<U8>(Opcode::ConvertU8U32, value);
+        default:
+            break;
+        }
     case 16:
         switch (value.Type()) {
         case Type::U32:
@@ -1815,6 +1841,8 @@ U16U32U64 IREmitter::UConvert(size_t result_bitsize, const U16U32U64& value) {
         }
     case 32:
         switch (value.Type()) {
+        case Type::U8:
+            return Inst<U32>(Opcode::ConvertU32U8, value);
         case Type::U16:
             return Inst<U32>(Opcode::ConvertU32U16, value);
         default:
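Note: `UConvert` only performs unsigned width changes, so the two new U8 cases amount to a truncation and a zero-extension. A scalar model of what the added opcodes compute (illustrative only, not emitter code):

    u8 convert_u8_u32(u32 v) { return static_cast<u8>(v); }   // ConvertU8U32: truncate
    u32 convert_u32_u8(u8 v) { return static_cast<u32>(v); }  // ConvertU32U8: zero-extend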
diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h
index eae44ed04..215a35ee9 100644
--- a/src/shader_recompiler/ir/ir_emitter.h
+++ b/src/shader_recompiler/ir/ir_emitter.h
@@ -100,33 +100,35 @@ public:
     void WriteShared(int bit_size, const Value& value, const U32& offset);
 
     [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data);
+    [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data);
     [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed);
     [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed);
+    [[nodiscard]] U32 SharedAtomicInc(const U32& address);
+    [[nodiscard]] U32 SharedAtomicDec(const U32& address);
     [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data);
     [[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data);
     [[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data);
 
-    [[nodiscard]] U32 SharedAtomicIIncrement(const U32& address);
-    [[nodiscard]] U32 SharedAtomicIDecrement(const U32& address);
-    [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data);
-
     [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset);
     [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index);
 
-    [[nodiscard]] U32 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info);
-    [[nodiscard]] U32 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info);
+    [[nodiscard]] U8 LoadBufferU8(const Value& handle, const Value& address, BufferInstInfo info);
+    [[nodiscard]] U16 LoadBufferU16(const Value& handle, const Value& address, BufferInstInfo info);
     [[nodiscard]] Value LoadBufferU32(int num_dwords, const Value& handle, const Value& address,
                                       BufferInstInfo info);
+    [[nodiscard]] U64 LoadBufferU64(const Value& handle, const Value& address, BufferInstInfo info);
     [[nodiscard]] Value LoadBufferF32(int num_dwords, const Value& handle, const Value& address,
                                       BufferInstInfo info);
     [[nodiscard]] Value LoadBufferFormat(const Value& handle, const Value& address,
                                          BufferInstInfo info);
-    void StoreBufferU8(const Value& handle, const Value& address, const U32& data,
+    void StoreBufferU8(const Value& handle, const Value& address, const U8& data,
                        BufferInstInfo info);
-    void StoreBufferU16(const Value& handle, const Value& address, const U32& data,
+    void StoreBufferU16(const Value& handle, const Value& address, const U16& data,
                         BufferInstInfo info);
     void StoreBufferU32(int num_dwords, const Value& handle, const Value& address,
                         const Value& data, BufferInstInfo info);
+    void StoreBufferU64(const Value& handle, const Value& address, const U64& data,
+                        BufferInstInfo info);
     void StoreBufferF32(int num_dwords, const Value& handle, const Value& address,
                         const Value& data, BufferInstInfo info);
     void StoreBufferFormat(const Value& handle, const Value& address, const Value& 
data, @@ -134,14 +136,16 @@ public: [[nodiscard]] Value BufferAtomicIAdd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); + [[nodiscard]] Value BufferAtomicISub(const Value& handle, const Value& address, + const Value& value, BufferInstInfo info); [[nodiscard]] Value BufferAtomicIMin(const Value& handle, const Value& address, const Value& value, bool is_signed, BufferInstInfo info); [[nodiscard]] Value BufferAtomicIMax(const Value& handle, const Value& address, const Value& value, bool is_signed, BufferInstInfo info); [[nodiscard]] Value BufferAtomicInc(const Value& handle, const Value& address, - const Value& value, BufferInstInfo info); + BufferInstInfo info); [[nodiscard]] Value BufferAtomicDec(const Value& handle, const Value& address, - const Value& value, BufferInstInfo info); + BufferInstInfo info); [[nodiscard]] Value BufferAtomicAnd(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); [[nodiscard]] Value BufferAtomicOr(const Value& handle, const Value& address, @@ -309,7 +313,7 @@ public: [[nodiscard]] F32F64 ConvertIToF(size_t dest_bitsize, size_t src_bitsize, bool is_signed, const Value& value); - [[nodiscard]] U16U32U64 UConvert(size_t result_bitsize, const U16U32U64& value); + [[nodiscard]] U8U16U32U64 UConvert(size_t result_bitsize, const U8U16U32U64& value); [[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value); [[nodiscard]] Value ImageAtomicIAdd(const Value& handle, const Value& coords, diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index a57310fb9..c2311afea 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -60,12 +60,15 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferU32x2: case Opcode::StoreBufferU32x3: case Opcode::StoreBufferU32x4: + case Opcode::StoreBufferU64: case Opcode::StoreBufferF32: case Opcode::StoreBufferF32x2: case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: case Opcode::StoreBufferFormatF32: case Opcode::BufferAtomicIAdd32: + case Opcode::BufferAtomicIAdd64: + case Opcode::BufferAtomicISub32: case Opcode::BufferAtomicSMin32: case Opcode::BufferAtomicUMin32: case Opcode::BufferAtomicSMax32: @@ -76,15 +79,21 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::BufferAtomicOr32: case Opcode::BufferAtomicXor32: case Opcode::BufferAtomicSwap32: + case Opcode::BufferAtomicCmpSwap32: case Opcode::DataAppend: case Opcode::DataConsume: - case Opcode::WriteSharedU64: + case Opcode::WriteSharedU16: case Opcode::WriteSharedU32: + case Opcode::WriteSharedU64: case Opcode::SharedAtomicIAdd32: + case Opcode::SharedAtomicIAdd64: + case Opcode::SharedAtomicISub32: case Opcode::SharedAtomicSMin32: case Opcode::SharedAtomicUMin32: case Opcode::SharedAtomicSMax32: case Opcode::SharedAtomicUMax32: + case Opcode::SharedAtomicInc32: + case Opcode::SharedAtomicDec32: case Opcode::SharedAtomicAnd32: case Opcode::SharedAtomicOr32: case Opcode::SharedAtomicXor32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index e96e32297..1621d2acf 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -35,21 +35,21 @@ OPCODE(LoadSharedU32, U32, U32, OPCODE(LoadSharedU64, U64, U32, ) OPCODE(WriteSharedU16, Void, U32, U16, ) OPCODE(WriteSharedU32, Void, U32, U32, ) -OPCODE(WriteSharedU64, Void, U32, U64, ) +OPCODE(WriteSharedU64, Void, U32, U64, ) 
// Shared atomic operations OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) +OPCODE(SharedAtomicISub32, U32, U32, U32, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) OPCODE(SharedAtomicUMax32, U32, U32, U32, ) +OPCODE(SharedAtomicInc32, U32, U32, ) +OPCODE(SharedAtomicDec32, U32, U32, ) OPCODE(SharedAtomicAnd32, U32, U32, U32, ) OPCODE(SharedAtomicOr32, U32, U32, U32, ) OPCODE(SharedAtomicXor32, U32, U32, U32, ) -OPCODE(SharedAtomicISub32, U32, U32, U32, ) -OPCODE(SharedAtomicIIncrement32, U32, U32, ) -OPCODE(SharedAtomicIDecrement32, U32, U32, ) // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) @@ -94,23 +94,25 @@ OPCODE(UndefU32, U32, OPCODE(UndefU64, U64, ) // Buffer operations -OPCODE(LoadBufferU8, U32, Opaque, Opaque, ) -OPCODE(LoadBufferU16, U32, Opaque, Opaque, ) +OPCODE(LoadBufferU8, U8, Opaque, Opaque, ) +OPCODE(LoadBufferU16, U16, Opaque, Opaque, ) OPCODE(LoadBufferU32, U32, Opaque, Opaque, ) OPCODE(LoadBufferU32x2, U32x2, Opaque, Opaque, ) OPCODE(LoadBufferU32x3, U32x3, Opaque, Opaque, ) OPCODE(LoadBufferU32x4, U32x4, Opaque, Opaque, ) +OPCODE(LoadBufferU64, U64, Opaque, Opaque, ) OPCODE(LoadBufferF32, F32, Opaque, Opaque, ) OPCODE(LoadBufferF32x2, F32x2, Opaque, Opaque, ) OPCODE(LoadBufferF32x3, F32x3, Opaque, Opaque, ) OPCODE(LoadBufferF32x4, F32x4, Opaque, Opaque, ) OPCODE(LoadBufferFormatF32, F32x4, Opaque, Opaque, ) -OPCODE(StoreBufferU8, Void, Opaque, Opaque, U32, ) -OPCODE(StoreBufferU16, Void, Opaque, Opaque, U32, ) +OPCODE(StoreBufferU8, Void, Opaque, Opaque, U8, ) +OPCODE(StoreBufferU16, Void, Opaque, Opaque, U16, ) OPCODE(StoreBufferU32, Void, Opaque, Opaque, U32, ) OPCODE(StoreBufferU32x2, Void, Opaque, Opaque, U32x2, ) OPCODE(StoreBufferU32x3, Void, Opaque, Opaque, U32x3, ) OPCODE(StoreBufferU32x4, Void, Opaque, Opaque, U32x4, ) +OPCODE(StoreBufferU64, Void, Opaque, Opaque, U64, ) OPCODE(StoreBufferF32, Void, Opaque, Opaque, F32, ) OPCODE(StoreBufferF32x2, Void, Opaque, Opaque, F32x2, ) OPCODE(StoreBufferF32x3, Void, Opaque, Opaque, F32x3, ) @@ -120,12 +122,13 @@ OPCODE(StoreBufferFormatF32, Void, Opaq // Buffer atomic operations OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 ) +OPCODE(BufferAtomicISub32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMax32, U32, Opaque, Opaque, U32 ) -OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, U32, ) -OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, U32, ) +OPCODE(BufferAtomicInc32, U32, Opaque, Opaque, ) +OPCODE(BufferAtomicDec32, U32, Opaque, Opaque, ) OPCODE(BufferAtomicAnd32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicOr32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicXor32, U32, Opaque, Opaque, U32, ) @@ -405,6 +408,8 @@ OPCODE(ConvertF64U32, F64, U32, OPCODE(ConvertF32U16, F32, U16, ) OPCODE(ConvertU16U32, U16, U32, ) OPCODE(ConvertU32U16, U32, U16, ) +OPCODE(ConvertU8U32, U8, U32, ) +OPCODE(ConvertU32U8, U32, U8, ) // Image operations OPCODE(ImageSampleRaw, F32x4, Opaque, F32x4, F32x4, F32x4, F32, ) diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp index 5cf8a1525..156cb6628 100644 --- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp +++ 
b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp @@ -438,7 +438,9 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; const IR::U32 addr{inst.Arg(0)}; - const IR::U32 data{inst.Arg(1).Resolve()}; + const IR::Value data = num_dwords == 2 + ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) + : inst.Arg(1).Resolve(); const auto SetOutput = [&](IR::U32 addr, IR::U32 value, AttributeRegion output_kind, u32 off_dw) { @@ -466,10 +468,10 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info); if (num_dwords == 1) { - SetOutput(addr, data, region, 0); + SetOutput(addr, IR::U32{data}, region, 0); } else { for (auto i = 0; i < num_dwords; i++) { - SetOutput(addr, IR::U32{data.Inst()->Arg(i)}, region, i); + SetOutput(addr, IR::U32{ir.CompositeExtract(data, i)}, region, i); } } inst.Invalidate(); @@ -499,7 +501,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { ReadTessControlPointAttribute(addr, stride, ir, i, is_tcs_output_read); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; @@ -578,7 +580,7 @@ void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { const IR::F32 component = GetInput(addr, i); read_components.push_back(ir.BitCast(component)); } - attr_read = ir.CompositeConstruct(read_components); + attr_read = ir.PackUint2x32(ir.CompositeConstruct(read_components)); } inst.ReplaceUsesWithAndRemove(attr_read); break; diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 06e4ac850..57d36f6df 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -28,6 +28,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_info, const Profile& profile); +void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile); void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info, const Profile& profile); diff --git a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp index fcb86e3fb..bb36e2748 100644 --- a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp +++ b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp @@ -34,13 +34,13 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con interpreted = ir.Imm32(0.f); break; case AmdGpu::DataFormat::Format8: { - const auto unpacked = - ir.Unpack4x8(format_info.num_format, ir.LoadBufferU8(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU8(handle, address, info)); + const auto unpacked = ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } case AmdGpu::DataFormat::Format8_8: { - const auto raw = ir.LoadBufferU16(handle, address, info); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); const auto unpacked = 
ir.Unpack4x8(format_info.num_format, raw); interpreted = ir.CompositeConstruct(ir.CompositeExtract(unpacked, 0), ir.CompositeExtract(unpacked, 1)); @@ -51,8 +51,8 @@ static IR::Value LoadBufferFormat(IR::IREmitter& ir, const IR::Value handle, con IR::U32{ir.LoadBufferU32(1, handle, address, info)}); break; case AmdGpu::DataFormat::Format16: { - const auto unpacked = - ir.Unpack2x16(format_info.num_format, ir.LoadBufferU16(handle, address, info)); + const auto raw = ir.UConvert(32, ir.LoadBufferU16(handle, address, info)); + const auto unpacked = ir.Unpack2x16(format_info.num_format, raw); interpreted = ir.CompositeExtract(unpacked, 0); break; } @@ -126,7 +126,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I const auto packed = ir.Pack4x8(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU8(handle, address, packed, info); + ir.StoreBufferU8(handle, address, ir.UConvert(8, packed), info); break; } case AmdGpu::DataFormat::Format8_8: { @@ -134,7 +134,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I ir.CompositeConstruct(ir.CompositeExtract(real_value, 0), ir.CompositeExtract(real_value, 1), ir.Imm32(0.f), ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format8_8_8_8: { @@ -145,7 +145,7 @@ static void StoreBufferFormat(IR::IREmitter& ir, const IR::Value handle, const I case AmdGpu::DataFormat::Format16: { const auto packed = ir.Pack2x16(format_info.num_format, ir.CompositeConstruct(real_value, ir.Imm32(0.f))); - ir.StoreBufferU16(handle, address, packed, info); + ir.StoreBufferU16(handle, address, ir.UConvert(16, packed), info); break; } case AmdGpu::DataFormat::Format16_16: { diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 18c77e600..ba96d1034 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -17,6 +17,8 @@ using SharpLocation = u32; bool IsBufferAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::BufferAtomicIAdd32: + case IR::Opcode::BufferAtomicIAdd64: + case IR::Opcode::BufferAtomicISub32: case IR::Opcode::BufferAtomicSMin32: case IR::Opcode::BufferAtomicUMin32: case IR::Opcode::BufferAtomicSMax32: @@ -27,6 +29,7 @@ bool IsBufferAtomic(const IR::Inst& inst) { case IR::Opcode::BufferAtomicOr32: case IR::Opcode::BufferAtomicXor32: case IR::Opcode::BufferAtomicSwap32: + case IR::Opcode::BufferAtomicCmpSwap32: return true; default: return false; @@ -41,6 +44,7 @@ bool IsBufferStore(const IR::Inst& inst) { case IR::Opcode::StoreBufferU32x2: case IR::Opcode::StoreBufferU32x3: case IR::Opcode::StoreBufferU32x4: + case IR::Opcode::StoreBufferU64: case IR::Opcode::StoreBufferF32: case IR::Opcode::StoreBufferF32x2: case IR::Opcode::StoreBufferF32x3: @@ -60,6 +64,7 @@ bool IsBufferInstruction(const IR::Inst& inst) { case IR::Opcode::LoadBufferU32x2: case IR::Opcode::LoadBufferU32x3: case IR::Opcode::LoadBufferU32x4: + case IR::Opcode::LoadBufferU64: case IR::Opcode::LoadBufferF32: case IR::Opcode::LoadBufferF32x2: case IR::Opcode::LoadBufferF32x3: @@ -85,6 +90,10 @@ IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { case IR::Opcode::LoadBufferU16: case IR::Opcode::StoreBufferU16: return IR::Type::U16; + 
case IR::Opcode::LoadBufferU64:
+    case IR::Opcode::StoreBufferU64:
+    case IR::Opcode::BufferAtomicIAdd64:
+        return IR::Type::U64;
     case IR::Opcode::LoadBufferFormatF32:
     case IR::Opcode::StoreBufferFormatF32:
         // Formatted buffer loads can use a variety of types.
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index ba8d1cca6..4cd16d18f 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -35,12 +35,28 @@ void Visit(Info& info, const IR::Inst& inst) {
         break;
     }
     case IR::Opcode::LoadSharedU16:
-    case IR::Opcode::LoadSharedU32:
-    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU16:
+        info.shared_types |= IR::Type::U16;
+        break;
+    case IR::Opcode::LoadSharedU32:
     case IR::Opcode::WriteSharedU32:
+    case IR::Opcode::SharedAtomicIAdd32:
+    case IR::Opcode::SharedAtomicISub32:
+    case IR::Opcode::SharedAtomicSMin32:
+    case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicInc32:
+    case IR::Opcode::SharedAtomicDec32:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicOr32:
+    case IR::Opcode::SharedAtomicXor32:
+        info.shared_types |= IR::Type::U32;
+        break;
+    case IR::Opcode::LoadSharedU64:
     case IR::Opcode::WriteSharedU64:
-        info.uses_shared = true;
+    case IR::Opcode::SharedAtomicIAdd64:
+        info.shared_types |= IR::Type::U64;
         break;
     case IR::Opcode::ConvertF16F32:
     case IR::Opcode::ConvertF32F16:
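Note: the collection pass now records the width of every shared memory access instead of a single flag; each opcode maps to the IR type it touches. The same classification as a standalone helper (assumed equivalent, for illustration only):

    IR::Type SharedWidthOf(IR::Opcode op) {
        switch (op) {
        case IR::Opcode::LoadSharedU16:
        case IR::Opcode::WriteSharedU16:
            return IR::Type::U16;
        case IR::Opcode::LoadSharedU64:
        case IR::Opcode::WriteSharedU64:
        case IR::Opcode::SharedAtomicIAdd64:
            return IR::Type::U64;
        default:
            return IR::Type::U32; // every remaining shared opcode is 32-bit
        }
    }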
diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
index baf6ad0d1..11713d099 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
@@ -1,6 +1,7 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
+#include <unordered_set>
 #include "shader_recompiler/ir/breadth_first_search.h"
 #include "shader_recompiler/ir/ir_emitter.h"
 #include "shader_recompiler/ir/program.h"
@@ -9,12 +10,14 @@
 namespace Shader::Optimization {
 
 static bool IsLoadShared(const IR::Inst& inst) {
-    return inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
+    return inst.GetOpcode() == IR::Opcode::LoadSharedU16 ||
+           inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
           inst.GetOpcode() == IR::Opcode::LoadSharedU64;
 }
 
 static bool IsWriteShared(const IR::Inst& inst) {
-    return inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
+    return inst.GetOpcode() == IR::Opcode::WriteSharedU16 ||
+           inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
           inst.GetOpcode() == IR::Opcode::WriteSharedU64;
 }
 
@@ -49,11 +52,14 @@ static void EmitBarrierInBlock(IR::Block* block) {
     }
 }
 
+using NodeSet = std::unordered_set<IR::Block*>;
+
 // Inserts a barrier after divergent conditional blocks to avoid undefined
 // behavior when some threads write and others read from shared memory.
-static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
+static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data,
+                                    NodeSet& divergence_end, u32& divergence_depth) {
     const IR::U1 cond = data.if_node.cond;
-    const auto insert_barrier =
+    const auto is_divergent_cond =
         IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
             if (inst->GetOpcode() == IR::Opcode::GetAttributeU32 &&
                 inst->Arg(0).Attribute() == IR::Attribute::LocalInvocationId) {
@@ -61,11 +67,15 @@ static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
             }
             return std::nullopt;
         });
-    if (insert_barrier) {
-        IR::Block* const merge = data.if_node.merge;
-        auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
-        IR::IREmitter ir{*merge, insert_point};
-        ir.Barrier();
+    if (is_divergent_cond) {
+        if (divergence_depth == 0) {
+            IR::Block* const merge = data.if_node.merge;
+            auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
+            IR::IREmitter ir{*merge, insert_point};
+            ir.Barrier();
+        }
+        ++divergence_depth;
+        divergence_end.emplace(data.if_node.merge);
     }
 }
 
@@ -87,19 +97,22 @@ void SharedMemoryBarrierPass(IR::Program& program, const RuntimeInfo& runtime_in
         return;
     }
     using Type = IR::AbstractSyntaxNode::Type;
-    u32 branch_depth{};
+    u32 divergence_depth{};
+    NodeSet divergence_end;
     for (const IR::AbstractSyntaxNode& node : program.syntax_list) {
         if (node.type == Type::EndIf) {
-            --branch_depth;
+            if (divergence_end.contains(node.data.end_if.merge)) {
+                --divergence_depth;
+            }
             continue;
         }
         // Check if branch depth is zero, we don't want to insert barrier in potentially divergent
         // code.
-        if (node.type == Type::If && branch_depth++ == 0) {
-            EmitBarrierInMergeBlock(node.data);
+        if (node.type == Type::If) {
+            EmitBarrierInMergeBlock(node.data, divergence_end, divergence_depth);
             continue;
         }
-        if (node.type == Type::Block && branch_depth == 0) {
+        if (node.type == Type::Block && divergence_depth == 0) {
             EmitBarrierInBlock(node.data.block);
         }
     }
 }
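Note: the barrier pass previously counted every conditional; it now tracks only conditionals that are divergent on the local invocation ID, so a barrier lands at the outermost divergent merge point and nested divergent blocks are skipped. Shape of the control flow it targets (pseudo-shader, illustrative):

    if (local_invocation_id.x < 16) {      // divergent: depth 0 -> 1
        if (local_invocation_id.x < 8) {   // divergent: depth 1 -> 2, no barrier
            lds[local_invocation_id.x] = value;
        }                                  // merge recorded in divergence_end: depth -> 1
    }
    // outermost merge: depth -> 0, barrier is inserted here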
diff --git a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
new file mode 100644
index 000000000..0f80a3b28
--- /dev/null
+++ b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp
@@ -0,0 +1,127 @@
+// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/ir/ir_emitter.h"
+#include "shader_recompiler/ir/program.h"
+#include "shader_recompiler/profile.h"
+
+namespace Shader::Optimization {
+
+static bool Requires16BitSharedAtomic(const IR::Inst& inst) {
+    // Nothing yet
+    return false;
+}
+
+static bool Requires64BitSharedAtomic(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::SharedAtomicIAdd64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static bool IsNon32BitSharedLoadStore(const IR::Inst& inst) {
+    switch (inst.GetOpcode()) {
+    case IR::Opcode::LoadSharedU16:
+    case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU16:
+    case IR::Opcode::WriteSharedU64:
+        return true;
+    default:
+        return false;
+    }
+}
+
+IR::Type CalculateSpecialSharedAtomicTypes(IR::Program& program) {
+    IR::Type extra_atomic_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (Requires16BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U16;
+            }
+            if (Requires64BitSharedAtomic(inst)) {
+                extra_atomic_types |= IR::Type::U64;
+            }
+        }
+    }
+    return extra_atomic_types;
+}
+
+// Simplifies U16 and U64 shared memory operations down to U32 when aliasing is not supported
+// and atomics of the same type are not used.
+void SharedMemorySimplifyPass(IR::Program& program, const Profile& profile) {
+    if (program.info.stage != Stage::Compute || profile.supports_workgroup_explicit_memory_layout) {
+        return;
+    }
+
+    const auto atomic_types = CalculateSpecialSharedAtomicTypes(program);
+    if (True(atomic_types & IR::Type::U16) && True(atomic_types & IR::Type::U64)) {
+        // If both 16-bit and 64-bit atomics are used, there is nothing to simplify.
+        return;
+    }
+
+    // Iterate through shared load/store U16/U64 instructions, replacing them with
+    // equivalent U32 ops when the types are not needed for atomics.
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsNon32BitSharedLoadStore(inst)) {
+                continue;
+            }
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            const IR::U32 offset{inst.Arg(0)};
+            if (False(atomic_types & IR::Type::U16)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU16: {
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{ir.LoadShared(32, false, dword_offset)};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 value{ir.BitFieldExtract(dword_value, bit_offset, ir.Imm32(16U))};
+                    inst.ReplaceUsesWithAndRemove(ir.UConvert(16, value));
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU16: {
+                    const IR::U32 value{ir.UConvert(32, IR::U16{inst.Arg(1)})};
+                    const IR::U32 bit_offset{
+                        ir.IMul(ir.BitwiseAnd(offset, ir.Imm32(2U)), ir.Imm32(8U))};
+                    const IR::U32 dword_offset{ir.BitwiseAnd(offset, ir.Imm32(~3U))};
+                    const IR::U32 dword_value{ir.LoadShared(32, false, dword_offset)};
+                    const IR::U32 new_dword_value{
+                        ir.BitFieldInsert(dword_value, value, bit_offset, ir.Imm32(16U))};
+                    ir.WriteShared(32, new_dword_value, dword_offset);
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+            if (False(atomic_types & IR::Type::U64)) {
+                switch (inst.GetOpcode()) {
+                case IR::Opcode::LoadSharedU64: {
+                    const IR::U32 value0{ir.LoadShared(32, false, offset)};
+                    const IR::U32 value1{ir.LoadShared(32, false, ir.IAdd(offset, ir.Imm32(4U)))};
+                    const IR::Value value{ir.PackUint2x32(ir.CompositeConstruct(value0, value1))};
+                    inst.ReplaceUsesWithAndRemove(value);
+                    continue;
+                }
+                case IR::Opcode::WriteSharedU64: {
+                    const IR::Value value{ir.UnpackUint2x32(IR::U64{inst.Arg(1)})};
+                    const IR::U32 value0{ir.CompositeExtract(value, 0)};
+                    const IR::U32 value1{ir.CompositeExtract(value, 1)};
+                    ir.WriteShared(32, value0, offset);
+                    ir.WriteShared(32, value1, ir.IAdd(offset, ir.Imm32(4U)));
+                    inst.Invalidate();
+                    continue;
+                }
+                default:
+                    break;
+                }
+            }
+        }
+    }
+}
+
+} // namespace Shader::Optimization
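Note: with aliasing unavailable, a 16-bit LDS access is emulated on the U32 backing array with shift/mask arithmetic; the store path is a read-modify-write, which is only safe because the pass bails out when real 16-bit shared atomics exist. Scalar model of the lowering (illustrative):

    u32 load_lds_u16(const u32* lds, u32 byte_offset) {
        const u32 dword = lds[(byte_offset & ~3u) >> 2]; // dword-aligned slot
        const u32 shift = (byte_offset & 2u) * 8u;       // 0 or 16
        return (dword >> shift) & 0xFFFFu;               // matches the BitFieldExtract above
    }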
diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
index 409c05940..a6900e180 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp
@@ -10,18 +10,23 @@ namespace Shader::Optimization {
 static bool IsSharedAccess(const IR::Inst& inst) {
     const auto opcode = inst.GetOpcode();
     switch (opcode) {
+    case IR::Opcode::LoadSharedU16:
     case IR::Opcode::LoadSharedU32:
     case IR::Opcode::LoadSharedU64:
+    case IR::Opcode::WriteSharedU16:
     case IR::Opcode::WriteSharedU32:
     case IR::Opcode::WriteSharedU64:
-    case IR::Opcode::SharedAtomicAnd32:
     case IR::Opcode::SharedAtomicIAdd32:
     case IR::Opcode::SharedAtomicIAdd64:
-    case IR::Opcode::SharedAtomicOr32:
-    case IR::Opcode::SharedAtomicSMax32:
-    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicISub32:
     case IR::Opcode::SharedAtomicSMin32:
     case IR::Opcode::SharedAtomicUMin32:
+    case IR::Opcode::SharedAtomicSMax32:
+    case IR::Opcode::SharedAtomicUMax32:
+    case IR::Opcode::SharedAtomicInc32:
+    case IR::Opcode::SharedAtomicDec32:
+    case IR::Opcode::SharedAtomicAnd32:
+    case IR::Opcode::SharedAtomicOr32:
     case IR::Opcode::SharedAtomicXor32:
         return true;
     default:
@@ -29,26 +34,74 @@ static bool IsSharedAccess(const IR::Inst& inst) {
     }
 }
 
+IR::Type CalculateSharedMemoryTypes(IR::Program& program) {
+    IR::Type used_types{IR::Type::Void};
+    for (IR::Block* const block : program.blocks) {
+        for (IR::Inst& inst : block->Instructions()) {
+            if (!IsSharedAccess(inst)) {
+                continue;
+            }
+            switch (inst.GetOpcode()) {
+            case IR::Opcode::LoadSharedU16:
+            case IR::Opcode::WriteSharedU16:
+                used_types |= IR::Type::U16;
+                break;
+            case IR::Opcode::LoadSharedU32:
+            case IR::Opcode::WriteSharedU32:
+            case IR::Opcode::SharedAtomicIAdd32:
+            case IR::Opcode::SharedAtomicISub32:
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32:
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32:
+            case IR::Opcode::SharedAtomicInc32:
+            case IR::Opcode::SharedAtomicDec32:
+            case IR::Opcode::SharedAtomicAnd32:
+            case IR::Opcode::SharedAtomicOr32:
+            case IR::Opcode::SharedAtomicXor32:
+                used_types |= IR::Type::U32;
+                break;
+            case IR::Opcode::LoadSharedU64:
+            case IR::Opcode::WriteSharedU64:
+            case IR::Opcode::SharedAtomicIAdd64:
+                used_types |= IR::Type::U64;
+                break;
+            default:
+                break;
+            }
+        }
+    }
+    return used_types;
+}
+
 void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_info,
                                const Profile& profile) {
     if (program.info.stage != Stage::Compute) {
         return;
     }
-    // Only perform the transform if the host shared memory is insufficient
-    // or the device does not support VK_KHR_workgroup_memory_explicit_layout
+
+    // Run this pass if:
+    // * There are shared memory instructions.
+    // * One of the following is true:
+    //   * The requested shared memory size is too large for the host shared memory.
+    //   * Workgroup explicit memory is not supported and multiple shared memory types are used.
     const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size;
-    if (shared_memory_size <= profile.max_shared_memory_size &&
-        profile.supports_workgroup_explicit_memory_layout) {
+    const auto used_types = CalculateSharedMemoryTypes(program);
+    if (used_types == IR::Type::Void || (shared_memory_size <= profile.max_shared_memory_size &&
+                                         (profile.supports_workgroup_explicit_memory_layout ||
+                                          std::popcount(static_cast<u32>(used_types)) == 1))) {
         return;
     }
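Note: the rewritten early-out keeps LDS in real workgroup memory whenever it both fits and is expressible, i.e. either VK_KHR_workgroup_memory_explicit_layout is supported or only one access width remains after simplification. Restated as a standalone predicate (illustrative):

    const bool single_width = std::popcount(static_cast<u32>(used_types)) == 1;
    const bool keep_workgroup_memory =
        shared_memory_size <= profile.max_shared_memory_size &&
        (profile.supports_workgroup_explicit_memory_layout || single_width);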
-    // Add buffer binding for shared memory storage buffer.
+
+    // Add a buffer binding for the shared memory storage buffer.
     const u32 binding = static_cast<u32>(program.info.buffers.size());
     program.info.buffers.push_back({
-        .used_types = IR::Type::U32,
+        .used_types = used_types,
         .inline_cbuf = AmdGpu::Buffer::Null(),
         .buffer_type = BufferType::SharedMemory,
         .is_written = true,
     });
+
     for (IR::Block* const block : program.blocks) {
         for (IR::Inst& inst : block->Instructions()) {
             if (!IsSharedAccess(inst)) {
@@ -56,47 +109,48 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
             }
             IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
             const IR::U32 handle = ir.Imm32(binding);
-            // Replace shared atomics first
-            switch (inst.GetOpcode()) {
-            case IR::Opcode::SharedAtomicAnd32:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {}));
-                continue;
-            case IR::Opcode::SharedAtomicIAdd32:
-            case IR::Opcode::SharedAtomicIAdd64:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {}));
-                continue;
-            case IR::Opcode::SharedAtomicOr32:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicOr(handle, inst.Arg(0), inst.Arg(1), {}));
-                continue;
-            case IR::Opcode::SharedAtomicSMax32:
-            case IR::Opcode::SharedAtomicUMax32: {
-                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIMax(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
-                continue;
-            }
-            case IR::Opcode::SharedAtomicSMin32:
-            case IR::Opcode::SharedAtomicUMin32: {
-                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicIMin(handle, inst.Arg(0), inst.Arg(1), is_signed, {}));
-                continue;
-            }
-            case IR::Opcode::SharedAtomicXor32:
-                inst.ReplaceUsesWithAndRemove(
-                    ir.BufferAtomicXor(handle, inst.Arg(0), inst.Arg(1), {}));
-                continue;
-            default:
-                break;
-            }
-            // Replace shared operations.
             const IR::U32 offset = ir.IMul(ir.GetAttributeU32(IR::Attribute::WorkgroupIndex),
                                            ir.Imm32(shared_memory_size));
             const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset);
             switch (inst.GetOpcode()) {
+            case IR::Opcode::SharedAtomicIAdd32:
+            case IR::Opcode::SharedAtomicIAdd64:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicISub32:
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicISub(handle, address, inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicSMin32:
+            case IR::Opcode::SharedAtomicUMin32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicSMax32:
+            case IR::Opcode::SharedAtomicUMax32: {
+                const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32;
+                inst.ReplaceUsesWithAndRemove(
+                    ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {}));
+                continue;
+            }
+            case IR::Opcode::SharedAtomicInc32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicInc(handle, address, {}));
+                continue;
+            case IR::Opcode::SharedAtomicDec32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {}));
+                continue;
+            case IR::Opcode::SharedAtomicAnd32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicOr32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {}));
+                continue;
+            case IR::Opcode::SharedAtomicXor32:
+                inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {}));
+                continue;
             case IR::Opcode::LoadSharedU16:
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {}));
                 break;
@@ -104,10 +158,10 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                 inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {}));
                 break;
             case IR::Opcode::LoadSharedU64:
-                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {}));
+                inst.ReplaceUsesWithAndRemove(ir.LoadBufferU64(handle, address, {}));
                 break;
             case IR::Opcode::WriteSharedU16:
-                ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {});
+                ir.StoreBufferU16(handle, address, IR::U16{inst.Arg(1)}, {});
                 inst.Invalidate();
                 break;
             case IR::Opcode::WriteSharedU32:
@@ -115,7 +169,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_
                 inst.Invalidate();
                 break;
             case IR::Opcode::WriteSharedU64:
-                ir.StoreBufferU32(2, handle, address, inst.Arg(1), {});
+                ir.StoreBufferU64(handle, address, IR::U64{inst.Arg(1)}, {});
                 inst.Invalidate();
                 break;
             default:
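Note: once LDS is lowered to a storage buffer, every workgroup must address a private slice of it, which is what the `WorkgroupIndex * shared_memory_size` bias above computes; the rasterizer sizes the backing buffer as `SharedMemSize() * NumWorkgroups()` to match. Byte-level sketch (illustrative):

    u32 LdsBufferOffset(u32 workgroup_index, u32 shared_memory_size, u32 lds_offset) {
        return workgroup_index * shared_memory_size + lds_offset;
    }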
diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h
index ed1e5536a..b92c5d555 100644
--- a/src/shader_recompiler/ir/value.h
+++ b/src/shader_recompiler/ir/value.h
@@ -265,6 +265,7 @@ using U32F32 = TypedValue<Type::U32 | Type::F32>;
 using U64F64 = TypedValue<Type::U64 | Type::F64>;
 using U32U64 = TypedValue<Type::U32 | Type::U64>;
 using U16U32U64 = TypedValue<Type::U16 | Type::U32 | Type::U64>;
+using U8U16U32U64 = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>;
 using F32F64 = TypedValue<Type::F32 | Type::F64>;
 using F16F32F64 = TypedValue<Type::F16 | Type::F32 | Type::F64>;
 using UAny = TypedValue<Type::U8 | Type::U16 | Type::U32 | Type::U64>;
diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp
index 9f92857d6..e17fb1c9e 100644
--- a/src/shader_recompiler/recompiler.cpp
+++ b/src/shader_recompiler/recompiler.cpp
@@ -78,6 +78,7 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
     Shader::Optimization::FlattenExtendedUserdataPass(program);
     Shader::Optimization::ResourceTrackingPass(program);
     Shader::Optimization::LowerBufferFormatToRaw(program);
+    Shader::Optimization::SharedMemorySimplifyPass(program, profile);
     Shader::Optimization::SharedMemoryToStoragePass(program, runtime_info, profile);
     Shader::Optimization::SharedMemoryBarrierPass(program, runtime_info, profile);
     Shader::Optimization::IdentityRemovalPass(program.blocks);
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index e470f8e77..ffa744b31 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -23,6 +23,7 @@ static constexpr size_t DataShareBufferSize = 64_KB;
 static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 128_MB;
 static constexpr size_t DownloadBufferSize = 128_MB;
+static constexpr size_t DeviceBufferSize = 16_MB;
 static constexpr size_t MaxPageFaults = 1024;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
@@ -32,7 +33,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
     memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_},
     staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
     stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
-    download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize),
+    download_buffer{instance, scheduler, MemoryUsage::Download, DownloadBufferSize},
+    device_buffer{instance, scheduler, MemoryUsage::DeviceLocal, DeviceBufferSize},
     gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
     bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags,
                          BDA_PAGETABLE_SIZE},
@@ -348,7 +350,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     return {&buffer, buffer.Offset(device_addr)};
 }
 
-std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
+std::pair<Buffer*, u32> BufferCache::ObtainBufferForImage(VAddr gpu_addr, u32 size) {
     // Check if any buffer contains the full requested range.
     const u64 page = gpu_addr >> CACHING_PAGEBITS;
     const BufferId buffer_id = page_table[page].buffer_id;
@@ -361,10 +363,10 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size,
     }
     // If no buffer contains the full requested range but some buffer within was GPU-modified,
     // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
-    // This is only done if the request prefers to use GPU memory, otherwise we can skip it.
-    if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+    if (memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
         return ObtainBuffer(gpu_addr, size, false, false);
     }
+    // In all other cases, just do a CPU copy to the staging buffer.
     const auto [data, offset] = staging_buffer.Map(size, 16);
     memory->CopySparseMemory(gpu_addr, data, size);
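Note: `ObtainBufferForImage` (formerly `ObtainViewBuffer`) drops the `prefer_gpu` hint and always honors GPU-modified memory: a full cache hit returns the existing buffer, a GPU-modified range is materialized through `ObtainBuffer`, and everything else becomes a CPU copy into the staging buffer. Illustrative call site (hypothetical names):

    const auto [buffer, offset] = buffer_cache.ObtainBufferForImage(image_gpu_addr, image_size);
    // buffer is either a cached GPU buffer or the staging buffer holding a fresh CPU copy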
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index c2faf12c8..d7d753213 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -80,11 +80,6 @@ public:
         return &gds_buffer;
     }
 
-    /// Retrieves the host visible device local stream buffer.
-    [[nodiscard]] StreamBuffer& GetStreamBuffer() noexcept {
-        return stream_buffer;
-    }
-
     /// Retrieves the device local DBA page table buffer.
     [[nodiscard]] Buffer* GetBdaPageTableBuffer() noexcept {
         return &bda_pagetable_buffer;
@@ -100,6 +95,20 @@ public:
         return slot_buffers[id];
     }
 
+    /// Retrieves a utility buffer optimized for the specified memory usage.
+    StreamBuffer& GetUtilityBuffer(MemoryUsage usage) noexcept {
+        switch (usage) {
+        case MemoryUsage::Stream:
+            return stream_buffer;
+        case MemoryUsage::Download:
+            return download_buffer;
+        case MemoryUsage::Upload:
+            return staging_buffer;
+        case MemoryUsage::DeviceLocal:
+            return device_buffer;
+        }
+    }
+
     /// Invalidates any buffer in the logical page range.
     void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
 
@@ -121,8 +130,7 @@ public:
                                                   BufferId buffer_id = {});
 
     /// Attempts to obtain a buffer without modifying the cache contents.
-    [[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size,
-                                                           bool prefer_gpu);
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainBufferForImage(VAddr gpu_addr, u32 size);
 
     /// Return true when a region is registered on the cache
     [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
@@ -193,6 +201,7 @@ private:
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
     StreamBuffer download_buffer;
+    StreamBuffer device_buffer;
     Buffer gds_buffer;
     Buffer bda_pagetable_buffer;
     Buffer fault_buffer;
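Note: callers now select a utility buffer by intended memory usage instead of reaching for the stream buffer directly; the new 16 MB device-local buffer appears intended for GPU-internal staging such as the buffered image copy added later in this diff. Usage sketch (the Stream case mirrors the rasterizer change below):

    auto& upload_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
    auto& gpu_only_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::DeviceLocal);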
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index 0591e06ce..63c0a38d6 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -445,7 +445,25 @@ bool Instance::CreateDevice() {
             workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess,
     },
 #ifdef __APPLE__
-        portability_features,
+        vk::PhysicalDevicePortabilitySubsetFeaturesKHR{
+            .constantAlphaColorBlendFactors = portability_features.constantAlphaColorBlendFactors,
+            .events = portability_features.events,
+            .imageViewFormatReinterpretation = portability_features.imageViewFormatReinterpretation,
+            .imageViewFormatSwizzle = portability_features.imageViewFormatSwizzle,
+            .imageView2DOn3DImage = portability_features.imageView2DOn3DImage,
+            .multisampleArrayImage = portability_features.multisampleArrayImage,
+            .mutableComparisonSamplers = portability_features.mutableComparisonSamplers,
+            .pointPolygons = portability_features.pointPolygons,
+            .samplerMipLodBias = portability_features.samplerMipLodBias,
+            .separateStencilMaskRef = portability_features.separateStencilMaskRef,
+            .shaderSampleRateInterpolationFunctions =
+                portability_features.shaderSampleRateInterpolationFunctions,
+            .tessellationIsolines = portability_features.tessellationIsolines,
+            .tessellationPointMode = portability_features.tessellationPointMode,
+            .triangleFans = portability_features.triangleFans,
+            .vertexAttributeAccessBeyondStride =
+                portability_features.vertexAttributeAccessBeyondStride,
+        },
 #endif
     };
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index dff4e5a5f..9dea5ceea 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -549,7 +549,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
             const auto* gds_buf = buffer_cache.GetGdsBuffer();
             buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes());
         } else if (desc.buffer_type == Shader::BufferType::Flatbuf) {
-            auto& vk_buffer = buffer_cache.GetStreamBuffer();
+            auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
             const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
             const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size,
                                               instance.UniformMinAlignment());
@@ -561,7 +561,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
             const auto* fault_buffer = buffer_cache.GetFaultBuffer();
             buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes());
         } else if (desc.buffer_type == Shader::BufferType::SharedMemory) {
-            auto& lds_buffer = buffer_cache.GetStreamBuffer();
+            auto& lds_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
             const auto& cs_program = liverpool->GetCsRegs();
             const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups();
             const auto [data, offset] =
diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp
index d8070da61..ab9111e6b 100644
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@@ -312,42 +312,121 @@ void Image::Upload(vk::Buffer buffer, u64 offset) {
             vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {});
 }
 
-void Image::CopyImage(const Image& image) {
+void Image::CopyImage(const Image& src_image) {
     scheduler->EndRendering();
     Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {});
 
     auto cmdbuf = scheduler->CommandBuffer();
+    const auto& src_info = src_image.info;
     boost::container::small_vector<vk::ImageCopy, 14> image_copy{};
-    for (u32 m = 0; m < image.info.resources.levels; ++m) {
-        const auto mip_w = std::max(image.info.size.width >> m, 1u);
-        const auto mip_h = std::max(image.info.size.height >> m, 1u);
-        const auto mip_d = std::max(image.info.size.depth >> m, 1u);
+    const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels);
+    for (u32 m = 0; m < num_mips; ++m) {
+        const auto mip_w = std::max(src_info.size.width >> m, 1u);
+        const auto mip_h = std::max(src_info.size.height >> m, 1u);
+        const auto mip_d = std::max(src_info.size.depth >> m, 1u);
         image_copy.emplace_back(vk::ImageCopy{
             .srcSubresource{
-                .aspectMask = image.aspect_mask,
+                .aspectMask = src_image.aspect_mask,
                 .mipLevel = m,
                 .baseArrayLayer = 0,
-                .layerCount = image.info.resources.layers,
+                .layerCount = src_info.resources.layers,
             },
             .dstSubresource{
-                .aspectMask = image.aspect_mask,
+                .aspectMask = src_image.aspect_mask,
                 .mipLevel = m,
                 .baseArrayLayer = 0,
-                .layerCount = image.info.resources.layers,
+                .layerCount = src_info.resources.layers,
             },
             .extent = {mip_w, mip_h, mip_d},
         });
     }
-    cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout,
+    cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout,
                      image_copy);
 
     Transit(vk::ImageLayout::eGeneral,
             vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {});
 }
 
-void Image::CopyMip(const Image& image, u32 mip, u32 slice) {
+void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) {
+    const auto& src_info = src_image.info;
+
+    vk::BufferImageCopy buffer_image_copy = {
+        .bufferOffset = offset,
+        .bufferRowLength = 0,
+        .bufferImageHeight = 0,
+        .imageSubresource =
+            {
+                .aspectMask = src_info.IsDepthStencil() ? 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index dff4e5a5f..9dea5ceea 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -549,7 +549,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
             const auto* gds_buf = buffer_cache.GetGdsBuffer();
             buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes());
         } else if (desc.buffer_type == Shader::BufferType::Flatbuf) {
-            auto& vk_buffer = buffer_cache.GetStreamBuffer();
+            auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
             const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
             const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size,
                                               instance.UniformMinAlignment());
@@ -561,7 +561,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
             const auto* fault_buffer = buffer_cache.GetFaultBuffer();
             buffer_infos.emplace_back(fault_buffer->Handle(), 0, fault_buffer->SizeBytes());
         } else if (desc.buffer_type == Shader::BufferType::SharedMemory) {
-            auto& lds_buffer = buffer_cache.GetStreamBuffer();
+            auto& lds_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
             const auto& cs_program = liverpool->GetCsRegs();
             const auto lds_size = cs_program.SharedMemSize() * cs_program.NumWorkgroups();
             const auto [data, offset] =
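Both rewritten call sites stage transient per-draw data, so they map to `MemoryUsage::Stream` and resolve to the same buffer the removed `GetStreamBuffer()` returned. The shared staging pattern, using the Flatbuf operands from the hunk above (the final `emplace_back` is an assumption about the elided remainder of that hunk):

```cpp
// Stage the flattened user-data block and record {handle, offset, size} for the
// descriptor write; Copy() returns the ring offset the data was written at.
auto& vk_buffer = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::Stream);
const u32 ubo_size = stage.flattened_ud_buf.size() * sizeof(u32);
const u64 offset = vk_buffer.Copy(stage.flattened_ud_buf.data(), ubo_size,
                                  instance.UniformMinAlignment());
buffer_infos.emplace_back(vk_buffer.Handle(), offset, ubo_size);
```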
diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp
index d8070da61..ab9111e6b 100644
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@@ -312,42 +312,121 @@ void Image::Upload(vk::Buffer buffer, u64 offset) {
             vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {});
 }
 
-void Image::CopyImage(const Image& image) {
+void Image::CopyImage(const Image& src_image) {
     scheduler->EndRendering();
     Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {});
 
     auto cmdbuf = scheduler->CommandBuffer();
 
+    const auto& src_info = src_image.info;
     boost::container::small_vector<vk::ImageCopy, 14> image_copy{};
-    for (u32 m = 0; m < image.info.resources.levels; ++m) {
-        const auto mip_w = std::max(image.info.size.width >> m, 1u);
-        const auto mip_h = std::max(image.info.size.height >> m, 1u);
-        const auto mip_d = std::max(image.info.size.depth >> m, 1u);
+    const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels);
+    for (u32 m = 0; m < num_mips; ++m) {
+        const auto mip_w = std::max(src_info.size.width >> m, 1u);
+        const auto mip_h = std::max(src_info.size.height >> m, 1u);
+        const auto mip_d = std::max(src_info.size.depth >> m, 1u);
         image_copy.emplace_back(vk::ImageCopy{
             .srcSubresource{
-                .aspectMask = image.aspect_mask,
+                .aspectMask = src_image.aspect_mask,
                 .mipLevel = m,
                 .baseArrayLayer = 0,
-                .layerCount = image.info.resources.layers,
+                .layerCount = src_info.resources.layers,
             },
             .dstSubresource{
-                .aspectMask = image.aspect_mask,
+                .aspectMask = src_image.aspect_mask,
                 .mipLevel = m,
                 .baseArrayLayer = 0,
-                .layerCount = image.info.resources.layers,
+                .layerCount = src_info.resources.layers,
             },
             .extent = {mip_w, mip_h, mip_d},
         });
     }
-    cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout,
+    cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout,
                      image_copy);
 
     Transit(vk::ImageLayout::eGeneral,
             vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {});
 }
 
-void Image::CopyMip(const Image& image, u32 mip, u32 slice) {
+void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) {
+    const auto& src_info = src_image.info;
+
+    vk::BufferImageCopy buffer_image_copy = {
+        .bufferOffset = offset,
+        .bufferRowLength = 0,
+        .bufferImageHeight = 0,
+        .imageSubresource =
+            {
+                .aspectMask = src_info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth
+                                                        : vk::ImageAspectFlagBits::eColor,
+                .mipLevel = 0,
+                .baseArrayLayer = 0,
+                .layerCount = 1,
+            },
+        .imageOffset =
+            {
+                .x = 0,
+                .y = 0,
+                .z = 0,
+            },
+        .imageExtent =
+            {
+                .width = src_info.size.width,
+                .height = src_info.size.height,
+                .depth = src_info.size.depth,
+            },
+    };
+
+    const vk::BufferMemoryBarrier2 pre_copy_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .buffer = buffer,
+        .offset = offset,
+        .size = VK_WHOLE_SIZE,
+    };
+
+    const vk::BufferMemoryBarrier2 post_copy_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferRead,
+        .buffer = buffer,
+        .offset = offset,
+        .size = VK_WHOLE_SIZE,
+    };
+
+    scheduler->EndRendering();
+    src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {});
+    Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {});
+
+    auto cmdbuf = scheduler->CommandBuffer();
+
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &pre_copy_barrier,
+    });
+
+    cmdbuf.copyImageToBuffer(src_image.image, vk::ImageLayout::eTransferSrcOptimal, buffer,
+                             buffer_image_copy);
+
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &post_copy_barrier,
+    });
+
+    buffer_image_copy.imageSubresource.aspectMask =
+        info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor;
+
+    cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal,
+                             buffer_image_copy);
+}
+
+void Image::CopyMip(const Image& src_image, u32 mip, u32 slice) {
     scheduler->EndRendering();
     Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {});
 
@@ -357,26 +436,27 @@ void Image::CopyMip(const Image& image, u32 mip, u32 slice) {
     const auto mip_h = std::max(info.size.height >> mip, 1u);
     const auto mip_d = std::max(info.size.depth >> mip, 1u);
 
-    ASSERT(mip_w == image.info.size.width);
-    ASSERT(mip_h == image.info.size.height);
+    const auto& src_info = src_image.info;
+    ASSERT(mip_w == src_info.size.width);
+    ASSERT(mip_h == src_info.size.height);
 
-    const u32 num_layers = std::min(image.info.resources.layers, info.resources.layers);
+    const u32 num_layers = std::min(src_info.resources.layers, info.resources.layers);
     const vk::ImageCopy image_copy{
         .srcSubresource{
-            .aspectMask = image.aspect_mask,
+            .aspectMask = src_image.aspect_mask,
             .mipLevel = 0,
             .baseArrayLayer = 0,
             .layerCount = num_layers,
         },
         .dstSubresource{
-            .aspectMask = image.aspect_mask,
+            .aspectMask = src_image.aspect_mask,
             .mipLevel = mip,
             .baseArrayLayer = slice,
             .layerCount = num_layers,
         },
         .extent = {mip_w, mip_h, mip_d},
     };
-    cmdbuf.copyImage(image.image, image.last_state.layout, this->image, this->last_state.layout,
+    cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout,
                      image_copy);
 
     Transit(vk::ImageLayout::eGeneral,
diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h
index 404e25e88..31b67e021 100644
--- a/src/video_core/texture_cache/image.h
+++ b/src/video_core/texture_cache/image.h
@@ -104,7 +104,8 @@ struct Image {
                  std::optional<SubresourceRange> range, vk::CommandBuffer cmdbuf = {});
     void Upload(vk::Buffer buffer, u64 offset);
 
-    void CopyImage(const Image& image);
+    void CopyImage(const Image& src_image);
+    void CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset);
     void CopyMip(const Image& src_image, u32 mip, u32 slice);
 
     bool IsTracked() {
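`vkCmdCopyImage` cannot copy between images with incompatible formats and aspects, which rules out direct depth<->color transfers; `CopyImageWithBuffer` therefore round-trips the base mip through an intermediate buffer, with the pre/post buffer barriers ordering the image-to-buffer write against earlier transfer reads and against the subsequent buffer-to-image read. A usage sketch matching the call added in texture_cache.cpp below (image names illustrative):

```cpp
// Reinterpret a cached depth image's texels as color data (or vice versa) by
// bouncing them through the device-local utility buffer. Only mip 0 / layer 0
// is transferred, per the copy region CopyImageWithBuffer sets up.
auto& scratch = buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::DeviceLocal);
new_image.CopyImageWithBuffer(cache_image, scratch.Handle(), /*offset=*/0);
```

The buffer is assumed to be large enough to hold the source's base level, and the aspect mask is switched between the two halves of the round trip so each copy targets the correct plane.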
diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp
index f070b9132..a47e858ab 100644
--- a/src/video_core/texture_cache/texture_cache.cpp
+++ b/src/video_core/texture_cache/texture_cache.cpp
@@ -8,7 +8,6 @@
 #include "common/debug.h"
 #include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/page_manager.h"
-#include "video_core/renderer_vulkan/liverpool_to_vk.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/texture_cache/host_compatibility.h"
@@ -126,7 +125,7 @@ void TextureCache::UnmapMemory(VAddr cpu_addr, size_t size) {
 
 ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, BindingType binding,
                                           ImageId cache_image_id) {
-    const auto& cache_image = slot_images[cache_image_id];
+    auto& cache_image = slot_images[cache_image_id];
 
     if (!cache_image.info.IsDepthStencil() && !requested_info.IsDepthStencil()) {
         return {};
@@ -169,18 +168,21 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi
     }
 
     if (recreate) {
-        auto new_info{requested_info};
-        new_info.resources = std::max(requested_info.resources, cache_image.info.resources);
-        new_info.UpdateSize();
+        auto new_info = requested_info;
+        new_info.resources = std::min(requested_info.resources, cache_image.info.resources);
         const auto new_image_id = slot_images.insert(instance, scheduler, new_info);
         RegisterImage(new_image_id);
 
         // Inherit image usage
-        auto& new_image = GetImage(new_image_id);
+        auto& new_image = slot_images[new_image_id];
         new_image.usage = cache_image.usage;
+        new_image.flags &= ~ImageFlagBits::Dirty;
 
-        // TODO: perform a depth copy here
+        // Perform depth<->color copy using the intermediate copy buffer.
+        const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal);
+        new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0);
+        // Free the cache image.
         FreeImage(cache_image_id);
         return new_image_id;
     }
@@ -461,9 +463,9 @@ ImageView& TextureCache::FindDepthTarget(BaseDesc& desc) {
     const ImageId image_id = FindImage(desc);
     Image& image = slot_images[image_id];
     image.flags |= ImageFlagBits::GpuModified;
-    image.flags &= ~ImageFlagBits::Dirty;
     image.usage.depth_target = 1u;
     image.usage.stencil = image.info.HasStencil();
+    UpdateImage(image_id);
 
     // Register meta data for this depth buffer
     if (!(image.flags & ImageFlagBits::MetaRegistered)) {
@@ -584,12 +586,11 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
     const VAddr image_addr = image.info.guest_address;
     const size_t image_size = image.info.guest_size;
 
-    const auto [vk_buffer, buf_offset] =
-        buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty);
+    const auto [vk_buffer, buf_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size);
 
     const auto cmdbuf = sched_ptr->CommandBuffer();
-    // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW
-    // hazard
+
+    // The obtained buffer may be GPU modified so we need to emit a barrier to prevent RAW hazard
     if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
                                              vk::PipelineStageFlagBits2::eTransfer)) {
         cmdbuf.pipelineBarrier2(vk::DependencyInfo{
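Because the recreated image now receives its contents from the GPU-side copy, its `Dirty` flag is cleared up front so a later `RefreshImage` does not clobber the copied data with stale guest memory; `FindDepthTarget` likewise trades the manual flag clear for a full `UpdateImage`. For reference, the barrier-guard pattern that closes `RefreshImage`, sketched assuming `GetBarrier` returns a `std::optional` barrier and with the `DependencyInfo` fields filled in by assumption (the hunk ends before them):

```cpp
// GetBarrier() yields a barrier only when the buffer was last written by the
// GPU, so the transfer-read barrier is emitted just for the RAW-hazard case.
if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
                                         vk::PipelineStageFlagBits2::eTransfer)) {
    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
        .bufferMemoryBarrierCount = 1,
        .pBufferMemoryBarriers = &*barrier,
    });
}
```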