diff --git a/cmake/DetectQtInstallation.cmake b/cmake/DetectQtInstallation.cmake
index e95e8980f..650cc9745 100644
--- a/cmake/DetectQtInstallation.cmake
+++ b/cmake/DetectQtInstallation.cmake
@@ -1,14 +1,28 @@
 # SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 # SPDX-License-Identifier: GPL-2.0-or-later
 
-file(GLOB QT_KITS LIST_DIRECTORIES true "C:/Qt/*/msvc*_64")
-list(SORT QT_KITS COMPARE NATURAL)
-list(REVERSE QT_KITS)
-if(QT_KITS)
-    list(GET QT_KITS 0 QT_PREFIX)
-    set(CMAKE_PREFIX_PATH "${QT_PREFIX}" CACHE PATH "Qt prefix auto‑detected" FORCE)
-    message(STATUS "Auto-detected Qt prefix: ${QT_PREFIX}")
-else()
-    message(STATUS "findQt.cmake: no Qt‑Directory found in C:/Qt – please set CMAKE_PREFIX_PATH manually")
-endif()
+set(highest_version "0")
+set(CANDIDATE_DRIVES A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)
+foreach(drive ${CANDIDATE_DRIVES})
+    file(GLOB kits LIST_DIRECTORIES true CONFIGURE_DEPENDS "${drive}:/Qt/*/msvc*_64")
+    foreach(kit IN LISTS kits)
+        get_filename_component(version_dir "${kit}" DIRECTORY)
+        get_filename_component(kit_version "${version_dir}" NAME)
+
+        message(STATUS "DetectQtInstallation.cmake: Detected Qt: ${kit}")
+
+        if (kit_version VERSION_GREATER highest_version)
+            set(highest_version "${kit_version}")
+            set(QT_PREFIX "${kit}")
+
+        endif()
+    endforeach()
+endforeach()
+
+if(QT_PREFIX)
+    set(CMAKE_PREFIX_PATH "${QT_PREFIX}" CACHE PATH "Qt prefix auto-detected" FORCE)
+    message(STATUS "DetectQtInstallation.cmake: Chose newest Qt: ${QT_PREFIX}")
+else()
+    message(STATUS "DetectQtInstallation.cmake: No Qt directory found in <drive>:/Qt - please set CMAKE_PREFIX_PATH manually")
+endif()
diff --git a/externals/sirit b/externals/sirit
index 6b450704f..b4eccb336 160000
--- a/externals/sirit
+++ b/externals/sirit
@@ -1 +1 @@
-Subproject commit 6b450704f6fedb9413d0c89a9eb59d028eb1e6c0
+Subproject commit b4eccb336f1b1169af48dac1e04015985af86e3e
diff --git a/src/common/config.cpp b/src/common/config.cpp
index d3a5fa6a1..6f8563377 100644
--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@@ -60,11 +60,14 @@ static bool overrideControllerColor = false;
 static int controllerCustomColorRGB[3] = {0, 0, 255};
 
 // GPU
-static u32 screenWidth = 1280;
-static u32 screenHeight = 720;
+static u32 windowWidth = 1280;
+static u32 windowHeight = 720;
+static u32 internalScreenWidth = 1280;
+static u32 internalScreenHeight = 720;
 static bool isNullGpu = false;
 static bool shouldCopyGPUBuffers = false;
 static bool readbacksEnabled = false;
+static bool readbackLinearImagesEnabled = false;
 static bool directMemoryAccessEnabled = false;
 static bool shouldDumpShaders = false;
 static bool shouldPatchShaders = false;
@@ -103,7 +106,7 @@ u32 m_language = 1; // english
 static std::string trophyKey = "";
 
 // Expected number of items in the config file
-static constexpr u64 total_entries = 51;
+static constexpr u64 total_entries = 54;
 
 bool allowHDR() {
     return isHDRAllowed;
@@ -194,12 +197,20 @@ double getTrophyNotificationDuration() {
     return trophyNotificationDuration;
 }
 
-u32 getScreenWidth() {
-    return screenWidth;
+u32 getWindowWidth() {
+    return windowWidth;
 }
 
-u32 getScreenHeight() {
-    return screenHeight;
+u32 getWindowHeight() {
+    return windowHeight;
+}
+
+u32 getInternalScreenWidth() {
+    return internalScreenWidth;
+}
+
+u32 getInternalScreenHeight() {
+    return internalScreenHeight;
 }
 
 s32 getGpuId() {
@@ -262,6 +273,10 @@ bool readbacks() {
     return readbacksEnabled;
 }
 
+bool readbackLinearImages() {
+    return readbackLinearImagesEnabled;
+}
+
 bool directMemoryAccess() {
     return directMemoryAccessEnabled;
 }
@@ -334,12 +349,20 @@ void setGpuId(s32 selectedGpuId) {
     gpuId = selectedGpuId;
 }
 
-void setScreenWidth(u32 width) {
-    screenWidth = width;
+void setWindowWidth(u32 width) {
+    windowWidth = width;
 }
 
-void setScreenHeight(u32 height) {
-    screenHeight = height;
+void setWindowHeight(u32 height) {
+    windowHeight = height;
+}
+
+void setInternalScreenWidth(u32 width) {
+    internalScreenWidth = width;
+}
+
+void setInternalScreenHeight(u32 height) {
+    internalScreenHeight = height;
 }
 
 void setDebugDump(bool enable) {
@@ -421,6 +444,7 @@ void setCursorState(s16 newCursorState) {
 void setCursorHideTimeout(int newcursorHideTimeout) {
     cursorHideTimeout = newcursorHideTimeout;
 }
+
 void setTrophyNotificationDuration(double newTrophyNotificationDuration) {
     trophyNotificationDuration = newTrophyNotificationDuration;
 }
@@ -626,11 +650,16 @@ void load(const std::filesystem::path& path) {
     if (data.contains("GPU")) {
         const toml::value& gpu = data.at("GPU");
 
-        screenWidth = toml::find_or(gpu, "screenWidth", screenWidth);
-        screenHeight = toml::find_or(gpu, "screenHeight", screenHeight);
+        windowWidth = toml::find_or(gpu, "screenWidth", windowWidth);
+        windowHeight = toml::find_or(gpu, "screenHeight", windowHeight);
+        internalScreenWidth = toml::find_or(gpu, "internalScreenWidth", internalScreenWidth);
+        internalScreenHeight =
+            toml::find_or(gpu, "internalScreenHeight", internalScreenHeight);
         isNullGpu = toml::find_or(gpu, "nullGpu", isNullGpu);
         shouldCopyGPUBuffers = toml::find_or(gpu, "copyGPUBuffers", shouldCopyGPUBuffers);
         readbacksEnabled = toml::find_or(gpu, "readbacks", readbacksEnabled);
+        readbackLinearImagesEnabled =
+            toml::find_or(gpu, "readbackLinearImages", readbackLinearImagesEnabled);
         directMemoryAccessEnabled =
             toml::find_or(gpu, "directMemoryAccess", directMemoryAccessEnabled);
         shouldDumpShaders = toml::find_or(gpu, "dumpShaders", shouldDumpShaders);
@@ -797,11 +826,14 @@ void save(const std::filesystem::path& path) {
     data["Input"]["specialPadClass"] = specialPadClass;
     data["Input"]["isMotionControlsEnabled"] = isMotionControlsEnabled;
     data["Input"]["useUnifiedInputConfig"] = useUnifiedInputConfig;
-    data["GPU"]["screenWidth"] = screenWidth;
-    data["GPU"]["screenHeight"] = screenHeight;
+    data["GPU"]["screenWidth"] = windowWidth;
+    data["GPU"]["screenHeight"] = windowHeight;
+    data["GPU"]["internalScreenWidth"] = internalScreenWidth;
+    data["GPU"]["internalScreenHeight"] = internalScreenHeight;
     data["GPU"]["nullGpu"] = isNullGpu;
     data["GPU"]["copyGPUBuffers"] = shouldCopyGPUBuffers;
     data["GPU"]["readbacks"] = readbacksEnabled;
+    data["GPU"]["readbackLinearImages"] = readbackLinearImagesEnabled;
     data["GPU"]["directMemoryAccess"] = directMemoryAccessEnabled;
     data["GPU"]["dumpShaders"] = shouldDumpShaders;
     data["GPU"]["patchShaders"] = shouldPatchShaders;
@@ -897,11 +929,14 @@ void setDefaultValues() {
     controllerCustomColorRGB[2] = 255;
 
     // GPU
-    screenWidth = 1280;
-    screenHeight = 720;
+    windowWidth = 1280;
+    windowHeight = 720;
+    internalScreenWidth = 1280;
+    internalScreenHeight = 720;
     isNullGpu = false;
    shouldCopyGPUBuffers = false;
     readbacksEnabled = false;
+    readbackLinearImagesEnabled = false;
     directMemoryAccessEnabled = false;
     shouldDumpShaders = false;
     shouldPatchShaders = false;
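Note: the single screenWidth/screenHeight pair is now split into a host window size and a guest-visible internal resolution (the old "screenWidth"/"screenHeight" TOML keys are reused for the window so existing configs still load). A minimal sketch of how the two pairs are meant to be consumed — the Config calls are the accessors added above, the surrounding function is illustrative only:

    #include "common/config.h"

    // Hypothetical presenter setup: the guest renders at the internal
    // resolution, while the host window/swapchain uses the window size.
    void SetupPresentation() {
        const u32 internal_w = Config::getInternalScreenWidth();  // guest framebuffer
        const u32 internal_h = Config::getInternalScreenHeight();
        const u32 window_w = Config::getWindowWidth();            // host window
        const u32 window_h = Config::getWindowHeight();
        // A presenter would scale internal_w x internal_h up/down to
        // window_w x window_h at flip time.
    }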
diff --git a/src/common/config.h b/src/common/config.h
index 931fa68e2..e54425676 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -25,10 +25,14 @@ bool getIsFullscreen();
 void setIsFullscreen(bool enable);
 std::string getFullscreenMode();
 void setFullscreenMode(std::string mode);
-u32 getScreenWidth();
-u32 getScreenHeight();
-void setScreenWidth(u32 width);
-void setScreenHeight(u32 height);
+u32 getWindowWidth();
+u32 getWindowHeight();
+void setWindowWidth(u32 width);
+void setWindowHeight(u32 height);
+u32 getInternalScreenWidth();
+u32 getInternalScreenHeight();
+void setInternalScreenWidth(u32 width);
+void setInternalScreenHeight(u32 height);
 bool debugDump();
 void setDebugDump(bool enable);
 s32 getGpuId();
@@ -47,6 +51,7 @@ bool copyGPUCmdBuffers();
 void setCopyGPUCmdBuffers(bool enable);
 bool readbacks();
 void setReadbacks(bool enable);
+bool readbackLinearImages();
 bool directMemoryAccess();
 void setDirectMemoryAccess(bool enable);
 bool dumpShaders();
diff --git a/src/core/address_space.cpp b/src/core/address_space.cpp
index 2e29f70ee..846bb5eb4 100644
--- a/src/core/address_space.cpp
+++ b/src/core/address_space.cpp
@@ -358,9 +358,17 @@ enum PosixPageProtection {
 [[nodiscard]] constexpr PosixPageProtection ToPosixProt(Core::MemoryProt prot) {
     if (True(prot & Core::MemoryProt::CpuReadWrite) ||
         True(prot & Core::MemoryProt::GpuReadWrite)) {
-        return PAGE_READWRITE;
+        if (True(prot & Core::MemoryProt::CpuExec)) {
+            return PAGE_EXECUTE_READWRITE;
+        } else {
+            return PAGE_READWRITE;
+        }
     } else if (True(prot & Core::MemoryProt::CpuRead) || True(prot & Core::MemoryProt::GpuRead)) {
-        return PAGE_READONLY;
+        if (True(prot & Core::MemoryProt::CpuExec)) {
+            return PAGE_EXECUTE_READ;
+        } else {
+            return PAGE_READONLY;
+        }
     } else {
         return PAGE_NOACCESS;
     }
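For review convenience, a standalone model of the ToPosixProt change (assumed stand-in values: MemoryProt bits from src/core/memory.h, PAGE_* numbers from the Win32 headers; note that GpuRead alone already landed in the read-write tier before this patch, which is preserved here):

    #include <cassert>
    #include <cstdint>

    enum : uint32_t { CpuRead = 1, CpuReadWrite = 2, CpuExec = 4,
                      GpuRead = 16, GpuWrite = 32, GpuReadWrite = 48 };

    uint32_t ToPageProt(uint32_t prot) {
        if ((prot & CpuReadWrite) || (prot & GpuReadWrite)) {
            return (prot & CpuExec) ? 0x40 : 0x04; // PAGE_EXECUTE_READWRITE : PAGE_READWRITE
        } else if ((prot & CpuRead) || (prot & GpuRead)) {
            return (prot & CpuExec) ? 0x20 : 0x02; // PAGE_EXECUTE_READ : PAGE_READONLY
        }
        return 0x01; // PAGE_NOACCESS
    }

    int main() {
        assert(ToPageProt(CpuReadWrite | CpuExec) == 0x40);
        assert(ToPageProt(CpuRead | CpuExec) == 0x20);
        assert(ToPageProt(CpuExec) == 0x01); // exec with no read/write stays inaccessible
    }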
diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp
index 8512858e9..e4f65cd31 100644
--- a/src/core/cpu_patches.cpp
+++ b/src/core/cpu_patches.cpp
@@ -163,7 +163,9 @@ static void GenerateEXTRQ(void* /* address */, const ZydisDecodedOperand* operan
         mask = (1ULL << length) - 1;
     }
 
-    ASSERT_MSG(length + index <= 64, "length + index must be less than or equal to 64.");
+    if (length + index > 64) {
+        mask = 0xFFFF'FFFF'FFFF'FFFF;
+    }
 
     // Get lower qword from xmm register
     c.vmovq(scratch1, xmm_dst);
@@ -177,8 +179,8 @@ static void GenerateEXTRQ(void* /* address */, const ZydisDecodedOperand* operan
     c.mov(scratch2, mask);
     c.and_(scratch1, scratch2);
 
-    // Writeback to xmm register, extrq instruction says top 64-bits are undefined so we don't
-    // care to preserve them
+    // Writeback to xmm register, extrq instruction says top 64-bits are undefined but zeroed on
+    // AMD CPUs
     c.vmovq(xmm_dst, scratch1);
 
     c.pop(scratch2);
@@ -287,7 +289,9 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper
         mask_value = (1ULL << length) - 1;
     }
 
-    ASSERT_MSG(length + index <= 64, "length + index must be less than or equal to 64.");
+    if (length + index > 64) {
+        mask_value = 0xFFFF'FFFF'FFFF'FFFF;
+    }
 
     c.vmovq(scratch1, xmm_src);
     c.vmovq(scratch2, xmm_dst);
@@ -307,8 +311,9 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper
     // dst |= src
     c.or_(scratch2, scratch1);
 
-    // Insert scratch2 into low 64 bits of dst, upper 64 bits are unaffected
-    c.vpinsrq(xmm_dst, xmm_dst, scratch2, 0);
+    // Insert scratch2 into low 64 bits of dst, upper 64 bits are undefined but zeroed on AMD
+    // CPUs
+    c.vmovq(xmm_dst, scratch2);
 
     c.pop(mask);
     c.pop(scratch2);
@@ -374,7 +379,7 @@ static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* oper
     c.and_(scratch2, mask);
     c.or_(scratch2, scratch1);
 
-    // Upper 64 bits are undefined in insertq
+    // Upper 64 bits are undefined in insertq but AMD CPUs zero them
     c.vmovq(xmm_dst, scratch2);
 
     c.pop(mask);
@@ -635,6 +640,7 @@ static bool TryExecuteIllegalInstruction(void* ctx, void* code_address) {
         lowQWordDst >>= index;
         lowQWordDst &= mask;
 
+        memset((u8*)dst + sizeof(u64), 0, sizeof(u64));
        memcpy(dst, &lowQWordDst, sizeof(lowQWordDst));
 
         Common::IncrementRip(ctx, 4);
@@ -675,6 +681,7 @@ static bool TryExecuteIllegalInstruction(void* ctx, void* code_address) {
         lowQWordDst &= ~(mask << index);
         lowQWordDst |= lowQWordSrc << index;
 
+        memset((u8*)dst + sizeof(u64), 0, sizeof(u64));
         memcpy(dst, &lowQWordDst, sizeof(lowQWordDst));
 
         Common::IncrementRip(ctx, 4);
diff --git a/src/core/libraries/kernel/kernel.cpp b/src/core/libraries/kernel/kernel.cpp
index 61d2e2f2b..a4d3accac 100644
--- a/src/core/libraries/kernel/kernel.cpp
+++ b/src/core/libraries/kernel/kernel.cpp
@@ -6,6 +6,7 @@
 #include "common/assert.h"
 #include "common/debug.h"
+#include "common/elf_info.h"
 #include "common/logging/log.h"
 #include "common/polyfill_thread.h"
 #include "common/thread.h"
@@ -243,6 +244,19 @@ s32 PS4_SYSV_ABI sceKernelSetGPO() {
     return ORBIS_OK;
 }
 
+s32 PS4_SYSV_ABI sceKernelGetSystemSwVersion(SwVersionStruct* ret) {
+    if (ret == nullptr) {
+        return ORBIS_OK; // but why?
+    }
+    ASSERT(ret->struct_size == 40);
+    u32 fake_fw = Common::ElfInfo::Instance().RawFirmwareVer();
+    ret->hex_representation = fake_fw;
+    std::snprintf(ret->text_representation, 28, "%2x.%03x.%03x", fake_fw >> 0x18,
+                  fake_fw >> 0xc & 0xfff, fake_fw & 0xfff); // why %2x?
+    LOG_INFO(Lib_Kernel, "called, returned sw version: {}", ret->text_representation);
+    return ORBIS_OK;
+}
+
 void RegisterKernel(Core::Loader::SymbolsResolver* sym) {
     service_thread = std::jthread{KernelServiceThread};
 
@@ -258,6 +272,7 @@ void RegisterKernel(Core::Loader::SymbolsResolver* sym) {
     Libraries::Kernel::RegisterDebug(sym);
 
     LIB_OBJ("f7uOxY9mM1U", "libkernel", 1, "libkernel", 1, 1, &g_stack_chk_guard);
+    LIB_FUNCTION("Mv1zUObHvXI", "libkernel", 1, "libkernel", 1, 1, sceKernelGetSystemSwVersion);
     LIB_FUNCTION("PfccT7qURYE", "libkernel", 1, "libkernel", 1, 1, kernel_ioctl);
     LIB_FUNCTION("JGfTMBOdUJo", "libkernel", 1, "libkernel", 1, 1, sceKernelGetFsSandboxRandomWord);
     LIB_FUNCTION("6xVpy0Fdq+I", "libkernel", 1, "libkernel", 1, 1, _sigprocmask);
diff --git a/src/core/libraries/kernel/kernel.h b/src/core/libraries/kernel/kernel.h
index 0529c06d5..018759e14 100644
--- a/src/core/libraries/kernel/kernel.h
+++ b/src/core/libraries/kernel/kernel.h
@@ -35,6 +35,12 @@ struct OrbisWrapperImpl {
 
 s32* PS4_SYSV_ABI __Error();
 
+struct SwVersionStruct {
+    u64 struct_size;
+    char text_representation[0x1c];
+    u32 hex_representation;
+};
+
 void RegisterKernel(Core::Loader::SymbolsResolver* sym);
 
 } // namespace Libraries::Kernel
diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp
index 8153b7610..e0c359f2c 100644
--- a/src/core/libraries/kernel/memory.cpp
+++ b/src/core/libraries/kernel/memory.cpp
@@ -573,11 +573,12 @@ void* PS4_SYSV_ABI posix_mmap(void* addr, u64 len, s32 prot, s32 flags, s32 fd,
     auto* memory = Core::Memory::Instance();
     const auto mem_prot = static_cast<Core::MemoryProt>(prot);
     const auto mem_flags = static_cast<Core::MemoryMapFlags>(flags);
+    const auto is_exec = True(mem_prot & Core::MemoryProt::CpuExec);
 
     s32 result = ORBIS_OK;
     if (fd == -1) {
         result = memory->MapMemory(&addr_out, std::bit_cast<VAddr>(addr), len, mem_prot, mem_flags,
-                                   Core::VMAType::Flexible);
+                                   Core::VMAType::Flexible, "anon", is_exec);
     } else {
         result = memory->MapFile(&addr_out, std::bit_cast<VAddr>(addr), len, mem_prot, mem_flags,
                                  fd, phys_addr);
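The cpu_patches.cpp change above replaces the hard assert with mask saturation. A scalar model of the patched EXTRQ low-qword behavior (a sketch; field semantics assumed per the AMD SSE4a reference, with a length of 0 treated as "select everything", which the emulator handles in the mask setup earlier in the function):

    #include <cassert>
    #include <cstdint>

    uint64_t ExtrqLow64(uint64_t src, unsigned length, unsigned index) {
        uint64_t mask = (length == 0) ? ~0ULL : ((1ULL << length) - 1);
        if (length + index > 64) {
            mask = ~0ULL; // saturate instead of asserting, matching the patch
        }
        return (src >> index) & mask;
    }

    int main() {
        assert(ExtrqLow64(0xAABBCCDD'11223344, 8, 8) == 0x33);
        assert(ExtrqLow64(0xAABBCCDD'11223344, 60, 32) == 0xAABBCCDD); // 60 + 32 > 64
    }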
diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp
index da715b3bf..0f961923a 100644
--- a/src/core/libraries/videoout/video_out.cpp
+++ b/src/core/libraries/videoout/video_out.cpp
@@ -445,7 +445,8 @@ s32 PS4_SYSV_ABI sceVideoOutConfigureOutputMode_(s32 handle, u32 reserved, const
 }
 
 void RegisterLib(Core::Loader::SymbolsResolver* sym) {
-    driver = std::make_unique<VideoOutDriver>(Config::getScreenWidth(), Config::getScreenHeight());
+    driver = std::make_unique<VideoOutDriver>(Config::getInternalScreenWidth(),
+                                              Config::getInternalScreenHeight());
 
     LIB_FUNCTION("SbU3dwp80lQ", "libSceVideoOut", 1, "libSceVideoOut", 0, 0,
                  sceVideoOutGetFlipStatus);
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index e7ecf8d80..3d9bf58a7 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -631,6 +631,9 @@ s64 MemoryManager::ProtectBytes(VAddr addr, VirtualMemoryArea vma_base, u64 size
     if (True(prot & MemoryProt::CpuReadWrite)) {
         perms |= Core::MemoryPermission::ReadWrite;
     }
+    if (True(prot & MemoryProt::CpuExec)) {
+        perms |= Core::MemoryPermission::Execute;
+    }
     if (True(prot & MemoryProt::GpuRead)) {
         perms |= Core::MemoryPermission::Read;
     }
@@ -650,9 +653,9 @@ s32 MemoryManager::Protect(VAddr addr, u64 size, MemoryProt prot) {
     std::scoped_lock lk{mutex};
 
     // Validate protection flags
-    constexpr static MemoryProt valid_flags = MemoryProt::NoAccess | MemoryProt::CpuRead |
-                                              MemoryProt::CpuReadWrite | MemoryProt::GpuRead |
-                                              MemoryProt::GpuWrite | MemoryProt::GpuReadWrite;
+    constexpr static MemoryProt valid_flags =
+        MemoryProt::NoAccess | MemoryProt::CpuRead | MemoryProt::CpuReadWrite |
+        MemoryProt::CpuExec | MemoryProt::GpuRead | MemoryProt::GpuWrite | MemoryProt::GpuReadWrite;
 
     MemoryProt invalid_flags = prot & ~valid_flags;
     if (invalid_flags != MemoryProt::NoAccess) {
diff --git a/src/core/memory.h b/src/core/memory.h
index c800ef763..285d7dbed 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -31,6 +31,7 @@ enum class MemoryProt : u32 {
     NoAccess = 0,
     CpuRead = 1,
     CpuReadWrite = 2,
+    CpuExec = 4,
     GpuRead = 16,
     GpuWrite = 32,
     GpuReadWrite = 48,
diff --git a/src/emulator.cpp b/src/emulator.cpp
index fbab5929b..480ceee0b 100644
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@@ -133,6 +133,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector<std::string> ar
     LOG_INFO(Config, "General isNeo: {}", Config::isNeoModeConsole());
     LOG_INFO(Config, "GPU isNullGpu: {}", Config::nullGpu());
     LOG_INFO(Config, "GPU readbacks: {}", Config::readbacks());
+    LOG_INFO(Config, "GPU readbackLinearImages: {}", Config::readbackLinearImages());
     LOG_INFO(Config, "GPU directMemoryAccess: {}", Config::directMemoryAccess());
     LOG_INFO(Config, "GPU shouldDumpShaders: {}", Config::dumpShaders());
     LOG_INFO(Config, "GPU vblankDivider: {}", Config::vblankDiv());
@@ -222,7 +223,7 @@ void Emulator::Run(std::filesystem::path file, const std::vector<std::string> ar
         }
     }
     window = std::make_unique<Frontend::WindowSDL>(
-        Config::getScreenWidth(), Config::getScreenHeight(), controller, window_title);
+        Config::getWindowWidth(), Config::getWindowHeight(), controller, window_title);
 
     g_window = window.get();
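Since MemoryProt is a bit mask, the widened valid_flags check in memory.cpp is easy to probe in isolation (a sketch; stand-in values mirror the MemoryProt enum above):

    #include <cstdint>
    #include <cstdio>

    enum : uint32_t { NoAccess = 0, CpuRead = 1, CpuReadWrite = 2, CpuExec = 4,
                      GpuRead = 16, GpuWrite = 32, GpuReadWrite = 48 };

    bool IsValidProt(uint32_t prot) {
        constexpr uint32_t valid_flags = NoAccess | CpuRead | CpuReadWrite | CpuExec |
                                         GpuRead | GpuWrite | GpuReadWrite;
        return (prot & ~valid_flags) == 0; // any bit outside the mask is rejected
    }

    int main() {
        std::printf("%d\n", IsValidProt(CpuReadWrite | CpuExec)); // 1: accepted after this patch
        std::printf("%d\n", IsValidProt(1u << 10));               // 0: still rejected
    }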
diff --git a/src/qt_gui/settings_dialog.cpp b/src/qt_gui/settings_dialog.cpp
index c9d264587..ed2a17e25 100644
--- a/src/qt_gui/settings_dialog.cpp
+++ b/src/qt_gui/settings_dialog.cpp
@@ -762,8 +762,8 @@ void SettingsDialog::UpdateSettings() {
     m_gui_settings->SetValue(gui::gl_backgroundMusicVolume, ui->BGMVolumeSlider->value());
     Config::setLanguage(languageIndexes[ui->consoleLanguageComboBox->currentIndex()]);
     Config::setEnableDiscordRPC(ui->discordRPCCheckbox->isChecked());
-    Config::setScreenWidth(ui->widthSpinBox->value());
-    Config::setScreenHeight(ui->heightSpinBox->value());
+    Config::setWindowWidth(ui->widthSpinBox->value());
+    Config::setWindowHeight(ui->heightSpinBox->value());
     Config::setVblankDiv(ui->vblankSpinBox->value());
     Config::setDumpShaders(ui->dumpShadersCheckBox->isChecked());
     Config::setNullGpu(ui->nullGpuCheckBox->isChecked());
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
index e37acb2e4..80c8b836b 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp
@@ -54,17 +54,23 @@ Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value,
     });
 }
 
+Id SharedAtomicU64IncDec(EmitContext& ctx, Id offset,
+                         Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) {
+    const Id shift_id{ctx.ConstU32(3U)};
+    const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)};
+    const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)};
+    const Id pointer{ctx.EmitSharedMemoryAccess(ctx.shared_u64, ctx.shared_memory_u64, index)};
+    const auto [scope, semantics]{AtomicArgs(ctx)};
+    return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] {
+        return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics);
+    });
+}
+
 template <bool is_float = false>
 Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value,
                    Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) {
     const auto& buffer = ctx.buffers[handle];
-    const auto type = [&] {
-        if constexpr (is_float) {
-            return ctx.F32[1];
-        } else {
-            return ctx.U32[1];
-        }
-    }();
+    const Id type = is_float ? ctx.F32[1] : ctx.U32[1];
     if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) {
         address = ctx.OpIAdd(ctx.U32[1], address, offset);
     }
@@ -148,42 +154,82 @@ Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax);
 }
 
+Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMax);
+}
+
 Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMax);
 }
 
+Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMax);
+}
+
 Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMin);
 }
 
+Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicUMin);
+}
+
 Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicSMin);
 }
 
+Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicSMin);
+}
+
 Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicAnd);
 }
 
+Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicAnd);
+}
+
 Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicOr);
 }
 
+Id EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicOr);
+}
+
 Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicXor);
 }
 
+Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicXor);
+}
+
 Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value) {
     return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicISub);
 }
 
+Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value) {
+    return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicISub);
+}
+
 Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset) {
     return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement);
 }
 
+Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset) {
+    return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIIncrement);
+}
+
 Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset) {
     return SharedAtomicU32IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement);
 }
 
+Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset) {
+    return SharedAtomicU64IncDec(ctx, offset, &Sirit::Module::OpAtomicIDecrement);
+}
+
 Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) {
     return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd);
 }
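All of the Shared*64 emitters funnel through SharedAtomicU64/SharedAtomicU64IncDec by passing a pointer-to-member of Sirit::Module, so adding an op is a one-liner. A freestanding sketch of the dispatch pattern (toy Module type; the real signatures take the result type Id, pointer, scope, and semantics):

    #include <cstdio>

    struct Module {
        int OpAtomicIIncrement(int type, int ptr, int scope, int sem) { return ptr + 1; }
        int OpAtomicIDecrement(int type, int ptr, int scope, int sem) { return ptr - 1; }
    };

    // Accepts any atomic op with this shape and invokes it on the instance.
    int EmitIncDec(Module& m, int ptr, int (Module::*atomic_func)(int, int, int, int)) {
        return (m.*atomic_func)(/*type=*/0, ptr, /*scope=*/2, /*semantics=*/0);
    }

    int main() {
        Module m;
        std::printf("%d\n", EmitIncDec(m, 41, &Module::OpAtomicIIncrement)); // 42
    }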
UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetThreadBitScalarReg(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetScalarRegister(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetScalarRegister(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetVectorRegister(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetVectorRegister(EmitContext& ctx) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitSetGotoVariable(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -void EmitGetGotoVariable(EmitContext&) { - UNREACHABLE_MSG("Unreachable instruction"); -} - -using PointerType = EmitContext::PointerType; -using PointerSize = EmitContext::PointerSize; - Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { const u32 flatbuf_off_dw = inst->Flags(); if (!Config::directMemoryAccess()) { @@ -180,39 +144,27 @@ Id EmitReadConst(EmitContext& ctx, IR::Inst* inst, Id addr, Id offset) { } } -template -Id ReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { +Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { const auto& buffer = ctx.buffers[handle]; if (const Id offset = buffer.Offset(PointerSize::B32); Sirit::ValidId(offset)) { index = ctx.OpIAdd(ctx.U32[1], index, offset); } - const auto [id, pointer_type] = buffer.Alias(type); - const auto value_type = type == PointerType::U32 ? ctx.U32[1] : ctx.F32[1]; + const auto [id, pointer_type] = buffer.Alias(PointerType::U32); const Id ptr{ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index)}; - const Id result{ctx.OpLoad(value_type, ptr)}; + const Id result{ctx.OpLoad(ctx.U32[1], ptr)}; if (const Id size = buffer.Size(PointerSize::B32); Sirit::ValidId(size)) { const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, size); - return ctx.OpSelect(value_type, in_bounds, result, ctx.u32_zero_value); + return ctx.OpSelect(ctx.U32[1], in_bounds, result, ctx.u32_zero_value); } return result; } -Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { - return ReadConstBuffer(ctx, handle, index); -} - -Id EmitReadStepRate(EmitContext& ctx, int rate_idx) { - const auto index{rate_idx == 0 ? 
PushData::Step0Index : PushData::Step1Index}; - return ctx.OpLoad( - ctx.U32[1], ctx.OpAccessChain(ctx.TypePointer(spv::StorageClass::PushConstant, ctx.U32[1]), - ctx.push_data_block, ctx.ConstU32(index))); -} - -static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { +static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { if (IR::IsPosition(attr)) { ASSERT(attr == IR::Attribute::Position0); const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, index, ctx.ConstU32(0u))}; + const auto pointer{ + ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, ctx.ConstU32(index), ctx.ConstU32(0u))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -222,7 +174,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)}; const auto param = ctx.input_params.at(param_id).id; const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)}; + const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, ctx.ConstU32(index))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -230,7 +182,7 @@ static Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 UNREACHABLE(); } -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { if (ctx.info.l_stage == LogicalStage::Geometry) { return EmitGetAttributeForGeometry(ctx, attr, comp, index); } else if (ctx.info.l_stage == LogicalStage::TessellationControl || @@ -248,18 +200,6 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { if (IR::IsParam(attr)) { const u32 param_index{u32(attr) - u32(IR::Attribute::Param0)}; const auto& param{ctx.input_params.at(param_index)}; - if (param.buffer_handle >= 0) { - const auto step_rate = EmitReadStepRate(ctx, param.id.value); - const auto offset = ctx.OpIAdd( - ctx.U32[1], - ctx.OpIMul( - ctx.U32[1], - ctx.OpUDiv(ctx.U32[1], ctx.OpLoad(ctx.U32[1], ctx.instance_id), step_rate), - ctx.ConstU32(param.num_components)), - ctx.ConstU32(comp)); - return ReadConstBuffer(ctx, param.buffer_handle, offset); - } - Id result; if (param.is_loaded) { // Attribute is either default or manually interpolated. 
The id points to an already @@ -305,10 +245,6 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { return ctx.OpLoad(ctx.U32[1], ctx.vertex_index); case IR::Attribute::InstanceId: return ctx.OpLoad(ctx.U32[1], ctx.instance_id); - case IR::Attribute::InstanceId0: - return EmitReadStepRate(ctx, 0); - case IR::Attribute::InstanceId1: - return EmitReadStepRate(ctx, 1); case IR::Attribute::WorkgroupIndex: return ctx.workgroup_index_id; case IR::Attribute::WorkgroupId: @@ -640,4 +576,36 @@ void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id a UNREACHABLE_MSG("SPIR-V instruction"); } +void EmitGetThreadBitScalarReg(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetThreadBitScalarReg(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetScalarRegister(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetScalarRegister(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetVectorRegister(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetVectorRegister(EmitContext& ctx) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitSetGotoVariable(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + +void EmitGetGotoVariable(EmitContext&) { + UNREACHABLE_MSG("Unreachable instruction"); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 1ac2266bd..37d5d84c9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -108,7 +108,7 @@ Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addres Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, Id cmp_value); -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index); +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp); Id EmitGetTessGenericAttribute(EmitContext& ctx, Id vertex_index, Id attr_index, Id comp_index); @@ -139,15 +139,25 @@ void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicUMax64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicSMax64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicUMin64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMin32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicSMin64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicAnd32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicAnd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicOr32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicOr64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicXor32(EmitContext& ctx, Id offset, Id value); 
+Id EmitSharedAtomicXor64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicInc32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicInc64(EmitContext& ctx, Id offset); Id EmitSharedAtomicDec32(EmitContext& ctx, Id offset); +Id EmitSharedAtomicDec64(EmitContext& ctx, Id offset); Id EmitSharedAtomicISub32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicISub64(EmitContext& ctx, Id offset, Id value); Id EmitCompositeConstructU32x2(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2); Id EmitCompositeConstructU32x3(EmitContext& ctx, IR::Inst* inst, Id e1, Id e2, Id e3); @@ -353,7 +363,7 @@ Id EmitFPIsInf32(EmitContext& ctx, Id value); Id EmitFPIsInf64(EmitContext& ctx, Id value); Id EmitIAdd32(EmitContext& ctx, IR::Inst* inst, Id a, Id b); Id EmitIAdd64(EmitContext& ctx, Id a, Id b); -Id EmitIAddCary32(EmitContext& ctx, Id a, Id b); +Id EmitIAddCarry32(EmitContext& ctx, Id a, Id b); Id EmitISub32(EmitContext& ctx, Id a, Id b); Id EmitISub64(EmitContext& ctx, Id a, Id b); Id EmitSMulHi(EmitContext& ctx, Id a, Id b); @@ -519,8 +529,10 @@ Id EmitLaneId(EmitContext& ctx); Id EmitWarpId(EmitContext& ctx); Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); Id EmitReadFirstLane(EmitContext& ctx, Id value); -Id EmitReadLane(EmitContext& ctx, Id value, u32 lane); +Id EmitReadLane(EmitContext& ctx, Id value, Id lane); Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane); +Id EmitBallot(EmitContext& ctx, Id bit); +Id EmitBallotFindLsb(EmitContext& ctx, Id mask); Id EmitDataAppend(EmitContext& ctx, u32 gds_addr, u32 binding); Id EmitDataConsume(EmitContext& ctx, u32 gds_addr, u32 binding); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index ddc1e7574..01652c1cf 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -60,7 +60,7 @@ Id EmitIAdd64(EmitContext& ctx, Id a, Id b) { return ctx.OpIAdd(ctx.U64, a, b); } -Id EmitIAddCary32(EmitContext& ctx, Id a, Id b) { +Id EmitIAddCarry32(EmitContext& ctx, Id a, Id b) { return ctx.OpIAddCarry(ctx.full_result_u32x2, a, b); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 2d13d09f0..951c76001 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -26,13 +26,20 @@ Id EmitReadFirstLane(EmitContext& ctx, Id value) { return ctx.OpGroupNonUniformBroadcastFirst(ctx.U32[1], SubgroupScope(ctx), value); } -Id EmitReadLane(EmitContext& ctx, Id value, u32 lane) { - return ctx.OpGroupNonUniformBroadcast(ctx.U32[1], SubgroupScope(ctx), value, - ctx.ConstU32(lane)); +Id EmitReadLane(EmitContext& ctx, Id value, Id lane) { + return ctx.OpGroupNonUniformBroadcast(ctx.U32[1], SubgroupScope(ctx), value, lane); } Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane) { return ctx.u32_zero_value; } +Id EmitBallot(EmitContext& ctx, Id bit) { + return ctx.OpGroupNonUniformBallot(ctx.U32[4], SubgroupScope(ctx), bit); +} + +Id EmitBallotFindLsb(EmitContext& ctx, Id mask) { + return ctx.OpGroupNonUniformBallotFindLSB(ctx.U32[1], SubgroupScope(ctx), mask); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 77336c9ec..e16bba755 100644 --- 
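What the two new subgroup ops compute, modeled on a 64-wide wave (the SPIR-V result of OpGroupNonUniformBallot is a uvec4 lane mask; BallotFindLSB returns the lowest set lane). A host-side sketch:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    uint64_t Ballot(const bool (&bit)[64]) {
        uint64_t mask = 0;
        for (int lane = 0; lane < 64; ++lane) {
            mask |= uint64_t{bit[lane]} << lane; // one bit per lane with the condition set
        }
        return mask;
    }

    int BallotFindLsb(uint64_t mask) {
        return mask ? std::countr_zero(mask) : -1; // first active lane
    }

    int main() {
        bool bits[64]{};
        bits[3] = bits[7] = true;
        std::printf("%d\n", BallotFindLsb(Ballot(bits))); // 3
    }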
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 77336c9ec..e16bba755 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -76,6 +76,7 @@ EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_inf
     } else {
         SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450);
     }
+    String(fmt::format("{:#x}", info.pgm_hash));
 
     AddCapability(spv::Capability::Shader);
     DefineArithmeticTypes();
@@ -376,35 +377,13 @@ void EmitContext::DefineInputs() {
                 ASSERT(attrib.semantic < IR::NumParams);
                 const auto sharp = attrib.GetSharp(info);
                 const Id type{GetAttributeType(*this, sharp.GetNumberFmt())[4]};
-                if (attrib.UsesStepRates()) {
-                    const u32 rate_idx =
-                        attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::OverStepRate0 ? 0
-                            : 1;
-                    const u32 num_components = AmdGpu::NumComponents(sharp.GetDataFmt());
-                    const auto buffer =
-                        std::ranges::find_if(info.buffers, [&attrib](const auto& buffer) {
-                            return buffer.instance_attrib == attrib.semantic;
-                        });
-                    // Note that we pass index rather than Id
-                    input_params[attrib.semantic] = SpirvAttribute{
-                        .id = {rate_idx},
-                        .pointer_type = input_u32,
-                        .component_type = U32[1],
-                        .num_components = std::min(attrib.num_elements, num_components),
-                        .is_integer = true,
-                        .is_loaded = false,
-                        .buffer_handle = int(buffer - info.buffers.begin()),
-                    };
+                Id id{DefineInput(type, attrib.semantic)};
+                if (attrib.GetStepRate() != Gcn::VertexAttribute::InstanceIdType::None) {
+                    Name(id, fmt::format("vs_instance_attr{}", attrib.semantic));
                 } else {
-                    Id id{DefineInput(type, attrib.semantic)};
-                    if (attrib.GetStepRate() == Gcn::VertexAttribute::InstanceIdType::Plain) {
-                        Name(id, fmt::format("vs_instance_attr{}", attrib.semantic));
-                    } else {
-                        Name(id, fmt::format("vs_in_attr{}", attrib.semantic));
-                    }
-                    input_params[attrib.semantic] =
-                        GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false);
+                    Name(id, fmt::format("vs_in_attr{}", attrib.semantic));
                 }
+                input_params[attrib.semantic] = GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false);
             }
             break;
         }
@@ -572,7 +551,7 @@ void EmitContext::DefineOutputs() {
         cull_distances =
             DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output);
     }
-    if (stage == Shader::Stage::Local && runtime_info.ls_info.links_with_tcs) {
+    if (stage == Stage::Local) {
         const u32 num_attrs = Common::AlignUp(runtime_info.ls_info.ls_stride, 16) >> 4;
         if (num_attrs > 0) {
            const Id type{TypeArray(F32[4], ConstU32(num_attrs))};
@@ -699,12 +678,10 @@ void EmitContext::DefineOutputs() {
 
 void EmitContext::DefinePushDataBlock() {
     // Create push constants block for instance steps rates
-    const Id struct_type{Name(TypeStruct(U32[1], U32[1], F32[1], F32[1], F32[1], F32[1], U32[4],
-                                         U32[4], U32[4], U32[4], U32[4], U32[4]),
+    const Id struct_type{Name(TypeStruct(F32[1], F32[1], F32[1], F32[1], U32[4], U32[4], U32[4],
+                                         U32[4], U32[4], U32[4], U32[2]),
                               "AuxData")};
     Decorate(struct_type, spv::Decoration::Block);
-    MemberName(struct_type, PushData::Step0Index, "sr0");
-    MemberName(struct_type, PushData::Step1Index, "sr1");
     MemberName(struct_type, PushData::XOffsetIndex, "xoffset");
     MemberName(struct_type, PushData::YOffsetIndex, "yoffset");
     MemberName(struct_type, PushData::XScaleIndex, "xscale");
@@ -715,18 +692,18 @@ void EmitContext::DefinePushDataBlock() {
     MemberName(struct_type, PushData::UdRegsIndex + 3, "ud_regs3");
     MemberName(struct_type, PushData::BufOffsetIndex + 0, "buf_offsets0");
     MemberName(struct_type, PushData::BufOffsetIndex + 1, "buf_offsets1");
-    MemberDecorate(struct_type, PushData::Step0Index, spv::Decoration::Offset, 0U);
-    MemberDecorate(struct_type, PushData::Step1Index, spv::Decoration::Offset, 4U);
-    MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 8U);
-    MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 12U);
-    MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 16U);
-    MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 20U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 24U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 40U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 56U);
-    MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 72U);
-    MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 88U);
-    MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 104U);
+    MemberName(struct_type, PushData::BufOffsetIndex + 2, "buf_offsets2");
+    MemberDecorate(struct_type, PushData::XOffsetIndex, spv::Decoration::Offset, 0U);
+    MemberDecorate(struct_type, PushData::YOffsetIndex, spv::Decoration::Offset, 4U);
+    MemberDecorate(struct_type, PushData::XScaleIndex, spv::Decoration::Offset, 8U);
+    MemberDecorate(struct_type, PushData::YScaleIndex, spv::Decoration::Offset, 12U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 0, spv::Decoration::Offset, 16U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 1, spv::Decoration::Offset, 32U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 2, spv::Decoration::Offset, 48U);
+    MemberDecorate(struct_type, PushData::UdRegsIndex + 3, spv::Decoration::Offset, 64U);
+    MemberDecorate(struct_type, PushData::BufOffsetIndex + 0, spv::Decoration::Offset, 80U);
+    MemberDecorate(struct_type, PushData::BufOffsetIndex + 1, spv::Decoration::Offset, 96U);
+    MemberDecorate(struct_type, PushData::BufOffsetIndex + 2, spv::Decoration::Offset, 112U);
     push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant);
     Name(push_data_block, "push_data");
     interfaces.push_back(push_data_block);
@@ -760,19 +737,19 @@ EmitContext::BufferSpv EmitContext::DefineBuffer(bool is_storage, bool is_writte
         Decorate(id, spv::Decoration::NonWritable);
     }
     switch (buffer_type) {
-    case Shader::BufferType::GdsBuffer:
+    case BufferType::GdsBuffer:
         Name(id, "gds_buffer");
         break;
-    case Shader::BufferType::Flatbuf:
+    case BufferType::Flatbuf:
         Name(id, "srt_flatbuf");
         break;
-    case Shader::BufferType::BdaPagetable:
+    case BufferType::BdaPagetable:
         Name(id, "bda_pagetable");
         break;
-    case Shader::BufferType::FaultBuffer:
+    case BufferType::FaultBuffer:
         Name(id, "fault_buffer");
         break;
-    case Shader::BufferType::SharedMemory:
+    case BufferType::SharedMemory:
         Name(id, "ssbo_shmem");
         break;
     default:
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
index 28e9099d8..186925706 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h
@@ -361,7 +361,6 @@ public:
         u32 num_components;
         bool is_integer{};
         bool is_loaded{};
-        s32 buffer_handle{-1};
     };
     Id input_attr_array;
     Id output_attr_array;
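Dropping sr0/sr1 shifts every AuxData member down by 8 bytes and appends a uvec2 buf_offsets2. A hypothetical host-side mirror that checks the new Offset decorations (member names taken from the MemberName calls above; the struct itself is not part of the patch):

    #include <cstddef>
    #include <cstdint>

    struct AuxData {
        float xoffset;            // offset 0
        float yoffset;            // offset 4
        float xscale;             // offset 8
        float yscale;             // offset 12
        uint32_t ud_regs[4][4];   // offsets 16, 32, 48, 64
        uint32_t buf_offsets0[4]; // offset 80
        uint32_t buf_offsets1[4]; // offset 96
        uint32_t buf_offsets2[2]; // offset 112 (uvec2)
    };
    static_assert(offsetof(AuxData, ud_regs) == 16);
    static_assert(offsetof(AuxData, buf_offsets0) == 80);
    static_assert(offsetof(AuxData, buf_offsets2) == 112);
    static_assert(sizeof(AuxData) == 120);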
diff --git a/src/shader_recompiler/frontend/control_flow_graph.cpp b/src/shader_recompiler/frontend/control_flow_graph.cpp
index b53db9e94..805fdb108 100644
--- a/src/shader_recompiler/frontend/control_flow_graph.cpp
+++ b/src/shader_recompiler/frontend/control_flow_graph.cpp
@@ -188,14 +188,15 @@ void CFG::SplitDivergenceScopes() {
             const bool is_close = is_close_scope(inst);
             if ((is_close || index == blk->end_index) && curr_begin != -1) {
                 // If there are no instructions inside scope don't do anything.
-                if (index - curr_begin == 1) {
+                if (index - curr_begin == 1 && is_close) {
                     curr_begin = -1;
                     continue;
                 }
                 // If all instructions in the scope ignore exec masking, we shouldn't insert a
                 // scope.
                 const auto start = inst_list.begin() + curr_begin + 1;
-                if (!std::ranges::all_of(start, inst_list.begin() + index, IgnoresExecMask)) {
+                if (!std::ranges::all_of(start, inst_list.begin() + index + !is_close,
+                                         IgnoresExecMask)) {
                     // Determine the first instruction affected by the exec mask.
                     do {
                         ++curr_begin;
diff --git a/src/shader_recompiler/frontend/fetch_shader.h b/src/shader_recompiler/frontend/fetch_shader.h
index 837caafa0..e77925232 100644
--- a/src/shader_recompiler/frontend/fetch_shader.h
+++ b/src/shader_recompiler/frontend/fetch_shader.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include <algorithm>
 #include "common/types.h"
 #include "shader_recompiler/info.h"
@@ -29,11 +28,6 @@ struct VertexAttribute {
         return static_cast<InstanceIdType>(instance_data);
     }
 
-    [[nodiscard]] bool UsesStepRates() const {
-        const auto step_rate = GetStepRate();
-        return step_rate == OverStepRate0 || step_rate == OverStepRate1;
-    }
-
     [[nodiscard]] constexpr AmdGpu::Buffer GetSharp(const Shader::Info& info) const noexcept {
         return info.ReadUdReg<AmdGpu::Buffer>(sgpr_base, dword_offset);
     }
@@ -52,12 +46,6 @@ struct FetchShaderData {
     s8 vertex_offset_sgpr = -1;   ///< SGPR of vertex offset from VADDR
     s8 instance_offset_sgpr = -1; ///< SGPR of instance offset from VADDR
 
-    [[nodiscard]] bool UsesStepRates() const {
-        return std::ranges::find_if(attributes, [](const VertexAttribute& attribute) {
-                   return attribute.UsesStepRates();
-               }) != attributes.end();
-    }
-
     bool operator==(const FetchShaderData& other) const {
         return attributes == other.attributes && vertex_offset_sgpr == other.vertex_offset_sgpr &&
                instance_offset_sgpr == other.instance_offset_sgpr;
diff --git a/src/shader_recompiler/frontend/format.cpp b/src/shader_recompiler/frontend/format.cpp
index 52c8c733e..6c4427e5f 100644
--- a/src/shader_recompiler/frontend/format.cpp
+++ b/src/shader_recompiler/frontend/format.cpp
@@ -397,7 +397,7 @@ constexpr std::array InstructionFormatSOPP = {{
     // 17 = S_SENDMSGHALT
     {InstClass::ScalarProgFlow, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any},
     // 18 = S_TRAP
-    {InstClass::Undefined, InstCategory::Undefined, 0, 1, ScalarType::Any, ScalarType::Any},
+    {InstClass::Undefined, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any},
     // 19 = S_ICACHE_INV
     {InstClass::ScalarCache, InstCategory::FlowControl, 0, 1, ScalarType::Any, ScalarType::Any},
     // 20 = S_INCPERFLEVEL
diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index 8ead93f78..634486fc4 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -3,7 +3,6 @@
 
 #include "shader_recompiler/frontend/translate/translate.h"
 #include "shader_recompiler/ir/reg.h"
-#include "shader_recompiler/profile.h"
 #include "shader_recompiler/runtime_info.h"
 
 namespace Shader::Gcn {
@@ -12,29 +11,29 @@ void Translator::EmitDataShare(const GcnInst& inst) {
     switch (inst.opcode) {
         // DS
     case Opcode::DS_ADD_U32:
-        return DS_ADD_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Add, false);
     case Opcode::DS_ADD_U64:
-        return DS_ADD_U64(inst, false);
+        return DS_OP<IR::U64>(inst, AtomicOp::Add, false);
     case Opcode::DS_SUB_U32:
-        return DS_SUB_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Sub, false);
     case Opcode::DS_INC_U32:
-        return DS_INC_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Inc, false);
     case Opcode::DS_DEC_U32:
-        return DS_DEC_U32(inst, false);
+        return DS_OP(inst, AtomicOp::Dec, false);
     case Opcode::DS_MIN_I32:
-        return DS_MIN_U32(inst, true, false);
+        return DS_OP(inst, AtomicOp::Smin, false);
     case Opcode::DS_MAX_I32:
-        return DS_MAX_U32(inst, true, false);
+        return DS_OP(inst, AtomicOp::Smax, false);
     case Opcode::DS_MIN_U32:
-        return DS_MIN_U32(inst, false, false);
+        return DS_OP(inst, AtomicOp::Umin, false);
     case Opcode::DS_MAX_U32:
-        return DS_MAX_U32(inst, false, false);
+        return DS_OP(inst, AtomicOp::Umax, false);
     case Opcode::DS_AND_B32:
-        return DS_AND_B32(inst, false);
+        return DS_OP(inst, AtomicOp::And, false);
     case Opcode::DS_OR_B32:
-        return DS_OR_B32(inst, false);
+        return DS_OP(inst, AtomicOp::Or, false);
     case Opcode::DS_XOR_B32:
-        return DS_XOR_B32(inst, false);
+        return DS_OP(inst, AtomicOp::Xor, false);
     case Opcode::DS_WRITE_B32:
         return DS_WRITE(32, false, false, false, inst);
     case Opcode::DS_WRITE2_B32:
@@ -42,19 +41,19 @@ void Translator::EmitDataShare(const GcnInst& inst) {
     case Opcode::DS_WRITE2ST64_B32:
         return DS_WRITE(32, false, true, true, inst);
     case Opcode::DS_ADD_RTN_U32:
-        return DS_ADD_U32(inst, true);
+        return DS_OP(inst, AtomicOp::Add, true);
     case Opcode::DS_SUB_RTN_U32:
-        return DS_SUB_U32(inst, true);
+        return DS_OP(inst, AtomicOp::Sub, true);
     case Opcode::DS_MIN_RTN_U32:
-        return DS_MIN_U32(inst, false, true);
+        return DS_OP(inst, AtomicOp::Umin, true);
     case Opcode::DS_MAX_RTN_U32:
-        return DS_MAX_U32(inst, false, true);
+        return DS_OP(inst, AtomicOp::Umax, true);
     case Opcode::DS_AND_RTN_B32:
-        return DS_AND_B32(inst, true);
+        return DS_OP(inst, AtomicOp::And, true);
     case Opcode::DS_OR_RTN_B32:
-        return DS_OR_B32(inst, true);
+        return DS_OP(inst, AtomicOp::Or, true);
     case Opcode::DS_XOR_RTN_B32:
-        return DS_XOR_B32(inst, true);
+        return DS_OP(inst, AtomicOp::Xor, true);
     case Opcode::DS_SWIZZLE_B32:
         return DS_SWIZZLE_B32(inst);
     case Opcode::DS_READ_B32:
@@ -117,92 +116,63 @@ void Translator::V_WRITELANE_B32(const GcnInst& inst) {
 
 // DS
 
-void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) {
+template <typename T>
+void Translator::DS_OP(const GcnInst& inst, AtomicOp op, bool rtn) {
+    const bool is_gds = inst.control.ds.gds;
     const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
+    const T data = [&] {
+        if (op == AtomicOp::Inc || op == AtomicOp::Dec) {
+            return T{};
+        }
+        if constexpr (std::is_same_v<T, IR::U32>) {
+            return GetSrc(inst.src[1]);
+        } else {
+            return GetSrc64(inst.src[1]);
+        }
+    }();
     const IR::U32 offset =
         ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
     const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data);
+    const T original_val = [&] -> T {
+        switch (op) {
+        case AtomicOp::Add:
+            return ir.SharedAtomicIAdd(addr_offset, data, is_gds);
+        case AtomicOp::Umin:
+            return ir.SharedAtomicIMin(addr_offset, data, false, is_gds);
+        case AtomicOp::Smin:
+            return ir.SharedAtomicIMin(addr_offset, data, true, is_gds);
+        case AtomicOp::Umax:
+            return ir.SharedAtomicIMax(addr_offset, data, false, is_gds);
+        case AtomicOp::Smax:
+            return ir.SharedAtomicIMax(addr_offset, data, true, is_gds);
+        case AtomicOp::And:
+            return ir.SharedAtomicAnd(addr_offset, data, is_gds);
+        case AtomicOp::Or:
+            return ir.SharedAtomicOr(addr_offset, data, is_gds);
+        case AtomicOp::Xor:
+            return ir.SharedAtomicXor(addr_offset, data, is_gds);
+        case AtomicOp::Sub:
+            return ir.SharedAtomicISub(addr_offset, data, is_gds);
+        case AtomicOp::Inc:
+            return ir.SharedAtomicInc(addr_offset, is_gds);
+        case AtomicOp::Dec:
+            return ir.SharedAtomicDec(addr_offset, is_gds);
+        default:
+            UNREACHABLE();
+        }
+    }();
     if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_ADD_U64(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U64 data{GetSrc64(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data);
-    if (rtn) {
-        SetDst64(inst.dst[0], IR::U64{original_val});
-    }
-}
-
-void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, is_signed);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, is_signed);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_AND_B32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicAnd(addr_offset, data);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_OR_B32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicOr(addr_offset, data);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
-    }
-}
-
-void Translator::DS_XOR_B32(const GcnInst& inst, bool rtn) {
-    const IR::U32 addr{GetSrc(inst.src[0])};
-    const IR::U32 data{GetSrc(inst.src[1])};
-    const IR::U32 offset =
-        ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0));
-    const IR::U32 addr_offset = ir.IAdd(addr, offset);
-    const IR::Value original_val = ir.SharedAtomicXor(addr_offset, data);
-    if (rtn) {
-        SetDst(inst.dst[0], IR::U32{original_val});
+        if constexpr (std::is_same_v<T, IR::U32>) {
+            SetDst(inst.dst[0], original_val);
+        } else {
+            SetDst64(inst.dst[0], original_val);
+        }
     }
 }
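One behavioral detail DS_OP preserves from the removed helpers: LDS atomics always produce the pre-op memory value, and only the *_RTN opcodes write it back to the destination VGPR. A scalar model (illustrative only, not emulator code):

    #include <cstdint>
    #include <cstdio>

    static uint32_t lds[64];

    // Models DS_ADD_U32 / DS_ADD_RTN_U32 on one lane.
    uint32_t DsAddU32(uint32_t addr_offset, uint32_t data, bool rtn, uint32_t& dst_vgpr) {
        const uint32_t original_val = lds[addr_offset / 4];
        lds[addr_offset / 4] = original_val + data;
        if (rtn) {
            dst_vgpr = original_val; // RTN forms latch the old value
        }
        return original_val;
    }

    int main() {
        uint32_t vdst = 0;
        DsAddU32(0, 5, true, vdst);
        DsAddU32(0, 5, true, vdst);
        std::printf("%u\n", vdst); // 5: the value held before the second add
    }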
 
 void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64,
                           const GcnInst& inst) {
+    const bool is_gds = inst.control.ds.gds;
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     const IR::VectorReg data0{inst.src[1].code};
     const IR::VectorReg data1{inst.src[2].code};
@@ -220,33 +190,85 @@
             ir.WriteShared(64,
                            ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0),
                                                                  ir.GetVectorReg(data0 + 1))),
-                           addr0);
+                           addr0, is_gds);
         } else if (bit_size == 32) {
-            ir.WriteShared(32, ir.GetVectorReg(data0), addr0);
+            ir.WriteShared(32, ir.GetVectorReg(data0), addr0, is_gds);
         } else if (bit_size == 16) {
-            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0);
+            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds);
         }
         const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
         if (bit_size == 64) {
             ir.WriteShared(64,
                            ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1),
                                                                  ir.GetVectorReg(data1 + 1))),
-                           addr1);
+                           addr1, is_gds);
         } else if (bit_size == 32) {
-            ir.WriteShared(32, ir.GetVectorReg(data1), addr1);
+            ir.WriteShared(32, ir.GetVectorReg(data1), addr1, is_gds);
         } else if (bit_size == 16) {
-            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1);
+            ir.WriteShared(16, ir.UConvert(16, ir.GetVectorReg(data1)), addr1, is_gds);
         }
     } else {
         const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
         if (bit_size == 64) {
             const IR::Value data =
                 ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1));
-            ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0);
+            ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0, is_gds);
         } else if (bit_size == 32) {
-            ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
+            ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0, is_gds);
         } else if (bit_size == 16) {
-            ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0);
+            ir.WriteShared(bit_size, ir.UConvert(16, ir.GetVectorReg(data0)), addr0, is_gds);
+        }
+    }
+}
+
+void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64,
+                         const GcnInst& inst) {
+    const bool is_gds = inst.control.ds.gds;
+    const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
+    IR::VectorReg dst_reg{inst.dst[0].code};
+    const u32 offset = (inst.control.ds.offset1 << 8u) + inst.control.ds.offset0;
+    if (info.stage == Stage::Fragment) {
+        ASSERT_MSG(!is_pair && bit_size == 32 && offset % 256 == 0,
+                   "Unexpected shared memory offset alignment: {}", offset);
+        ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset)));
+        return;
+    }
+    if (is_pair) {
+        // Pair loads are either 32 or 64-bit
+        const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1);
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj)));
+        const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0, is_gds);
+        if (bit_size == 64) {
+            const auto vector = ir.UnpackUint2x32(IR::U64{data0});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg++, IR::U32{data0});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})});
+        }
+        const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj)));
+        const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1, is_gds);
+        if (bit_size == 64) {
+            const auto vector = ir.UnpackUint2x32(IR::U64{data1});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg++, IR::U32{data1});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})});
+        }
+    } else {
+        const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset));
+        const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0, is_gds);
+        if (bit_size == 64) {
+            const auto vector = ir.UnpackUint2x32(IR::U64{data});
+            ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)});
+            ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)});
+        } else if (bit_size == 32) {
+            ir.SetVectorReg(dst_reg, IR::U32{data});
+        } else if (bit_size == 16) {
+            ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})});
         }
     }
 }
ir.SetVectorReg(dst_reg, ir.GetVectorReg(GetScratchVgpr(offset))); - return; - } - if (is_pair) { - // Pair loads are either 32 or 64-bit - const u32 adj = (bit_size == 32 ? 4 : 8) * (stride64 ? 64 : 1); - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset0 * adj))); - const IR::Value data0 = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data0}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data0}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data0})}); - } - const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); - const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data1}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg++, IR::U32{data1}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data1})}); - } - } else { - const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); - const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - if (bit_size == 64) { - const auto vector = ir.UnpackUint2x32(IR::U64{data}); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); - } else if (bit_size == 32) { - ir.SetVectorReg(dst_reg, IR::U32{data}); - } else if (bit_size == 16) { - ir.SetVectorReg(dst_reg++, IR::U32{ir.UConvert(32, IR::U16{data})}); - } - } -} - void Translator::DS_APPEND(const GcnInst& inst) { const u32 inst_offset = (u32(inst.control.ds.offset1) << 8u) + inst.control.ds.offset0; const IR::U32 gds_offset = ir.IAdd(ir.GetM0(), ir.Imm32(inst_offset)); diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 48f977f49..e3134c300 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -586,6 +586,15 @@ void Translator::S_MOV(const GcnInst& inst) { } void Translator::S_MOV_B64(const GcnInst& inst) { + // Moving SGPR to SGPR is used for thread masks, like most operations, but it can also be used + // for moving sharps. 
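+ // Sharps span consecutive SGPRs and are moved in 64-bit pairs, e.g.
+ // s_mov_b64 s[10:11], s[4:5]; the ScalarGPR path below mirrors the copy
+ // onto both 32-bit halves of the destination pair.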
+ if (inst.dst[0].field == OperandField::ScalarGPR && + inst.src[0].field == OperandField::ScalarGPR) { + ir.SetScalarReg(IR::ScalarReg(inst.dst[0].code), + ir.GetScalarReg(IR::ScalarReg(inst.src[0].code))); + ir.SetScalarReg(IR::ScalarReg(inst.dst[0].code + 1), + ir.GetScalarReg(IR::ScalarReg(inst.src[0].code + 1))); + } const IR::U1 src = [&] { switch (inst.src[0].field) { case OperandField::VccLo: @@ -671,8 +680,9 @@ void Translator::S_FF1_I32_B32(const GcnInst& inst) { } void Translator::S_FF1_I32_B64(const GcnInst& inst) { - const IR::U64 src0{GetSrc64(inst.src[0])}; - const IR::U32 result{ir.FindILsb(src0)}; + ASSERT(inst.src[0].field == OperandField::ScalarGPR); + const IR::U32 result{ + ir.BallotFindLsb(ir.Ballot(ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code))))}; SetDst(inst.dst[0], result); } diff --git a/src/shader_recompiler/frontend/translate/scalar_flow.cpp b/src/shader_recompiler/frontend/translate/scalar_flow.cpp index cd1cf51f0..7b57d89ca 100644 --- a/src/shader_recompiler/frontend/translate/scalar_flow.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_flow.cpp @@ -16,6 +16,9 @@ void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) { case Opcode::S_SETPRIO: LOG_WARNING(Render_Vulkan, "S_SETPRIO instruction!"); return; + case Opcode::S_TRAP: + LOG_WARNING(Render_Vulkan, "S_TRAP instruction!"); + return; case Opcode::S_GETPC_B64: return S_GETPC_B64(pc, inst); case Opcode::S_SETPC_B64: diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 5853f3e72..310ac9156 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -90,17 +90,40 @@ void Translator::EmitPrologue(IR::Block* first_block) { case LogicalStage::Vertex: // v0: vertex ID, always present ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::VertexId)); - // v1: instance ID, step rate 0 - if (runtime_info.num_input_vgprs > 0) { - ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId0)); - } - // v2: instance ID, step rate 1 - if (runtime_info.num_input_vgprs > 1) { - ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId1)); - } - // v3: instance ID, plain - if (runtime_info.num_input_vgprs > 2) { - ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId)); + if (info.stage == Stage::Local) { + // v1: rel patch ID + if (runtime_info.num_input_vgprs > 0) { + ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); + } + // v2: instance ID + if (runtime_info.num_input_vgprs > 1) { + ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId)); + } + } else { + // v1: instance ID, step rate 0 + if (runtime_info.num_input_vgprs > 0) { + if (runtime_info.vs_info.step_rate_0 != 0) { + ir.SetVectorReg(dst_vreg++, + ir.IDiv(ir.GetAttributeU32(IR::Attribute::InstanceId), + ir.Imm32(runtime_info.vs_info.step_rate_0))); + } else { + ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); + } + } + // v2: instance ID, step rate 1 + if (runtime_info.num_input_vgprs > 1) { + if (runtime_info.vs_info.step_rate_1 != 0) { + ir.SetVectorReg(dst_vreg++, + ir.IDiv(ir.GetAttributeU32(IR::Attribute::InstanceId), + ir.Imm32(runtime_info.vs_info.step_rate_1))); + } else { + ir.SetVectorReg(dst_vreg++, ir.Imm32(0)); + } + } + // v3: instance ID, plain + if (runtime_info.num_input_vgprs > 2) { + ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId)); + } } break; case LogicalStage::Fragment: @@ -183,10 
+206,8 @@ void Translator::EmitPrologue(IR::Block* first_block) { switch (runtime_info.gs_info.out_primitive[0]) { case AmdGpu::GsOutputPrimitiveType::TriangleStrip: ir.SetVectorReg(IR::VectorReg::V3, ir.Imm32(2u)); // vertex 2 - [[fallthrough]]; case AmdGpu::GsOutputPrimitiveType::LineStrip: ir.SetVectorReg(IR::VectorReg::V1, ir.Imm32(1u)); // vertex 1 - [[fallthrough]]; default: ir.SetVectorReg(IR::VectorReg::V0, ir.Imm32(0u)); // vertex 0 break; @@ -481,11 +502,11 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra } void Translator::EmitFetch(const GcnInst& inst) { - // Read the pointer to the fetch shader assembly. const auto code_sgpr_base = inst.src[0].code; + + // The fetch shader must be inlined to access as regular buffers, so that + // bounds checks can be emitted to emulate robust buffer access. if (!profile.supports_robust_buffer_access) { - // The fetch shader must be inlined to access as regular buffers, so that - // bounds checks can be emitted to emulate robust buffer access. const auto* code = GetFetchShaderCode(info, code_sgpr_base); GcnCodeSlice slice(code, code + std::numeric_limits<u32>::max()); GcnDecodeContext decoder; @@ -535,16 +556,6 @@ void Translator::EmitFetch(const GcnInst& inst) { for (u32 i = 0; i < 4; i++) { ir.SetVectorReg(dst_reg++, IR::F32{ir.CompositeExtract(swizzled, i)}); } - - // In case of programmable step rates we need to fallback to instance data pulling in - // shader, so VBs should be bound as regular data buffers - if (attrib.UsesStepRates()) { - info.buffers.push_back({ - .sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4), - .used_types = IR::Type::F32, - .instance_attrib = attrib.semantic, - }); - } } } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index b5bfec344..4b5ff827b 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -270,21 +270,13 @@ public: // Data share // DS - void DS_ADD_U32(const GcnInst& inst, bool rtn); - void DS_ADD_U64(const GcnInst& inst, bool rtn); - void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn); - void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn); + template <typename T = IR::U32> + void DS_OP(const GcnInst& inst, AtomicOp op, bool rtn); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); - void DS_SWIZZLE_B32(const GcnInst& inst); - void DS_AND_B32(const GcnInst& inst, bool rtn); - void DS_OR_B32(const GcnInst& inst, bool rtn); - void DS_XOR_B32(const GcnInst& inst, bool rtn); void DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); + void DS_SWIZZLE_B32(const GcnInst& inst); void DS_APPEND(const GcnInst& inst); void DS_CONSUME(const GcnInst& inst); - void DS_SUB_U32(const GcnInst& inst, bool rtn); - void DS_INC_U32(const GcnInst& inst, bool rtn); - void DS_DEC_U32(const GcnInst& inst, bool rtn); // Buffer Memory // MUBUF / MTBUF diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 54f1088f2..017c77fb0 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -565,7 +565,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { } // v_mbcnt_hi_u32_b32 vX, exec_hi, 0/vZ if ((inst.src[0].field == OperandField::ExecHi || - inst.src[0].field == 
OperandField::VccHi) && + inst.src[0].field == OperandField::VccHi || + inst.src[0].field == OperandField::ScalarGPR) && (inst.src[1].field == OperandField::ConstZero || inst.src[1].field == OperandField::VectorGPR)) { return SetDst(inst.dst[0], GetSrc(inst.src[1])); @@ -579,7 +580,8 @@ void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) { } // v_mbcnt_lo_u32_b32 vY, exec_lo, vX // used combined with above for append buffer indexing. - if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo) { + if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo || + inst.src[0].field == OperandField::ScalarGPR) { return SetDst(inst.dst[0], GetSrc(inst.src[1])); } UNREACHABLE(); @@ -623,12 +625,15 @@ void Translator::V_ADDC_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 carry{GetCarryIn(inst)}; - const IR::U32 result{ir.IAdd(ir.IAdd(src0, src1), carry)}; - SetDst(inst.dst[0], result); + const IR::Value tmp1{ir.IAddCarry(src0, src1)}; + const IR::U32 result1{ir.CompositeExtract(tmp1, 0)}; + const IR::U32 carry_out1{ir.CompositeExtract(tmp1, 1)}; + const IR::Value tmp2{ir.IAddCarry(result1, carry)}; + const IR::U32 result2{ir.CompositeExtract(tmp2, 0)}; + const IR::U32 carry_out2{ir.CompositeExtract(tmp2, 1)}; + SetDst(inst.dst[0], result2); - const IR::U1 less_src0{ir.ILessThan(result, src0, false)}; - const IR::U1 less_src1{ir.ILessThan(result, src1, false)}; - const IR::U1 did_overflow{ir.LogicalOr(less_src0, less_src1)}; + const IR::U1 did_overflow{ir.INotEqual(ir.BitwiseOr(carry_out1, carry_out2), ir.Imm32(0))}; SetCarryOut(inst, did_overflow); } diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 91f545cfd..df20f7f73 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -192,9 +192,10 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ const IR::VectorReg vaddr{inst.src[0].code}; const IR::ScalarReg sharp{inst.src[2].code * 4}; const IR::Value soffset{GetSrc(inst.src[3])}; + const bool has_soffset = !soffset.IsImmediate() || soffset.U32() != 0; if (info.stage != Stage::Geometry) { - ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, - "Non immediate offset not supported"); + ASSERT_MSG(!has_soffset || !mubuf.offen, + "Having both scalar and vector offsets is not supported"); } const IR::Value address = [&] -> IR::Value { @@ -204,15 +205,21 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_inst_typed, bool is_buffer_ if (mubuf.idxen && mubuf.offen) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1)); } + if (mubuf.idxen && has_soffset) { + return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset); + } if (mubuf.idxen || mubuf.offen) { return ir.GetVectorReg(vaddr); } + if (has_soffset) { + return soffset; + } return {}; }(); IR::BufferInstInfo buffer_info{}; buffer_info.index_enable.Assign(mubuf.idxen); - buffer_info.offset_enable.Assign(mubuf.offen); + buffer_info.offset_enable.Assign(mubuf.offen || has_soffset); buffer_info.inst_offset.Assign(mubuf.offset); buffer_info.globally_coherent.Assign(mubuf.glc); buffer_info.system_coherent.Assign(mubuf.slc); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 72977b711..6e12c6816 100644 --- a/src/shader_recompiler/info.h +++ 
b/src/shader_recompiler/info.h @@ -25,7 +25,7 @@ namespace Shader { static constexpr size_t NumUserDataRegs = 16; static constexpr size_t NumImages = 64; -static constexpr size_t NumBuffers = 32; +static constexpr size_t NumBuffers = 40; static constexpr size_t NumSamplers = 16; static constexpr size_t NumFMasks = 8; @@ -113,17 +113,13 @@ struct FMaskResource { using FMaskResourceList = boost::container::small_vector<FMaskResource, NumFMasks>; struct PushData { - static constexpr u32 Step0Index = 0; - static constexpr u32 Step1Index = 1; - static constexpr u32 XOffsetIndex = 2; - static constexpr u32 YOffsetIndex = 3; - static constexpr u32 XScaleIndex = 4; - static constexpr u32 YScaleIndex = 5; - static constexpr u32 UdRegsIndex = 6; + static constexpr u32 XOffsetIndex = 0; + static constexpr u32 YOffsetIndex = 1; + static constexpr u32 XScaleIndex = 2; + static constexpr u32 YScaleIndex = 3; + static constexpr u32 UdRegsIndex = 4; static constexpr u32 BufOffsetIndex = UdRegsIndex + NumUserDataRegs / 4; - u32 step0; - u32 step1; float xoffset; float yoffset; float xscale; diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp index 6a267e21b..b2f11d141 100644 --- a/src/shader_recompiler/ir/attribute.cpp +++ b/src/shader_recompiler/ir/attribute.cpp @@ -100,22 +100,36 @@ std::string NameOf(Attribute attribute) { return "Param30"; case Attribute::Param31: return "Param31"; + case Attribute::ClipDistance: + return "ClipDistance"; + case Attribute::CullDistance: + return "CullDistance"; + case Attribute::RenderTargetId: + return "RenderTargetId"; + case Attribute::ViewportId: + return "ViewportId"; case Attribute::VertexId: return "VertexId"; - case Attribute::InstanceId: - return "InstanceId"; case Attribute::PrimitiveId: return "PrimitiveId"; - case Attribute::FragCoord: - return "FragCoord"; + case Attribute::InstanceId: + return "InstanceId"; case Attribute::IsFrontFace: return "IsFrontFace"; + case Attribute::SampleIndex: + return "SampleIndex"; + case Attribute::GlobalInvocationId: + return "GlobalInvocationId"; case Attribute::WorkgroupId: return "WorkgroupId"; + case Attribute::WorkgroupIndex: + return "WorkgroupIndex"; case Attribute::LocalInvocationId: return "LocalInvocationId"; case Attribute::LocalInvocationIndex: return "LocalInvocationIndex"; + case Attribute::FragCoord: + return "FragCoord"; case Attribute::InvocationId: return "InvocationId"; case Attribute::PatchVertices: diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h index 68472f052..b6b1c8b59 100644 --- a/src/shader_recompiler/ir/attribute.h +++ b/src/shader_recompiler/ir/attribute.h @@ -73,8 +73,6 @@ enum class Attribute : u64 { LocalInvocationId = 76, LocalInvocationIndex = 77, FragCoord = 78, - InstanceId0 = 79, // step rate 0 - InstanceId1 = 80, // step rate 1 InvocationId = 81, // TCS id in output patch and instanced geometry shader id PatchVertices = 82, TessellationEvaluationPointU = 83, diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 3d64cc5da..6ca86b2c0 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -255,8 +255,8 @@ void IREmitter::SetM0(const U32& value) { Inst(Opcode::SetM0, value); } -F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, IR::Value index) { - return Inst(Opcode::GetAttribute, attribute, Imm32(comp), index); +F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, u32 index) { + return Inst(Opcode::GetAttribute, 
attribute, Imm32(comp), Imm32(index)); } U32 IREmitter::GetAttributeU32(IR::Attribute attribute, u32 comp) { @@ -291,78 +291,137 @@ void IREmitter::SetPatch(Patch patch, const F32& value) { Inst(Opcode::SetPatch, patch, value); } -Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { +Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset, bool is_gds) { switch (bit_size) { case 16: - return Inst(Opcode::LoadSharedU16, offset); + return Inst(Opcode::LoadSharedU16, Flags{is_gds}, offset); case 32: - return Inst(Opcode::LoadSharedU32, offset); + return Inst(Opcode::LoadSharedU32, Flags{is_gds}, offset); case 64: - return Inst(Opcode::LoadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, Flags{is_gds}, offset); default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } } -void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) { +void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds) { switch (bit_size) { case 16: - Inst(Opcode::WriteSharedU16, offset, value); + Inst(Opcode::WriteSharedU16, Flags{is_gds}, offset, value); break; case 32: - Inst(Opcode::WriteSharedU32, offset, value); + Inst(Opcode::WriteSharedU32, Flags{is_gds}, offset, value); break; case 64: - Inst(Opcode::WriteSharedU64, offset, value); + Inst(Opcode::WriteSharedU64, Flags{is_gds}, offset, value); break; default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } } -U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) { +U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds) { switch (data.Type()) { case Type::U32: - return Inst(Opcode::SharedAtomicIAdd32, address, data); + return Inst(Opcode::SharedAtomicIAdd32, Flags{is_gds}, address, data); case Type::U64: - return Inst(Opcode::SharedAtomicIAdd64, address, data); + return Inst(Opcode::SharedAtomicIAdd64, Flags{is_gds}, address, data); default: ThrowInvalidType(data.Type()); } } -U32 IREmitter::SharedAtomicIMin(const U32& address, const U32& data, bool is_signed) { - return is_signed ? Inst(Opcode::SharedAtomicSMin32, address, data) - : Inst(Opcode::SharedAtomicUMin32, address, data); +U32U64 IREmitter::SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed, + bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SharedAtomicSMin32 : Opcode::SharedAtomicUMin32, + Flags{is_gds}, address, data); + case Type::U64: + return Inst(is_signed ? Opcode::SharedAtomicSMin64 : Opcode::SharedAtomicUMin64, + Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicIMax(const U32& address, const U32& data, bool is_signed) { - return is_signed ? Inst(Opcode::SharedAtomicSMax32, address, data) - : Inst(Opcode::SharedAtomicUMax32, address, data); +U32U64 IREmitter::SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed, + bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(is_signed ? Opcode::SharedAtomicSMax32 : Opcode::SharedAtomicUMax32, + Flags{is_gds}, address, data); + case Type::U64: + return Inst(is_signed ? 
Opcode::SharedAtomicSMax64 : Opcode::SharedAtomicUMax64, + Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicAnd(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicAnd32, address, data); +U32U64 IREmitter::SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicAnd32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicAnd64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicOr(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicOr32, address, data); +U32U64 IREmitter::SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicOr32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicOr64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicXor(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicXor32, address, data); +U32U64 IREmitter::SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicXor32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicXor64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicInc(const U32& address) { - return Inst(Opcode::SharedAtomicInc32, address); +U32U64 IREmitter::SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds) { + switch (data.Type()) { + case Type::U32: + return Inst(Opcode::SharedAtomicISub32, Flags{is_gds}, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicISub64, Flags{is_gds}, address, data); + default: + ThrowInvalidType(data.Type()); + } } -U32 IREmitter::SharedAtomicDec(const U32& address) { - return Inst(Opcode::SharedAtomicDec32, address); +template <> +U32 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicInc32, Flags{is_gds}, address); } -U32 IREmitter::SharedAtomicISub(const U32& address, const U32& data) { - return Inst(Opcode::SharedAtomicISub32, address, data); +template <> +U64 IREmitter::SharedAtomicInc(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicInc64, Flags{is_gds}, address); +} + +template <> +U32 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicDec32, Flags{is_gds}, address); +} + +template <> +U64 IREmitter::SharedAtomicDec(const U32& address, bool is_gds) { + return Inst(Opcode::SharedAtomicDec64, Flags{is_gds}, address); } U32 IREmitter::ReadConst(const Value& base, const U32& offset) { @@ -601,6 +660,14 @@ U32 IREmitter::WriteLane(const U32& value, const U32& write_value, const U32& la return Inst(Opcode::WriteLane, value, write_value, lane); } +Value IREmitter::Ballot(const U1& bit) { + return Inst(Opcode::Ballot, bit); +} + +U32 IREmitter::BallotFindLsb(const Value& mask) { + return Inst(Opcode::BallotFindLsb, mask); +} + F32F64 IREmitter::FPAdd(const F32F64& a, const F32F64& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); } @@ -1424,13 +1491,13 @@ U32U64 IREmitter::IAdd(const U32U64& a, const U32U64& b) { } } -Value IREmitter::IAddCary(const U32& a, const U32& b) { 
+Value IREmitter::IAddCarry(const U32& a, const U32& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); } switch (a.Type()) { case Type::U32: - return Inst(Opcode::IAddCary32, a, b); + return Inst(Opcode::IAddCarry32, a, b); default: ThrowInvalidType(a.Type()); } } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 119e3752e..a105b042d 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -81,8 +81,7 @@ public: [[nodiscard]] U1 Condition(IR::Condition cond); - [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, - IR::Value index = IR::Value(u32(0u))); + [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, u32 index = 0); [[nodiscard]] U32 GetAttributeU32(Attribute attribute, u32 comp = 0); void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0); @@ -96,18 +95,24 @@ public: [[nodiscard]] F32 GetPatch(Patch patch); void SetPatch(Patch patch, const F32& value); - [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); - void WriteShared(int bit_size, const Value& value, const U32& offset); + [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset, + bool is_gds = false); + void WriteShared(int bit_size, const Value& value, const U32& offset, bool is_gds = false); - [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data); - [[nodiscard]] U32 SharedAtomicISub(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed); - [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed); - [[nodiscard]] U32 SharedAtomicInc(const U32& address); - [[nodiscard]] U32 SharedAtomicDec(const U32& address); - [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicOr(const U32& address, const U32& data); - [[nodiscard]] U32 SharedAtomicXor(const U32& address, const U32& data); + [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicISub(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicIMin(const U32& address, const U32U64& data, bool is_signed, + bool is_gds); + [[nodiscard]] U32U64 SharedAtomicIMax(const U32& address, const U32U64& data, bool is_signed, + bool is_gds); + [[nodiscard]] U32U64 SharedAtomicAnd(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicOr(const U32& address, const U32U64& data, bool is_gds); + [[nodiscard]] U32U64 SharedAtomicXor(const U32& address, const U32U64& data, bool is_gds); + + template <typename T> + [[nodiscard]] T SharedAtomicInc(const U32& address, bool is_gds); + template <typename T> + [[nodiscard]] T SharedAtomicDec(const U32& address, bool is_gds); [[nodiscard]] U32 ReadConst(const Value& base, const U32& offset); [[nodiscard]] U32 ReadConstBuffer(const Value& handle, const U32& index); @@ -170,6 +175,8 @@ public: [[nodiscard]] U32 ReadFirstLane(const U32& value); [[nodiscard]] U32 ReadLane(const U32& value, const U32& lane); [[nodiscard]] U32 WriteLane(const U32& value, const U32& write_value, const U32& lane); + [[nodiscard]] Value Ballot(const U1& bit); + [[nodiscard]] U32 BallotFindLsb(const Value& mask); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const 
Value& e3); @@ -254,7 +261,7 @@ public: [[nodiscard]] F32F64 FPMedTri(const F32F64& a, const F32F64& b, const F32F64& c); [[nodiscard]] U32U64 IAdd(const U32U64& a, const U32U64& b); - [[nodiscard]] Value IAddCary(const U32& a, const U32& b); + [[nodiscard]] Value IAddCarry(const U32& a, const U32& b); [[nodiscard]] U32U64 ISub(const U32U64& a, const U32U64& b); [[nodiscard]] U32 IMulHi(const U32& a, const U32& b, bool is_signed = false); [[nodiscard]] U32U64 IMul(const U32U64& a, const U32U64& b); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 84bdb5739..eaab05cb7 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -92,7 +92,6 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::WriteSharedU32: case Opcode::WriteSharedU64: case Opcode::SharedAtomicIAdd32: - case Opcode::SharedAtomicIAdd64: case Opcode::SharedAtomicISub32: case Opcode::SharedAtomicSMin32: case Opcode::SharedAtomicUMin32: @@ -103,6 +102,17 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::SharedAtomicAnd32: case Opcode::SharedAtomicOr32: case Opcode::SharedAtomicXor32: + case Opcode::SharedAtomicIAdd64: + case Opcode::SharedAtomicISub64: + case Opcode::SharedAtomicSMin64: + case Opcode::SharedAtomicUMin64: + case Opcode::SharedAtomicSMax64: + case Opcode::SharedAtomicUMax64: + case Opcode::SharedAtomicInc64: + case Opcode::SharedAtomicDec64: + case Opcode::SharedAtomicAnd64: + case Opcode::SharedAtomicOr64: + case Opcode::SharedAtomicXor64: case Opcode::ImageWrite: case Opcode::ImageAtomicIAdd32: case Opcode::ImageAtomicSMin32: diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 008f44659..747a27e35 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -41,15 +41,25 @@ OPCODE(WriteSharedU64, Void, U32, OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) OPCODE(SharedAtomicISub32, U32, U32, U32, ) +OPCODE(SharedAtomicISub64, U64, U32, U64, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) +OPCODE(SharedAtomicSMin64, U64, U32, U64, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) +OPCODE(SharedAtomicUMin64, U64, U32, U64, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) +OPCODE(SharedAtomicSMax64, U64, U32, U64, ) OPCODE(SharedAtomicUMax32, U32, U32, U32, ) +OPCODE(SharedAtomicUMax64, U64, U32, U64, ) OPCODE(SharedAtomicInc32, U32, U32, ) +OPCODE(SharedAtomicInc64, U64, U32, ) OPCODE(SharedAtomicDec32, U32, U32, ) +OPCODE(SharedAtomicDec64, U64, U32, ) OPCODE(SharedAtomicAnd32, U32, U32, U32, ) +OPCODE(SharedAtomicAnd64, U64, U32, U64, ) OPCODE(SharedAtomicOr32, U32, U32, U32, ) +OPCODE(SharedAtomicOr64, U64, U32, U64, ) OPCODE(SharedAtomicXor32, U32, U32, U32, ) +OPCODE(SharedAtomicXor64, U64, U32, U64, ) // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) @@ -328,7 +338,7 @@ OPCODE(FPCmpClass32, U1, F32, // Integer operations OPCODE(IAdd32, U32, U32, U32, ) OPCODE(IAdd64, U64, U64, U64, ) -OPCODE(IAddCary32, U32x2, U32, U32, ) +OPCODE(IAddCarry32, U32x2, U32, U32, ) OPCODE(ISub32, U32, U32, U32, ) OPCODE(ISub64, U64, U64, U64, ) OPCODE(IMul32, U32, U32, U32, ) @@ -462,5 +472,7 @@ OPCODE(QuadShuffle, U32, U32, OPCODE(ReadFirstLane, U32, U32, ) OPCODE(ReadLane, U32, U32, U32 ) OPCODE(WriteLane, U32, U32, U32, U32 ) +OPCODE(Ballot, U32x4, U1, ) +OPCODE(BallotFindLsb, U32, U32x4, ) OPCODE(DataAppend, U32, U32, U32 ) OPCODE(DataConsume, U32, U32, U32 ) 
diff --git a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp index 7253e18c1..e0c99655d 100644 --- a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp +++ b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp @@ -191,7 +191,7 @@ static void VisitPointer(u32 off_dw, IR::Inst* subtree, PassInfo& pass_info, static void GenerateSrtProgram(Info& info, PassInfo& pass_info) { Xbyak::CodeGenerator& c = g_srt_codegen; - if (info.srt_info.srt_reservations.empty() && pass_info.srt_roots.empty()) { + if (pass_info.srt_roots.empty()) { return; } @@ -205,29 +205,7 @@ static void GenerateSrtProgram(Info& info, PassInfo& pass_info) { } info.srt_info.walker_func = c.getCurr(); - pass_info.dst_off_dw = NumUserDataRegs; - - // Special case for V# step rate buffers in fetch shader - for (const auto [sgpr_base, dword_offset, num_dwords] : info.srt_info.srt_reservations) { - // get pointer to V# - if (sgpr_base != IR::NumScalarRegs) { - PushPtr(c, sgpr_base); - } - u32 src_off = dword_offset << 2; - - for (auto j = 0; j < num_dwords; j++) { - c.mov(r11d, ptr[rdi + src_off]); - c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r11d); - - src_off += 4; - ++pass_info.dst_off_dw; - } - if (sgpr_base != IR::NumScalarRegs) { - PopPtr(c); - } - } - ASSERT(pass_info.dst_off_dw == info.srt_info.flattened_bufsize_dw); for (const auto& [sgpr_base, root] : pass_info.srt_roots) { diff --git a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp index 9c5f64f84..d6586bda0 100644 --- a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp +++ b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp @@ -95,6 +95,10 @@ void ReadLaneEliminationPass(IR::Program& program) { if (inst.GetOpcode() != IR::Opcode::ReadLane) { continue; } + if (!inst.Arg(1).IsImmediate()) { + continue; + } + const u32 lane = inst.Arg(1).U32(); IR::Inst* prod = inst.Arg(0).InstRecursive(); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index f3972769c..e5a4beb8b 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -84,8 +84,42 @@ bool IsBufferInstruction(const IR::Inst& inst) { } bool IsDataRingInstruction(const IR::Inst& inst) { - return inst.GetOpcode() == IR::Opcode::DataAppend || - inst.GetOpcode() == IR::Opcode::DataConsume; + switch (inst.GetOpcode()) { + case IR::Opcode::DataAppend: + case IR::Opcode::DataConsume: + return true; + case IR::Opcode::LoadSharedU16: + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMin32: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr32: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicXor64: + case IR::Opcode::SharedAtomicISub32: + 
case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicInc32: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec32: + case IR::Opcode::SharedAtomicDec64: + return inst.Flags<bool>(); // is_gds + default: + return false; + } } IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) { @@ -507,7 +541,8 @@ void PatchImageSharp(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& } } -void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { +void PatchGlobalDataShareAccess(IR::Block& block, IR::Inst& inst, Info& info, + Descriptors& descriptors) { const u32 binding = descriptors.Add(BufferResource{ .used_types = IR::Type::U32, .inline_cbuf = AmdGpu::Buffer::Null(), @@ -515,37 +550,111 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto .is_written = true, }); - const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> { - if (inst->GetOpcode() == IR::Opcode::GetUserData) { - return inst; - } - return std::nullopt; - }; - - // Attempt to deduce the GDS address of counter at compile time. - u32 gds_addr = 0; - const IR::Value& gds_offset = inst.Arg(0); - if (gds_offset.IsImmediate()) { - // Nothing to do, offset is known. - gds_addr = gds_offset.U32() & 0xFFFF; - } else { - const auto result = IR::BreadthFirstSearch(&inst, pred); - ASSERT_MSG(result, "Unable to track M0 source"); - - // M0 must be set by some user data register. - const IR::Inst* prod = gds_offset.InstRecursive(); - const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg()); - u32 m0_val = info.user_data[ud_reg] >> 16; - if (prod->GetOpcode() == IR::Opcode::IAdd32) { - m0_val += prod->Arg(1).U32(); - } - gds_addr = m0_val & 0xFFFF; - } - - // Patch instruction. IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - inst.SetArg(0, ir.Imm32(gds_addr >> 2)); - inst.SetArg(1, ir.Imm32(binding)); + + // For data append/consume operations attempt to deduce the GDS address. + if (inst.GetOpcode() == IR::Opcode::DataAppend || inst.GetOpcode() == IR::Opcode::DataConsume) { + const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> { + if (inst->GetOpcode() == IR::Opcode::GetUserData) { + return inst; + } + return std::nullopt; + }; + + u32 gds_addr = 0; + const IR::Value& gds_offset = inst.Arg(0); + if (gds_offset.IsImmediate()) { + // Nothing to do, offset is known. + gds_addr = gds_offset.U32() & 0xFFFF; + } else { + const auto result = IR::BreadthFirstSearch(&inst, pred); + ASSERT_MSG(result, "Unable to track M0 source"); + + // M0 must be set by some user data register. + const IR::Inst* prod = gds_offset.InstRecursive(); + const u32 ud_reg = u32(result.value()->Arg(0).ScalarReg()); + u32 m0_val = info.user_data[ud_reg] >> 16; + if (prod->GetOpcode() == IR::Opcode::IAdd32) { + m0_val += prod->Arg(1).U32(); + } + gds_addr = m0_val & 0xFFFF; + } + + // Patch instruction. + inst.SetArg(0, ir.Imm32(gds_addr >> 2)); + inst.SetArg(1, ir.Imm32(binding)); + } else { + // Convert the shared memory opcode into an equivalent storage buffer access on the GDS buffer. 
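+ // The DS offset argument is in bytes, while the buffer accesses emitted
+ // below index elements; it is therefore shifted right by log2 of the
+ // element size: 1 for 16-bit, 2 for 32-bit and 3 for 64-bit accesses.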
+ const IR::U32 offset = IR::U32{inst.Arg(0)}; + const IR::U32 address_words = ir.ShiftRightLogical(offset, ir.Imm32(1)); + const IR::U32 address_dwords = ir.ShiftRightLogical(offset, ir.Imm32(2)); + const IR::U32 address_qwords = ir.ShiftRightLogical(offset, ir.Imm32(3)); + const IR::U32 handle = ir.Imm32(binding); + switch (inst.GetOpcode()) { + case IR::Opcode::SharedAtomicIAdd32: + inst.ReplaceUsesWith(ir.BufferAtomicIAdd(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicIAdd64: + inst.ReplaceUsesWith( + ir.BufferAtomicIAdd(handle, address_qwords, IR::U64{inst.Arg(1)}, {})); + break; + case IR::Opcode::SharedAtomicISub32: + inst.ReplaceUsesWith(ir.BufferAtomicISub(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicSMin32: + case IR::Opcode::SharedAtomicUMin32: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; + inst.ReplaceUsesWith( + ir.BufferAtomicIMin(handle, address_dwords, inst.Arg(1), is_signed, {})); + break; + } + case IR::Opcode::SharedAtomicSMax32: + case IR::Opcode::SharedAtomicUMax32: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; + inst.ReplaceUsesWith( + ir.BufferAtomicIMax(handle, address_dwords, inst.Arg(1), is_signed, {})); + break; + } + case IR::Opcode::SharedAtomicInc32: + inst.ReplaceUsesWith(ir.BufferAtomicInc(handle, address_dwords, {})); + break; + case IR::Opcode::SharedAtomicDec32: + inst.ReplaceUsesWith(ir.BufferAtomicDec(handle, address_dwords, {})); + break; + case IR::Opcode::SharedAtomicAnd32: + inst.ReplaceUsesWith(ir.BufferAtomicAnd(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicOr32: + inst.ReplaceUsesWith(ir.BufferAtomicOr(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::SharedAtomicXor32: + inst.ReplaceUsesWith(ir.BufferAtomicXor(handle, address_dwords, inst.Arg(1), {})); + break; + case IR::Opcode::LoadSharedU16: + inst.ReplaceUsesWith(ir.LoadBufferU16(handle, address_words, {})); + break; + case IR::Opcode::LoadSharedU32: + inst.ReplaceUsesWith(ir.LoadBufferU32(1, handle, address_dwords, {})); + break; + case IR::Opcode::LoadSharedU64: + inst.ReplaceUsesWith(ir.LoadBufferU64(handle, address_qwords, {})); + break; + case IR::Opcode::WriteSharedU16: + ir.StoreBufferU16(handle, address_words, IR::U16{inst.Arg(1)}, {}); + inst.Invalidate(); + break; + case IR::Opcode::WriteSharedU32: + ir.StoreBufferU32(1, handle, address_dwords, inst.Arg(1), {}); + inst.Invalidate(); + break; + case IR::Opcode::WriteSharedU64: + ir.StoreBufferU64(handle, address_qwords, IR::U64{inst.Arg(1)}, {}); + inst.Invalidate(); + break; + default: + UNREACHABLE(); + } + } } IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info, @@ -916,8 +1025,6 @@ void ResourceTrackingPass(IR::Program& program) { PatchBufferSharp(*block, inst, info, descriptors); } else if (IsImageInstruction(inst)) { PatchImageSharp(*block, inst, info, descriptors); - } else if (IsDataRingInstruction(inst)) { - PatchDataRingAccess(*block, inst, info, descriptors); } } } @@ -929,6 +1036,8 @@ void ResourceTrackingPass(IR::Program& program) { PatchBufferArgs(*block, inst, info); } else if (IsImageInstruction(inst)) { PatchImageArgs(*block, inst, info); + } else if (IsDataRingInstruction(inst)) { + PatchGlobalDataShareAccess(*block, inst, info, descriptors); } } } diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp 
index b292b41b9..e1e5d762c 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -33,12 +33,9 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim bool is_composite = opcode == IR::Opcode::WriteSharedU64; u32 num_components = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; - u32 offset = 0; - const auto* addr = inst.Arg(0).InstRecursive(); - if (addr->GetOpcode() == IR::Opcode::IAdd32) { - ASSERT(addr->Arg(1).IsImmediate()); - offset = addr->Arg(1).U32(); - } + ASSERT(inst.Arg(0).IsImmediate()); + + u32 offset = inst.Arg(0).U32(); IR::Value data = is_composite ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) : inst.Arg(1).Resolve(); for (s32 i = 0; i < num_components; i++) { @@ -116,7 +113,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim } const auto shl_inst = inst.Arg(1).TryInstRecursive(); - const auto vertex_id = ir.Imm32(shl_inst->Arg(0).Resolve().U32() >> 2); + const auto vertex_id = shl_inst->Arg(0).Resolve().U32() >> 2; const auto offset = inst.Arg(1).TryInstRecursive()->Arg(1); const auto bucket = offset.Resolve().U32() / 256u; const auto attrib = bucket < 4 ? IR::Attribute::Position0 diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index a87dceb0a..079827866 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -55,6 +55,16 @@ void Visit(Info& info, const IR::Inst& inst) { info.shared_types |= IR::Type::U32; break; case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: info.uses_shared_int64_atomics = true; [[fallthrough]]; case IR::Opcode::LoadSharedU64: diff --git a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp index 0f80a3b28..555fd505b 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_simplify_pass.cpp @@ -15,6 +15,16 @@ static bool Requires16BitSharedAtomic(const IR::Inst& inst) { static bool Requires64BitSharedAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: return true; default: return false; diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index a6900e180..b84011acc 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -17,7 +17,6 @@ static bool IsSharedAccess(const IR::Inst& inst) { case 
IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicIAdd32: - case IR::Opcode::SharedAtomicIAdd64: case IR::Opcode::SharedAtomicISub32: case IR::Opcode::SharedAtomicSMin32: case IR::Opcode::SharedAtomicUMin32: @@ -28,6 +27,17 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: return true; default: return false; @@ -64,6 +74,16 @@ IR::Type CalculateSharedMemoryTypes(IR::Program& program) { case IR::Opcode::LoadSharedU64: case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicIAdd64: + case IR::Opcode::SharedAtomicISub64: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: + case IR::Opcode::SharedAtomicInc64: + case IR::Opcode::SharedAtomicDec64: + case IR::Opcode::SharedAtomicAnd64: + case IR::Opcode::SharedAtomicOr64: + case IR::Opcode::SharedAtomicXor64: used_types |= IR::Type::U64; break; default: @@ -119,19 +139,26 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.BufferAtomicIAdd(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicISub32: + case IR::Opcode::SharedAtomicISub64: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicISub(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicSMin32: - case IR::Opcode::SharedAtomicUMin32: { - const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32; + case IR::Opcode::SharedAtomicUMin32: + case IR::Opcode::SharedAtomicSMin64: + case IR::Opcode::SharedAtomicUMin64: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMin32 || + inst.GetOpcode() == IR::Opcode::SharedAtomicSMin64; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMin(handle, address, inst.Arg(1), is_signed, {})); continue; } case IR::Opcode::SharedAtomicSMax32: - case IR::Opcode::SharedAtomicUMax32: { - const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32; + case IR::Opcode::SharedAtomicUMax32: + case IR::Opcode::SharedAtomicSMax64: + case IR::Opcode::SharedAtomicUMax64: { + const bool is_signed = inst.GetOpcode() == IR::Opcode::SharedAtomicSMax32 || + inst.GetOpcode() == IR::Opcode::SharedAtomicSMax64; inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIMax(handle, address, inst.Arg(1), is_signed, {})); continue; @@ -143,12 +170,15 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ inst.ReplaceUsesWithAndRemove(ir.BufferAtomicDec(handle, address, {})); continue; case IR::Opcode::SharedAtomicAnd32: + case IR::Opcode::SharedAtomicAnd64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicAnd(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicOr32: + case IR::Opcode::SharedAtomicOr64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicOr(handle, address, inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicXor32: + case IR::Opcode::SharedAtomicXor64: inst.ReplaceUsesWithAndRemove(ir.BufferAtomicXor(handle, address, inst.Arg(1), {})); 
continue; case IR::Opcode::LoadSharedU16: @@ -173,7 +203,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ inst.Invalidate(); break; default: - break; + UNREACHABLE(); } } } diff --git a/src/shader_recompiler/ir/passes/srt.h b/src/shader_recompiler/ir/passes/srt.h index 0ddc15ea6..4dce38674 100644 --- a/src/shader_recompiler/ir/passes/srt.h +++ b/src/shader_recompiler/ir/passes/srt.h @@ -20,18 +20,7 @@ struct PersistentSrtInfo { }; PFN_SrtWalker walker_func{}; - boost::container::small_vector srt_reservations; u32 flattened_bufsize_dw = 16; // NumUserDataRegs - - // Special case for fetch shaders because we don't generate IR to read from step rate buffers, - // so we won't see usage with GetUserData/ReadConst. - // Reserve space in the flattened buffer for a sharp ahead of time - u32 ReserveSharp(u32 sgpr_base, u32 dword_offset, u32 num_dwords) { - u32 rv = flattened_bufsize_dw; - srt_reservations.emplace_back(sgpr_base, dword_offset, num_dwords); - flattened_bufsize_dw += num_dwords; - return rv; - } }; -} // namespace Shader \ No newline at end of file +} // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 5a0408e2c..6cede44a8 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -42,7 +42,6 @@ constexpr u32 MaxStageTypes = static_cast<u32>(LogicalStage::NumLogicalStages); struct LocalRuntimeInfo { u32 ls_stride; - bool links_with_tcs; auto operator<=>(const LocalRuntimeInfo&) const noexcept = default; }; @@ -85,6 +84,8 @@ struct VertexRuntimeInfo { std::array outputs; bool emulate_depth_negative_one_to_one{}; bool clip_disable{}; + u32 step_rate_0; + u32 step_rate_1; // Domain AmdGpu::TessellationType tess_type; AmdGpu::TessellationTopology tess_topology; @@ -96,7 +97,8 @@ clip_disable == other.clip_disable && tess_type == other.tess_type && tess_topology == other.tess_topology && tess_partitioning == other.tess_partitioning && - hs_output_cp_stride == other.hs_output_cp_stride; + hs_output_cp_stride == other.hs_output_cp_stride && + step_rate_0 == other.step_rate_0 && step_rate_1 == other.step_rate_1; } void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) { diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index e40309aaf..d3e671c58 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -13,7 +13,7 @@ namespace Shader { struct VsAttribSpecialization { - s32 num_components{}; + u32 divisor{}; AmdGpu::NumberClass num_class{}; AmdGpu::CompMapping dst_select{}; @@ -74,13 +74,13 @@ struct SamplerSpecialization { * after the first compilation of a module. */ struct StageSpecialization { - static constexpr size_t MaxStageResources = 64; + static constexpr size_t MaxStageResources = 128; const Shader::Info* info; RuntimeInfo runtime_info; + std::bitset<MaxStageResources> bitset{}; std::optional<Gcn::FetchShaderData> fetch_shader_data{}; boost::container::small_vector vs_attribs; - std::bitset<MaxStageResources> bitset{}; boost::container::small_vector buffers; boost::container::small_vector images; boost::container::small_vector fmasks; @@ -94,10 +94,16 @@ if (info_.stage == Stage::Vertex && fetch_shader_data) { // Specialize shader on VS input number types to follow spec. ForEachSharp(vs_attribs, fetch_shader_data->attributes, - [&profile_](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { - spec.num_components = desc.UsesStepRates() - ? 
AmdGpu::NumComponents(sharp.GetDataFmt()) - : 0; + [&profile_, this](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { + using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType; + if (const auto step_rate = desc.GetStepRate(); + step_rate != InstanceIdType::None) { + spec.divisor = step_rate == InstanceIdType::OverStepRate0 + ? runtime_info.vs_info.step_rate_0 + : (step_rate == InstanceIdType::OverStepRate1 + ? runtime_info.vs_info.step_rate_1 + : 1); + } spec.num_class = profile_.support_legacy_vertex_attributes ? AmdGpu::NumberClass{} : AmdGpu::GetNumberClass(sharp.GetNumberFmt()); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 9b8c28b66..3e66fba6a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -135,9 +135,8 @@ void Liverpool::Process(std::stop_token stoken) { if (submit_done) { VideoCore::EndCapture(); if (rasterizer) { - rasterizer->ProcessFaults(); + rasterizer->EndCommandList(); rasterizer->Flush(); } submit_done = false; @@ -604,6 +603,8 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<const u32> ccb) { + if (event->event_index.Value() == EventIndex::ZpassDone) { + LOG_WARNING(Render, "Unimplemented occlusion query"); + } break; } diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 0613823ab..c517285fb 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -88,7 +88,7 @@ struct Liverpool { } }; - static const BinaryInfo& SearchBinaryInfo(const u32* code, size_t search_limit = 0x1000) { + static const BinaryInfo& SearchBinaryInfo(const u32* code, size_t search_limit = 0x2000) { constexpr u32 token_mov_vcchi = 0xBEEB03FF; if (code[0] == token_mov_vcchi) { @@ -304,6 +304,14 @@ struct Liverpool { } }; + struct LineControl { + u32 width_fixed_point; + + float Width() const { + return static_cast<float>(width_fixed_point) / 8.0f; + } + }; + struct ModeControl { s32 msaa_enable : 1; s32 vport_scissor_enable : 1; @@ -513,9 +521,16 @@ BitField<19, 1, ClipSpace> clip_space; BitField<21, 1, PrimKillCond> vtx_kill_or; BitField<22, 1, u32> dx_rasterization_kill; - BitField<23, 1, u32> dx_linear_attr_clip_enable; + BitField<24, 1, u32> dx_linear_attr_clip_enable; BitField<26, 1, u32> zclip_near_disable; - BitField<26, 1, u32> zclip_far_disable; + BitField<27, 1, u32> zclip_far_disable; + + bool ZclipEnable() const { + if (zclip_near_disable != zclip_far_disable) { + return false; + } + return !zclip_near_disable; + } }; enum class PolygonMode : u32 { @@ -738,12 +753,7 @@ u32 data_w; }; - struct BlendConstants { - float red; - float green; - float blue; - float alpha; - }; + using BlendConstants = std::array<float, 4>; union BlendControl { enum class BlendFactor : u32 { @@ -796,11 +806,29 @@ Err = 4u, FmaskDecompress = 5u, }; + enum class LogicOp : u32 { + Clear = 0x00, + Nor = 0x11, + AndInverted = 0x22, + CopyInverted = 0x33, + AndReverse = 0x44, + Invert = 0x55, + Xor = 0x66, + Nand = 0x77, + And = 0x88, + Equiv = 0x99, + Noop = 0xAA, + OrInverted = 0xBB, + Copy = 0xCC, + OrReverse = 0xDD, + Or = 0xEE, + Set = 0xFF, + }; BitField<0, 1, u32> disable_dual_quad; BitField<3, 1, u32> degamma_enable; BitField<4, 3, OperationMode> mode; - BitField<16, 8, u32> rop3; + BitField<16, 8, LogicOp> rop3; }; struct ColorBuffer { @@ -1369,7 +1397,9 @@ struct Liverpool { PolygonControl polygon_control; ViewportControl viewport_control; VsOutputControl vs_output_control; - INSERT_PADDING_WORDS(0xA287 - 
@@ -1369,7 +1397,9 @@ struct Liverpool {
        PolygonControl polygon_control;
        ViewportControl viewport_control;
        VsOutputControl vs_output_control;
-        INSERT_PADDING_WORDS(0xA287 - 0xA207 - 1);
+        INSERT_PADDING_WORDS(0xA287 - 0xA207 - 6);
+        LineControl line_control;
+        INSERT_PADDING_WORDS(4);
        HsTessFactorClamp hs_clamp;
        INSERT_PADDING_WORDS(0xA290 - 0xA287 - 2);
        GsMode vgt_gs_mode;
@@ -1695,6 +1725,7 @@ static_assert(GFX6_3D_REG_INDEX(color_control) == 0xA202);
static_assert(GFX6_3D_REG_INDEX(clipper_control) == 0xA204);
static_assert(GFX6_3D_REG_INDEX(viewport_control) == 0xA206);
static_assert(GFX6_3D_REG_INDEX(vs_output_control) == 0xA207);
+static_assert(GFX6_3D_REG_INDEX(line_control) == 0xA282);
static_assert(GFX6_3D_REG_INDEX(hs_clamp) == 0xA287);
static_assert(GFX6_3D_REG_INDEX(vgt_gs_mode) == 0xA290);
static_assert(GFX6_3D_REG_INDEX(mode_control) == 0xA292);
diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp
index 15bf0d81e..e85a6eb18 100644
--- a/src/video_core/buffer_cache/buffer.cpp
+++ b/src/video_core/buffer_cache/buffer.cpp
@@ -137,12 +137,15 @@ StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
                size_bytes);
}

-std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
+std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment, bool allow_wait) {
    if (!is_coherent && usage == MemoryUsage::Stream) {
        size = Common::AlignUp(size, instance->NonCoherentAtomSize());
    }

-    ASSERT(size <= this->size_bytes);
+    if (size > this->size_bytes) {
+        return {nullptr, 0};
+    }
+
    mapped_size = size;

    if (alignment > 0) {
@@ -162,8 +165,11 @@ std::pair<u8*, u64> StreamBuffer::Map(u64 size, u64 alignment) {
    }

    const u64 mapped_upper_bound = offset + size;
-    WaitPendingOperations(mapped_upper_bound);
-    return std::make_pair(mapped_data.data() + offset, offset);
+    if (!WaitPendingOperations(mapped_upper_bound, allow_wait)) {
+        return {nullptr, 0};
+    }
+
+    return {mapped_data.data() + offset, offset};
}

void StreamBuffer::Commit() {
@@ -177,6 +183,12 @@ void StreamBuffer::Commit() {
    }

    offset += mapped_size;
+    if (current_watch_cursor != 0 &&
+        current_watches[current_watch_cursor].tick == scheduler->CurrentTick()) {
+        current_watches[current_watch_cursor].upper_bound = offset;
+        return;
+    }
+
    if (current_watch_cursor + 1 >= current_watches.size()) {
        // Ensure that there are enough watches.
        ReserveWatches(current_watches, WATCHES_RESERVE_CHUNK);
@@ -191,16 +203,20 @@ void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
    watches.resize(watches.size() + grow_size);
}

-void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
+bool StreamBuffer::WaitPendingOperations(u64 requested_upper_bound, bool allow_wait) {
    if (!invalidation_mark) {
-        return;
+        return true;
    }
    while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
        auto& watch = previous_watches[wait_cursor];
-        wait_bound = watch.upper_bound;
+        if (!scheduler->IsFree(watch.tick) && !allow_wait) {
+            return false;
+        }
        scheduler->Wait(watch.tick);
+        wait_bound = watch.upper_bound;
        ++wait_cursor;
    }
+    return true;
}

} // namespace VideoCore
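The Commit() change above merges consecutive commits that land on the same scheduler tick into a single watch instead of appending one watch per commit. A reduced sketch of that bookkeeping (types simplified; not the emulator's exact structures):

    #include <cstdint>
    #include <vector>

    struct Watch {
        std::uint64_t tick{};
        std::uint64_t upper_bound{};
    };

    // Record that bytes up to 'offset' are in flight on 'tick'. When the last
    // watch already belongs to this tick, only its upper bound grows, keeping
    // the watch list short for bursty small commits.
    void RecordWatch(std::vector<Watch>& watches, std::uint64_t tick, std::uint64_t offset) {
        if (!watches.empty() && watches.back().tick == tick) {
            watches.back().upper_bound = offset;
            return;
        }
        watches.push_back({tick, offset});
    }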
diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h
index 530968787..a7a0ce84f 100644
--- a/src/video_core/buffer_cache/buffer.h
+++ b/src/video_core/buffer_cache/buffer.h
@@ -168,7 +168,7 @@ public:
                 MemoryUsage usage, u64 size_bytes_);

    /// Reserves a region of memory from the stream buffer.
-    std::pair<u8*, u64> Map(u64 size, u64 alignment = 0);
+    std::pair<u8*, u64> Map(u64 size, u64 alignment = 0, bool allow_wait = true);

    /// Ensures that reserved bytes of memory are available to the GPU.
    void Commit();

@@ -181,10 +181,6 @@ public:
        return offset;
    }

-    u64 GetFreeSize() const {
-        return size_bytes - offset - mapped_size;
-    }
-
private:
    struct Watch {
        u64 tick{};
@@ -195,7 +191,7 @@ private:
    void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size);

    /// Waits pending watches until requested upper bound.
-    void WaitPendingOperations(u64 requested_upper_bound);
+    bool WaitPendingOperations(u64 requested_upper_bound, bool allow_wait);

private:
    u64 offset{};
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index d55e05d1e..42e3c61a5 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -48,6 +48,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,

    memory_tracker = std::make_unique<MemoryTracker>(tracker);

+    std::memset(gds_buffer.mapped_data.data(), 0, DataShareBufferSize);
+
    // Ensure the first slot is used for the null buffer
    const auto null_id =
        slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, 16);
@@ -137,8 +139,7 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) {
        return;
    }
    memory_tracker->InvalidateRegion(
-        device_addr, size, Config::readbacks(),
-        [this, device_addr, size] { ReadMemory(device_addr, size, true); });
+        device_addr, size, [this, device_addr, size] { ReadMemory(device_addr, size, true); });
}

void BufferCache::ReadMemory(VAddr device_addr, u64 size, bool is_write) {
@@ -197,10 +198,13 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
}

void BufferCache::BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline) {
+    const auto& regs = liverpool->regs;
    Vulkan::VertexInputs<vk::VertexInputAttributeDescription2EXT> attributes;
    Vulkan::VertexInputs<vk::VertexInputBindingDescription2EXT> bindings;
+    Vulkan::VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT> divisors;
    Vulkan::VertexInputs<AmdGpu::Buffer> guest_buffers;
-    pipeline.GetVertexInputs(attributes, bindings, guest_buffers);
+    pipeline.GetVertexInputs(attributes, bindings, divisors, guest_buffers,
+                             regs.vgt_instance_step_rate_0, regs.vgt_instance_step_rate_1);

    if (instance.IsVertexInputDynamicState()) {
        // Update current vertex inputs.
@@ -313,7 +317,10 @@ void BufferCache::BindIndexBuffer(u32 index_offset) {
void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
    ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
    if (!is_gds) {
-        ASSERT(memory->TryWriteBacking(std::bit_cast<u8*>(address), value, num_bytes));
+        if (!memory->TryWriteBacking(std::bit_cast<u8*>(address), value, num_bytes)) {
+            std::memcpy(std::bit_cast<u8*>(address), value, num_bytes);
+            return;
+        }
        if (!IsRegionRegistered(address, num_bytes)) {
            return;
        }
@@ -817,22 +824,22 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
                                    bool is_texel_buffer) {
    boost::container::small_vector copies;
+    size_t total_size_bytes = 0;
    VAddr buffer_start = buffer.CpuAddr();
+    vk::Buffer src_buffer = VK_NULL_HANDLE;
    memory_tracker->ForEachUploadRange(
-        device_addr, size, is_written, [&](u64 device_addr_out, u64 range_size) {
-            const u64 offset = staging_buffer.Copy(device_addr_out, range_size);
-            copies.push_back(vk::BufferCopy{
-                .srcOffset = offset,
-                .dstOffset = device_addr_out - buffer_start,
-                .size = range_size,
-            });
-        });
+        device_addr, size, is_written,
+        [&](u64 device_addr_out, u64 range_size) {
+            copies.emplace_back(total_size_bytes, device_addr_out - buffer_start, range_size);
+            total_size_bytes += range_size;
+        },
+        [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); });
    SCOPE_EXIT {
        if (is_texel_buffer) {
            SynchronizeBufferFromImage(buffer, device_addr, size);
        }
    };
-    if (copies.empty()) {
+    if (!src_buffer) {
        return;
    }
    scheduler.EndRendering();
@@ -861,7 +868,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
        .bufferMemoryBarrierCount = 1,
        .pBufferMemoryBarriers = &pre_barrier,
    });
-    cmdbuf.copyBuffer(staging_buffer.Handle(), buffer.buffer, copies);
+    cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
        .bufferMemoryBarrierCount = 1,
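SynchronizeBuffer now sizes the staging allocation first and performs all host copies against a single mapped block, rather than mapping per range. The pattern, distilled into a standalone sketch (helper names are hypothetical):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct Copy {
        std::uint64_t src_offset, dst_offset, size;
    };

    // Phase 1 recorded copies with src offsets relative to the allocation start
    // and accumulated the total size. Phase 2, below, fills one mapped block and
    // rebases each src offset by the block's offset inside the staging buffer.
    void FillStaging(std::vector<Copy>& copies, const std::uint8_t* cpu_base,
                     std::uint8_t* staging, std::uint64_t staging_offset) {
        for (auto& copy : copies) {
            std::memcpy(staging + copy.src_offset, cpu_base + copy.dst_offset, copy.size);
            copy.src_offset += staging_offset; // final offset the GPU copy will read
        }
    }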
@@ -869,6 +876,39 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
    });
}

+vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span<vk::BufferCopy> copies,
+                                     size_t total_size_bytes) {
+    if (copies.empty()) {
+        return VK_NULL_HANDLE;
+    }
+    const auto [staging, offset] = staging_buffer.Map(total_size_bytes);
+    if (staging) {
+        for (auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+            // Apply the staging offset
+            copy.srcOffset += offset;
+        }
+        staging_buffer.Commit();
+        return staging_buffer.Handle();
+    } else {
+        // For large one time transfers use a temporary host buffer.
+        auto temp_buffer =
+            std::make_unique<Buffer>(instance, scheduler, MemoryUsage::Upload, 0,
+                                     vk::BufferUsageFlagBits::eTransferSrc, total_size_bytes);
+        const vk::Buffer src_buffer = temp_buffer->Handle();
+        u8* const staging = temp_buffer->mapped_data.data();
+        for (const auto& copy : copies) {
+            u8* const src_pointer = staging + copy.srcOffset;
+            const VAddr device_addr = buffer.CpuAddr() + copy.dstOffset;
+            std::memcpy(src_pointer, std::bit_cast<const u8*>(device_addr), copy.size);
+        }
+        scheduler.DeferOperation([buffer = std::move(temp_buffer)]() mutable { buffer.reset(); });
+        return src_buffer;
+    }
+}
+
bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) {
    boost::container::small_vector image_ids;
    texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) {
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 900a27aee..b509ce2d0 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -112,7 +112,7 @@ public:
    /// Invalidates any buffer in the logical page range.
    void InvalidateMemory(VAddr device_addr, u64 size);

-    /// Waits on pending downloads in the logical page range.
+    /// Flushes any GPU modified buffer in the logical page range back to CPU memory.
    void ReadMemory(VAddr device_addr, u64 size, bool is_write = false);

    /// Binds host vertex buffers for the current draw.
@@ -194,6 +194,9 @@ private:
    void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written,
                           bool is_texel_buffer);

+    vk::Buffer UploadCopies(Buffer& buffer, std::span<vk::BufferCopy> copies,
+                            size_t total_size_bytes);
+
    bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);

    void InlineDataBuffer(Buffer& buffer, VAddr address, const void* value, u32 num_bytes);
diff --git a/src/video_core/buffer_cache/memory_tracker.h b/src/video_core/buffer_cache/memory_tracker.h
index ca87c7df0..ec0878c3b 100644
--- a/src/video_core/buffer_cache/memory_tracker.h
+++ b/src/video_core/buffer_cache/memory_tracker.h
@@ -62,17 +62,17 @@ public:
    }

    /// Removes all protection from a page and ensures GPU data has been flushed if requested
-    void InvalidateRegion(VAddr cpu_addr, u64 size, bool try_flush, auto&& on_flush) noexcept {
+    void InvalidateRegion(VAddr cpu_addr, u64 size, auto&& on_flush) noexcept {
        IteratePages(
-            cpu_addr, size,
-            [try_flush, &on_flush](RegionManager* manager, u64 offset, size_t size) {
+            cpu_addr, size, [&on_flush](RegionManager* manager, u64 offset, size_t size) {
                const bool should_flush = [&] {
                    // Perform both the GPU modification check and CPU state change with the lock
                    // in case we are racing with GPU thread trying to mark the page as GPU
                    // modified. If we need to flush the flush function is going to perform CPU
                    // state change.
                    std::scoped_lock lk{manager->lock};
-                    if (try_flush && manager->template IsRegionModified<Type::GPU>(offset, size)) {
+                    if (Config::readbacks() &&
+                        manager->template IsRegionModified<Type::GPU>(offset, size)) {
                        return true;
                    }
                    manager->template ChangeRegionState<Type::CPU, true>(
@@ -86,17 +86,27 @@ public:
    }

    /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
-    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func) {
+    void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, bool is_written, auto&& func,
+                            auto&& on_upload) {
        IteratePages(query_cpu_range, query_size,
                     [&func, is_written](RegionManager* manager, u64 offset, size_t size) {
-                         std::scoped_lock lk{manager->lock};
+                         manager->lock.lock();
                         manager->template ForEachModifiedRange<Type::CPU, true>(
                             manager->GetCpuAddr() + offset, size, func);
-                         if (is_written) {
-                             manager->template ChangeRegionState<Type::GPU, true>(
-                                 manager->GetCpuAddr() + offset, size);
+                         if (!is_written) {
+                             manager->lock.unlock();
                         }
                     });
+        on_upload();
+        if (!is_written) {
+            return;
+        }
+        IteratePages(query_cpu_range, query_size,
+                     [&func, is_written](RegionManager* manager, u64 offset, size_t size) {
+                         manager->template ChangeRegionState<Type::GPU, true>(
+                             manager->GetCpuAddr() + offset, size);
+                         manager->lock.unlock();
+                     });
    }

    /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp
index 5972296c0..fd1a91260 100644
--- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp
@@ -245,6 +245,46 @@ vk::BlendOp BlendOp(Liverpool::BlendControl::BlendFunc func) {
    }
}

+vk::LogicOp LogicOp(Liverpool::ColorControl::LogicOp logic_op) {
+    using LogicOp = Liverpool::ColorControl::LogicOp;
+    switch (logic_op) {
+    case LogicOp::Clear:
+        return vk::LogicOp::eClear;
+    case LogicOp::Nor:
+        return vk::LogicOp::eNor;
+    case LogicOp::AndInverted:
+        return vk::LogicOp::eAndInverted;
+    case LogicOp::CopyInverted:
+        return vk::LogicOp::eCopyInverted;
+    case LogicOp::AndReverse:
+        return vk::LogicOp::eAndReverse;
+    case LogicOp::Invert:
+        return vk::LogicOp::eInvert;
+    case LogicOp::Xor:
+        return vk::LogicOp::eXor;
+    case LogicOp::Nand:
+        return vk::LogicOp::eNand;
+    case LogicOp::And:
+        return vk::LogicOp::eAnd;
+    case LogicOp::Equiv:
+        return vk::LogicOp::eEquivalent;
+    case LogicOp::Noop:
+        return vk::LogicOp::eNoOp;
+    case LogicOp::OrInverted:
+        return vk::LogicOp::eOrInverted;
+    case LogicOp::Copy:
+        return vk::LogicOp::eCopy;
+    case LogicOp::OrReverse:
+        return vk::LogicOp::eOrReverse;
+    case LogicOp::Or:
+        return vk::LogicOp::eOr;
+    case LogicOp::Set:
+        return vk::LogicOp::eSet;
+    default:
+        UNREACHABLE_MSG("Unknown logic op {}", u32(logic_op));
+    }
+}
+
// https://github.com/chaotic-cx/mesa-mirror/blob/0954afff5/src/amd/vulkan/radv_sampler.c#L21
vk::SamplerAddressMode ClampMode(AmdGpu::ClampMode mode) {
    switch (mode) {
diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.h b/src/video_core/renderer_vulkan/liverpool_to_vk.h
index 61fd4a8c1..61b7ea0a9 100644
--- a/src/video_core/renderer_vulkan/liverpool_to_vk.h
+++ b/src/video_core/renderer_vulkan/liverpool_to_vk.h
@@ -34,6 +34,8 @@ bool IsDualSourceBlendFactor(Liverpool::BlendControl::BlendFactor factor);

vk::BlendOp BlendOp(Liverpool::BlendControl::BlendFunc func);

+vk::LogicOp LogicOp(Liverpool::ColorControl::LogicOp logic_op);
+
vk::SamplerAddressMode ClampMode(AmdGpu::ClampMode mode);

vk::CompareOp DepthCompare(AmdGpu::DepthCompare comp);
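The ROP3 values behind the new LogicOp mapping are two-operand truth tables: bit (2*src + dst) of the code gives the result for a (src, dst) bit pair, which is why 0xCC is Copy and 0xAA is Noop. A self-contained check (illustrative only, not PR code):

    #include <cassert>
    #include <cstdint>

    // Evaluate a ROP3 code bitwise with the pattern operand fixed to zero:
    // bit index (src_bit << 1) | dst_bit of the code selects the result bit.
    std::uint32_t ApplyRop3(std::uint8_t code, std::uint32_t src, std::uint32_t dst) {
        std::uint32_t out = 0;
        for (int bit = 0; bit < 32; ++bit) {
            const unsigned s = (src >> bit) & 1u, d = (dst >> bit) & 1u;
            out |= ((code >> ((s << 1) | d)) & 1u) << bit;
        }
        return out;
    }

    int main() {
        assert(ApplyRop3(0xCC, 0xDEAD, 0xBEEF) == 0xDEADu);             // Copy = src
        assert(ApplyRop3(0xAA, 0xDEAD, 0xBEEF) == 0xBEEFu);             // Noop = dst
        assert(ApplyRop3(0x66, 0xDEAD, 0xBEEF) == (0xDEADu ^ 0xBEEFu)); // Xor
        return 0;
    }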
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 7c020a012..10e5bed5f 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -72,12 +72,21 @@ GraphicsPipeline::GraphicsPipeline(

    VertexInputs<vk::VertexInputAttributeDescription> vertex_attributes;
    VertexInputs<vk::VertexInputBindingDescription> vertex_bindings;
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT> divisors;
    VertexInputs<AmdGpu::Buffer> guest_buffers;
    if (!instance.IsVertexInputDynamicState()) {
-        GetVertexInputs(vertex_attributes, vertex_bindings, guest_buffers);
+        const auto& vs_info = runtime_infos[u32(Shader::LogicalStage::Vertex)].vs_info;
+        GetVertexInputs(vertex_attributes, vertex_bindings, divisors, guest_buffers,
+                        vs_info.step_rate_0, vs_info.step_rate_1);
    }

+    const vk::PipelineVertexInputDivisorStateCreateInfo divisor_state = {
+        .vertexBindingDivisorCount = static_cast<u32>(divisors.size()),
+        .pVertexBindingDivisors = divisors.data(),
+    };
+
    const vk::PipelineVertexInputStateCreateInfo vertex_input_info = {
+        .pNext = divisors.empty() ? nullptr : &divisor_state,
        .vertexBindingDescriptionCount = static_cast<u32>(vertex_bindings.size()),
        .pVertexBindingDescriptions = vertex_bindings.data(),
        .vertexAttributeDescriptionCount = static_cast<u32>(vertex_attributes.size()),
@@ -100,28 +109,63 @@ GraphicsPipeline::GraphicsPipeline(
        .patchControlPoints = is_rect_list ? 3U : (is_quad_list ? 4U : key.patch_control_points),
    };

-    const vk::PipelineRasterizationStateCreateInfo raster_state = {
-        .depthClampEnable = false,
-        .rasterizerDiscardEnable = false,
-        .polygonMode = LiverpoolToVK::PolygonMode(key.polygon_mode),
-        .lineWidth = 1.0f,
+    vk::StructureChain<vk::PipelineRasterizationStateCreateInfo,
+                       vk::PipelineRasterizationProvokingVertexStateCreateInfoEXT,
+                       vk::PipelineRasterizationDepthClipStateCreateInfoEXT>
+        raster_chain = {
+        vk::PipelineRasterizationStateCreateInfo{
+            .depthClampEnable = key.depth_clamp_enable ||
+                                (!key.depth_clip_enable && !instance.IsDepthClipEnableSupported()),
+            .rasterizerDiscardEnable = false,
+            .polygonMode = LiverpoolToVK::PolygonMode(key.polygon_mode),
+            .lineWidth = 1.0f,
+        },
+        vk::PipelineRasterizationProvokingVertexStateCreateInfoEXT{
+            .provokingVertexMode = key.provoking_vtx_last == Liverpool::ProvokingVtxLast::First
+                                       ? vk::ProvokingVertexModeEXT::eFirstVertex
+                                       : vk::ProvokingVertexModeEXT::eLastVertex,
+        },
+        vk::PipelineRasterizationDepthClipStateCreateInfoEXT{
+            .depthClipEnable = key.depth_clip_enable,
+        },
    };

+    if (!instance.IsProvokingVertexSupported()) {
+        raster_chain.unlink<vk::PipelineRasterizationProvokingVertexStateCreateInfoEXT>();
+    }
+    if (!instance.IsDepthClipEnableSupported()) {
+        raster_chain.unlink<vk::PipelineRasterizationDepthClipStateCreateInfoEXT>();
+    }
+
    const vk::PipelineMultisampleStateCreateInfo multisampling = {
        .rasterizationSamples =
            LiverpoolToVK::NumSamples(key.num_samples, instance.GetFramebufferSampleCounts()),
        .sampleShadingEnable = false,
    };

-    const vk::PipelineViewportDepthClipControlCreateInfoEXT clip_control = {
-        .negativeOneToOne = key.clip_space == Liverpool::ClipSpace::MinusWToW,
+    const vk::DepthClampRangeEXT depth_clamp_range = {
+        .minDepthClamp = key.min_depth_clamp,
+        .maxDepthClamp = key.max_depth_clamp,
    };
-    const vk::PipelineViewportStateCreateInfo viewport_info = {
-        .pNext = instance.IsDepthClipControlSupported() ? &clip_control : nullptr,
+    vk::StructureChain<vk::PipelineViewportStateCreateInfo,
+                       vk::PipelineViewportDepthClipControlCreateInfoEXT,
+                       vk::PipelineViewportDepthClampControlCreateInfoEXT>
+        viewport_chain = {
+        vk::PipelineViewportStateCreateInfo{},
+        vk::PipelineViewportDepthClipControlCreateInfoEXT{
+            .negativeOneToOne = key.clip_space == Liverpool::ClipSpace::MinusWToW,
+        },
+        vk::PipelineViewportDepthClampControlCreateInfoEXT{
+            .depthClampMode = key.depth_clamp_user_defined_range
+                                  ? vk::DepthClampModeEXT::eUserDefinedRange
+                                  : vk::DepthClampModeEXT::eViewportRange,
+            .pDepthClampRange = &depth_clamp_range,
+        },
    };

-    boost::container::static_vector dynamic_states = {
+    if (!instance.IsDepthClampControlSupported()) {
+        viewport_chain.unlink<vk::PipelineViewportDepthClampControlCreateInfoEXT>();
+    }
+    if (!instance.IsDepthClipControlSupported()) {
+        viewport_chain.unlink<vk::PipelineViewportDepthClipControlCreateInfoEXT>();
+    }
+
+    boost::container::static_vector dynamic_states = {
        vk::DynamicState::eViewportWithCount,  vk::DynamicState::eScissorWithCount,
        vk::DynamicState::eBlendConstants,     vk::DynamicState::eDepthTestEnable,
        vk::DynamicState::eDepthWriteEnable,   vk::DynamicState::eDepthCompareOp,
@@ -129,7 +173,8 @@ GraphicsPipeline::GraphicsPipeline(
        vk::DynamicState::eStencilTestEnable,  vk::DynamicState::eStencilReference,
        vk::DynamicState::eStencilCompareMask, vk::DynamicState::eStencilWriteMask,
        vk::DynamicState::eStencilOp,          vk::DynamicState::eCullMode,
-        vk::DynamicState::eFrontFace,
+        vk::DynamicState::eFrontFace,          vk::DynamicState::eRasterizerDiscardEnable,
+        vk::DynamicState::eLineWidth,
    };

    if (instance.IsPrimitiveRestartDisableSupported()) {
@@ -212,11 +257,19 @@ GraphicsPipeline::GraphicsPipeline(
        });
    }

+    const auto depth_format =
+        instance.GetSupportedFormat(LiverpoolToVK::DepthFormat(key.z_format, key.stencil_format),
+                                    vk::FormatFeatureFlagBits2::eDepthStencilAttachment);
    const vk::PipelineRenderingCreateInfo pipeline_rendering_ci = {
        .colorAttachmentCount = key.num_color_attachments,
        .pColorAttachmentFormats = key.color_formats.data(),
-        .depthAttachmentFormat = key.depth_format,
-        .stencilAttachmentFormat = key.stencil_format,
+        .depthAttachmentFormat = key.z_format != Liverpool::DepthBuffer::ZFormat::Invalid
+                                     ? depth_format
+                                     : vk::Format::eUndefined,
+        .stencilAttachmentFormat =
+            key.stencil_format != Liverpool::DepthBuffer::StencilFormat::Invalid
+                ? depth_format
+                : vk::Format::eUndefined,
    };

    std::array<vk::PipelineColorBlendAttachmentState, Liverpool::NumColorBuffers> attachments;
@@ -271,8 +324,9 @@ GraphicsPipeline::GraphicsPipeline(
    }

    const vk::PipelineColorBlendStateCreateInfo color_blending = {
-        .logicOpEnable = false,
-        .logicOp = vk::LogicOp::eCopy,
+        .logicOpEnable =
+            instance.IsLogicOpSupported() && key.logic_op != Liverpool::ColorControl::LogicOp::Copy,
+        .logicOp = LiverpoolToVK::LogicOp(key.logic_op),
        .attachmentCount = key.num_color_attachments,
        .pAttachments = attachments.data(),
        .blendConstants = std::array{1.0f, 1.0f, 1.0f, 1.0f},
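The raster and viewport states are now built as vulkan.hpp StructureChains with every candidate extension struct linked up front, then pruned with unlink<T>() when the device lacks support; get() on the head struct stays valid either way. A minimal sketch of the idiom (assumes vulkan.hpp; the chosen structs are just an example):

    #include <vulkan/vulkan.hpp>

    // The chain owns the structs, so it must outlive pipeline creation;
    // unlink<T>() only drops T from the head's pNext list.
    void BuildRasterState(bool has_provoking_vertex) {
        vk::StructureChain<vk::PipelineRasterizationStateCreateInfo,
                           vk::PipelineRasterizationProvokingVertexStateCreateInfoEXT>
            chain{vk::PipelineRasterizationStateCreateInfo{}.setLineWidth(1.0f),
                  vk::PipelineRasterizationProvokingVertexStateCreateInfoEXT{}
                      .setProvokingVertexMode(vk::ProvokingVertexModeEXT::eLastVertex)};
        if (!has_provoking_vertex) {
            chain.unlink<vk::PipelineRasterizationProvokingVertexStateCreateInfoEXT>();
        }
        const auto& raster = chain.get<vk::PipelineRasterizationStateCreateInfo>();
        (void)raster; // would be passed as pRasterizationState
    }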
@@ -285,8 +339,8 @@ GraphicsPipeline::GraphicsPipeline(
        .pVertexInputState = !instance.IsVertexInputDynamicState() ? &vertex_input_info : nullptr,
        .pInputAssemblyState = &input_assembly,
        .pTessellationState = &tessellation_state,
-        .pViewportState = &viewport_info,
-        .pRasterizationState = &raster_state,
+        .pViewportState = &viewport_chain.get(),
+        .pRasterizationState = &raster_chain.get(),
        .pMultisampleState = &multisampling,
        .pColorBlendState = &color_blending,
        .pDynamicState = &dynamic_info,
@@ -304,19 +358,17 @@ GraphicsPipeline::GraphicsPipeline(
GraphicsPipeline::~GraphicsPipeline() = default;

template <typename Attribute, typename Binding>
-void GraphicsPipeline::GetVertexInputs(VertexInputs<Attribute>& attributes,
-                                       VertexInputs<Binding>& bindings,
-                                       VertexInputs<AmdGpu::Buffer>& guest_buffers) const {
+void GraphicsPipeline::GetVertexInputs(
+    VertexInputs<Attribute>& attributes, VertexInputs<Binding>& bindings,
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+    VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0, u32 step_rate_1) const {
+    using InstanceIdType = Shader::Gcn::VertexAttribute::InstanceIdType;
    if (!fetch_shader || fetch_shader->attributes.empty()) {
        return;
    }
    const auto& vs_info = GetStage(Shader::LogicalStage::Vertex);
    for (const auto& attrib : fetch_shader->attributes) {
-        if (attrib.UsesStepRates()) {
-            // Skip attribute binding as the data will be pulled by shader.
-            continue;
-        }
-
+        const auto step_rate = attrib.GetStepRate();
        const auto& buffer = attrib.GetSharp(vs_info);
        attributes.push_back(Attribute{
            .location = attrib.semantic,
@@ -327,12 +379,19 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs<Attribute>& attributes,
        bindings.push_back(Binding{
            .binding = attrib.semantic,
            .stride = buffer.GetStride(),
-            .inputRate = attrib.GetStepRate() == Shader::Gcn::VertexAttribute::InstanceIdType::None
-                             ? vk::VertexInputRate::eVertex
-                             : vk::VertexInputRate::eInstance,
+            .inputRate = step_rate == InstanceIdType::None ? vk::VertexInputRate::eVertex
+                                                           : vk::VertexInputRate::eInstance,
        });
+        const u32 divisor = step_rate == InstanceIdType::OverStepRate0
+                                ? step_rate_0
+                                : (step_rate == InstanceIdType::OverStepRate1 ? step_rate_1 : 1);
        if constexpr (std::is_same_v<Binding, vk::VertexInputBindingDescription2EXT>) {
-            bindings.back().divisor = 1;
+            bindings.back().divisor = divisor;
+        } else if (step_rate != InstanceIdType::None) {
+            divisors.push_back(vk::VertexInputBindingDivisorDescriptionEXT{
+                .binding = attrib.semantic,
+                .divisor = divisor,
+            });
        }
        guest_buffers.emplace_back(buffer);
    }
}
@@ -342,11 +401,13 @@ void GraphicsPipeline::GetVertexInputs(VertexInputs<Attribute>& attributes,
template void GraphicsPipeline::GetVertexInputs(
    VertexInputs<vk::VertexInputAttributeDescription>& attributes,
    VertexInputs<vk::VertexInputBindingDescription>& bindings,
-    VertexInputs<AmdGpu::Buffer>& guest_buffers) const;
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+    VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0, u32 step_rate_1) const;
template void GraphicsPipeline::GetVertexInputs(
    VertexInputs<vk::VertexInputAttributeDescription2EXT>& attributes,
    VertexInputs<vk::VertexInputBindingDescription2EXT>& bindings,
-    VertexInputs<AmdGpu::Buffer>& guest_buffers) const;
+    VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+    VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0, u32 step_rate_1) const;

void GraphicsPipeline::BuildDescSetLayout() {
    boost::container::small_vector bindings;
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
index 59230ae46..1ecfa6b42 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@@ -33,22 +33,32 @@ using VertexInputs = boost::container::static_vector<T, MaxVertexBufferCount>;

struct GraphicsPipelineKey {
    std::array<size_t, MaxShaderStages> stage_hashes;
+    std::array<vk::Format, MaxVertexBufferCount> vertex_buffer_formats;
+    u32 patch_control_points;
    u32 num_color_attachments;
    std::array<vk::Format, Liverpool::NumColorBuffers> color_formats;
    std::array<Shader::PsColorBuffer, Liverpool::NumColorBuffers> color_buffers;
-    vk::Format depth_format;
-    vk::Format stencil_format;
-
-    u32 num_samples;
-    u32 mrt_mask;
-    AmdGpu::PrimitiveType prim_type;
-    Liverpool::PolygonMode polygon_mode;
-    Liverpool::ClipSpace clip_space;
-    Liverpool::ColorBufferMask cb_shader_mask;
    std::array<Liverpool::BlendControl, Liverpool::NumColorBuffers> blend_controls;
    std::array<vk::ColorComponentFlags, Liverpool::NumColorBuffers> write_masks;
-    std::array<vk::Format, MaxVertexBufferCount> vertex_buffer_formats;
-    u32 patch_control_points;
+    Liverpool::ColorBufferMask cb_shader_mask;
+    Liverpool::ColorControl::LogicOp logic_op;
+    u32 num_samples;
+    u32 mrt_mask;
+    struct {
+        Liverpool::DepthBuffer::ZFormat z_format : 2;
+        Liverpool::DepthBuffer::StencilFormat stencil_format : 1;
+        u32 depth_clamp_enable : 1;
+        u32 depth_clamp_user_defined_range : 1;
+        float min_depth_clamp;
+        float max_depth_clamp;
+    };
+    struct {
+        AmdGpu::PrimitiveType prim_type : 5;
+        Liverpool::PolygonMode polygon_mode : 2;
+        Liverpool::ClipSpace clip_space : 1;
+        Liverpool::ProvokingVtxLast provoking_vtx_last : 1;
+        u32 depth_clip_enable : 1;
+    };

    bool operator==(const GraphicsPipelineKey& key) const noexcept {
        return std::memcmp(this, &key, sizeof(key)) == 0;
    }
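Since GraphicsPipelineKey packs enums into bit-fields and compares itself with std::memcmp, every bit of the object, padding included, must be deterministic before the comparison can be trusted. A compact illustration of the pitfall (hypothetical key layout, not the real one):

    #include <cstring>

    struct Key {
        unsigned prim_type : 5;
        unsigned polygon_mode : 2; // remaining bits of this word are padding
        float min_depth_clamp;

        bool operator==(const Key& other) const noexcept {
            return std::memcmp(this, &other, sizeof(Key)) == 0;
        }
    };

    Key MakeKey(unsigned prim, unsigned mode, float clamp) {
        Key key;
        std::memset(&key, 0, sizeof(key)); // without this, padding bits are garbage
        key.prim_type = prim;
        key.polygon_mode = mode;
        key.min_depth_clamp = clamp;
        return key;
    }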
@@ -81,7 +91,9 @@ public:
    /// Gets the attributes and bindings for vertex inputs.
    template <typename Attribute, typename Binding>
    void GetVertexInputs(VertexInputs<Attribute>& attributes, VertexInputs<Binding>& bindings,
-                         VertexInputs<AmdGpu::Buffer>& guest_buffers) const;
+                         VertexInputs<vk::VertexInputBindingDivisorDescriptionEXT>& divisors,
+                         VertexInputs<AmdGpu::Buffer>& guest_buffers, u32 step_rate_0,
+                         u32 step_rate_1) const;

private:
    void BuildDescSetLayout();
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index 237fa202d..3a461b321 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -248,6 +248,7 @@ bool Instance::CreateDevice() {
    // Required
    ASSERT(add_extension(VK_KHR_SWAPCHAIN_EXTENSION_NAME));
    ASSERT(add_extension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME));
+    ASSERT(add_extension(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME));

    // Optional
    depth_range_unrestricted = add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME);
@@ -269,10 +270,13 @@ bool Instance::CreateDevice() {
    }
    custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME);
    depth_clip_control = add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME);
+    depth_clip_enable = add_extension(VK_EXT_DEPTH_CLIP_ENABLE_EXTENSION_NAME);
+    depth_clamp_control = add_extension(VK_EXT_DEPTH_CLAMP_CONTROL_EXTENSION_NAME);
    vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME);
    list_restart = add_extension(VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME);
    fragment_shader_barycentric = add_extension(VK_KHR_FRAGMENT_SHADER_BARYCENTRIC_EXTENSION_NAME);
    legacy_vertex_attributes = add_extension(VK_EXT_LEGACY_VERTEX_ATTRIBUTES_EXTENSION_NAME);
+    provoking_vertex = add_extension(VK_EXT_PROVOKING_VERTEX_EXTENSION_NAME);
    shader_stencil_export = add_extension(VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME);
    image_load_store_lod = add_extension(VK_AMD_SHADER_IMAGE_LOAD_STORE_LOD_EXTENSION_NAME);
    amd_gcn_shader = add_extension(VK_AMD_GCN_SHADER_EXTENSION_NAME);
@@ -361,9 +365,11 @@ bool Instance::CreateDevice() {
            .dualSrcBlend = features.dualSrcBlend,
            .logicOp = features.logicOp,
            .multiDrawIndirect = features.multiDrawIndirect,
+            .depthClamp = features.depthClamp,
            .depthBiasClamp = features.depthBiasClamp,
            .fillModeNonSolid = features.fillModeNonSolid,
            .depthBounds = features.depthBounds,
+            .wideLines = features.wideLines,
            .multiViewport = features.multiViewport,
            .samplerAnisotropy = features.samplerAnisotropy,
            .vertexPipelineStoresAndAtomics = features.vertexPipelineStoresAndAtomics,
@@ -417,6 +423,12 @@ bool Instance::CreateDevice() {
        vk::PhysicalDeviceDepthClipControlFeaturesEXT{
            .depthClipControl = true,
        },
+        vk::PhysicalDeviceDepthClipEnableFeaturesEXT{
+            .depthClipEnable = true,
+        },
+        vk::PhysicalDeviceDepthClampControlFeaturesEXT{
+            .depthClampControl = true,
+        },
        vk::PhysicalDeviceRobustness2FeaturesEXT{
            .robustBufferAccess2 = robustness2_features.robustBufferAccess2,
            .robustImageAccess2 = robustness2_features.robustImageAccess2,
@@ -436,6 +448,12 @@ bool Instance::CreateDevice() {
        vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT{
            .legacyVertexAttributes = true,
        },
+        vk::PhysicalDeviceProvokingVertexFeaturesEXT{
+            .provokingVertexLast = true,
+        },
+        vk::PhysicalDeviceVertexAttributeDivisorFeatures{
+            .vertexAttributeInstanceRateDivisor = true,
+        },
        vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{
            .shaderBufferFloat32AtomicMinMax =
                shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax,
@@ -483,6 +501,12 @@ bool Instance::CreateDevice() {
    if (!depth_clip_control) {
        device_chain.unlink<vk::PhysicalDeviceDepthClipControlFeaturesEXT>();
    }
+    if (!depth_clip_enable) {
+        device_chain.unlink<vk::PhysicalDeviceDepthClipEnableFeaturesEXT>();
+    }
+    if (!depth_clamp_control) {
+        device_chain.unlink<vk::PhysicalDeviceDepthClampControlFeaturesEXT>();
+    }
    if (!robustness2) {
        device_chain.unlink<vk::PhysicalDeviceRobustness2FeaturesEXT>();
    }
@@ -498,6 +522,9 @@ bool Instance::CreateDevice() {
    if (!legacy_vertex_attributes) {
        device_chain.unlink<vk::PhysicalDeviceLegacyVertexAttributesFeaturesEXT>();
    }
+    if (!provoking_vertex) {
+        device_chain.unlink<vk::PhysicalDeviceProvokingVertexFeaturesEXT>();
+    }
    if (!shader_atomic_float2) {
        device_chain.unlink<vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>();
    }
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index c9e354186..67dcc183a 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -109,6 +109,16 @@ public:
        return depth_clip_control;
    }

+    /// Returns true when VK_EXT_depth_clip_enable is supported
+    bool IsDepthClipEnableSupported() const {
+        return depth_clip_enable;
+    }
+
+    /// Returns true when VK_EXT_depth_clamp_control is supported
+    bool IsDepthClampControlSupported() const {
+        return depth_clamp_control;
+    }
+
    /// Returns true when VK_EXT_depth_range_unrestricted is supported
    bool IsDepthRangeUnrestrictedSupported() const {
        return depth_range_unrestricted;
@@ -150,6 +160,11 @@ public:
        return legacy_vertex_attributes;
    }

+    /// Returns true when VK_EXT_provoking_vertex is supported.
+    bool IsProvokingVertexSupported() const {
+        return provoking_vertex;
+    }
+
    /// Returns true when VK_AMD_shader_image_load_store_lod is supported.
    bool IsImageLoadStoreLodSupported() const {
        return image_load_store_lod;
@@ -324,11 +339,21 @@ public:
        return properties.limits.maxViewportDimensions[0];
    }

-    /// Returns the maximum viewport height.
+    /// Returns the maximum viewport height.
    u32 GetMaxViewportHeight() const {
        return properties.limits.maxViewportDimensions[1];
    }

+    /// Returns the maximum render area width.
+    u32 GetMaxFramebufferWidth() const {
+        return properties.limits.maxFramebufferWidth;
+    }
+
+    /// Returns the maximum render area height.
+    u32 GetMaxFramebufferHeight() const {
+        return properties.limits.maxFramebufferHeight;
+    }
+
    /// Returns the sample count flags supported by framebuffers.
    vk::SampleCountFlags GetFramebufferSampleCounts() const {
        return properties.limits.framebufferColorSampleCounts &
@@ -341,6 +366,11 @@ public:
        return driver_id != vk::DriverId::eMoltenvk;
    }

+    /// Returns true if logic ops are supported by the device.
+    bool IsLogicOpSupported() const {
+        return features.logicOp;
+    }
+
    /// Determines if a format is supported for a set of feature flags.
    [[nodiscard]] bool IsFormatSupported(vk::Format format, vk::FormatFeatureFlags2 flags) const;

@@ -389,12 +419,15 @@ private:
    bool custom_border_color{};
    bool fragment_shader_barycentric{};
    bool depth_clip_control{};
+    bool depth_clip_enable{};
+    bool depth_clamp_control{};
    bool depth_range_unrestricted{};
    bool dynamic_state_3{};
    bool vertex_input_dynamic_state{};
    bool robustness2{};
    bool list_restart{};
    bool legacy_vertex_attributes{};
+    bool provoking_vertex{};
    bool shader_stencil_export{};
    bool image_load_store_lod{};
    bool amd_gcn_shader{};
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 7dd468f9a..d9e01091e 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -94,15 +94,10 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_stage) {
    switch (stage) {
    case Stage::Local: {
        BuildCommon(regs.ls_program);
-        if (regs.stage_enable.IsStageEnabled(static_cast<u32>(Stage::Hull))) {
-            info.ls_info.links_with_tcs = true;
-            Shader::TessellationDataConstantBuffer tess_constants;
-            const auto* pgm = regs.ProgramForStage(static_cast<u32>(Stage::Hull));
-            const auto params = Liverpool::GetParams(*pgm);
-            const auto& hull_info = program_cache.at(params.hash)->info;
-            hull_info.ReadTessConstantBuffer(tess_constants);
-            info.ls_info.ls_stride = tess_constants.ls_stride;
-        }
+        Shader::TessellationDataConstantBuffer tess_constants;
+        const auto* hull_info = infos[u32(Shader::LogicalStage::TessellationControl)];
+        hull_info->ReadTessConstantBuffer(tess_constants);
+        info.ls_info.ls_stride = tess_constants.ls_stride;
        break;
    }
    case Stage::Hull: {
@@ -122,6 +117,8 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_stage) {
    case Stage::Vertex: {
        BuildCommon(regs.vs_program);
        GatherVertexOutputs(info.vs_info, regs.vs_output_control);
+        info.vs_info.step_rate_0 = regs.vgt_instance_step_rate_0;
+        info.vs_info.step_rate_1 = regs.vgt_instance_step_rate_1;
        info.vs_info.emulate_depth_negative_one_to_one =
            !instance.IsDepthClipControlSupported() &&
            regs.clipper_control.clip_space == Liverpool::ClipSpace::MinusWToW;
@@ -288,26 +285,21 @@ bool PipelineCache::RefreshGraphicsKey() {
    auto& regs = liverpool->regs;
    auto& key = graphics_key;

-    const auto depth_format = instance.GetSupportedFormat(
-        LiverpoolToVK::DepthFormat(regs.depth_buffer.z_info.format,
-                                   regs.depth_buffer.stencil_info.format),
-        vk::FormatFeatureFlagBits2::eDepthStencilAttachment);
-    if (regs.depth_buffer.DepthValid()) {
-        key.depth_format = depth_format;
-    } else {
-        key.depth_format = vk::Format::eUndefined;
-    }
-    if (regs.depth_buffer.StencilValid()) {
-        key.stencil_format = depth_format;
-    } else {
-        key.stencil_format = vk::Format::eUndefined;
-    }
-
+    key.z_format = regs.depth_buffer.DepthValid() ? regs.depth_buffer.z_info.format.Value()
+                                                  : Liverpool::DepthBuffer::ZFormat::Invalid;
+    key.stencil_format = regs.depth_buffer.StencilValid()
+                             ? regs.depth_buffer.stencil_info.format.Value()
+                             : Liverpool::DepthBuffer::StencilFormat::Invalid;
+    key.depth_clip_enable = regs.clipper_control.ZclipEnable();
+    key.clip_space = regs.clipper_control.clip_space;
+    key.provoking_vtx_last = regs.polygon_control.provoking_vtx_last;
    key.prim_type = regs.primitive_type;
    key.polygon_mode = regs.polygon_control.PolyMode();
-    key.clip_space = regs.clipper_control.clip_space;
+    key.logic_op = regs.color_control.rop3;
    key.num_samples = regs.NumSamples();

+    RefreshDepthClampRange();
+
    const bool skip_cb_binding =
        regs.color_control.mode == AmdGpu::Liverpool::ColorControl::OperationMode::Disable;
@@ -460,10 +452,6 @@ bool PipelineCache::RefreshGraphicsKey() {
    // Stride will still be handled outside the pipeline using dynamic state.
    u32 vertex_binding = 0;
    for (const auto& attrib : fetch_shader->attributes) {
-        if (attrib.UsesStepRates()) {
-            // Skip attribute binding as the data will be pulled by shader.
-            continue;
-        }
        const auto& buffer = attrib.GetSharp(*vs_info);
        ASSERT(vertex_binding < MaxVertexBufferCount);
        key.vertex_buffer_formats[vertex_binding++] =
@@ -498,7 +486,63 @@ bool PipelineCache::RefreshGraphicsKey() {
    }

    return true;
-} // namespace Vulkan
+}
+
+void PipelineCache::RefreshDepthClampRange() {
+    auto& regs = liverpool->regs;
+    auto& key = graphics_key;
+
+    key.depth_clamp_enable = !regs.depth_render_override.disable_viewport_clamp;
+    if (key.z_format == Liverpool::DepthBuffer::ZFormat::Invalid || !key.depth_clamp_enable) {
+        return;
+    }
+
+    bool depth_clamp_can_use_viewport_range = true;
+    bool depth_clamp_is_same_on_all_viewports = true;
+    float zmin = std::numeric_limits<float>::max();
+    float zmax = std::numeric_limits<float>::max();
+    const auto& vp_ctl = regs.viewport_control;
+    for (u32 i = 0; i < Liverpool::NumViewports; i++) {
+        const auto& vp = regs.viewports[i];
+        const auto& vp_d = regs.viewport_depths[i];
+        if (vp.xscale == 0) {
+            continue;
+        }
+        const auto zoffset = vp_ctl.zoffset_enable ? vp.zoffset : 0.f;
+        const auto zscale = vp_ctl.zscale_enable ? vp.zscale : 1.f;
+
+        float min_depth;
+        float max_depth;
+        if (regs.clipper_control.clip_space == AmdGpu::Liverpool::ClipSpace::MinusWToW) {
+            min_depth = zoffset - zscale;
+            max_depth = zoffset + zscale;
+        } else {
+            min_depth = zoffset;
+            max_depth = zoffset + zscale;
+        }
+        if (zmin == std::numeric_limits<float>::max()) {
+            zmin = vp_d.zmin;
+            zmax = vp_d.zmax;
+        }
+        depth_clamp_is_same_on_all_viewports &= (zmin == vp_d.zmin && zmax == vp_d.zmax);
+        depth_clamp_can_use_viewport_range &= (min_depth == vp_d.zmin && max_depth == vp_d.zmax);
+    }
+
+    if (zmin == std::numeric_limits<float>::max()) {
+        return;
+    }
+
+    if (!depth_clamp_can_use_viewport_range && !depth_clamp_is_same_on_all_viewports) {
+        LOG_ERROR(Render_Vulkan,
+                  "Viewport depth clamping configuration cannot be accurately emulated");
+    }
+
+    key.depth_clamp_user_defined_range = !depth_clamp_can_use_viewport_range;
+    if (key.depth_clamp_user_defined_range) {
+        key.min_depth_clamp = zmin;
+        key.max_depth_clamp = zmax;
+    }
+}

bool PipelineCache::RefreshComputeKey() {
    Shader::Backend::Bindings binding{};
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index ba3407b48..405275439 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -76,6 +76,8 @@ private:
    bool RefreshGraphicsKey();
    bool RefreshComputeKey();

+    void RefreshDepthClampRange();
+
    void DumpShader(std::span<const u32> code, u64 hash, Shader::Stage stage, size_t perm_idx,
                    std::string_view ext);
    std::optional<std::vector<u32>> GetShaderPatch(u64 hash, Shader::Stage stage, size_t perm_idx,
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index e4e026485..b6130e873 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -20,12 +20,9 @@ namespace Vulkan {

static Shader::PushData MakeUserData(const AmdGpu::Liverpool::Regs& regs) {
-    Shader::PushData push_data{};
-    push_data.step0 = regs.vgt_instance_step_rate_0;
-    push_data.step1 = regs.vgt_instance_step_rate_1;
-
    // TODO(roamic): Add support for multiple viewports and geometry shaders when ViewportIndex
    // is encountered and implemented in the recompiler.
+    Shader::PushData push_data{};
    push_data.xoffset = regs.viewport_control.xoffset_enable ? regs.viewports[0].xoffset : 0.f;
    push_data.xscale = regs.viewport_control.xscale_enable ? regs.viewports[0].xscale : 1.f;
    push_data.yoffset = regs.viewport_control.yoffset_enable ? regs.viewports[0].yoffset : 0.f;
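RefreshDepthClampRange above compares each viewport's transform output range against its clamp range: in -W..W clip space the transform spans [zoffset - zscale, zoffset + zscale], in 0..W it spans [zoffset, zoffset + zscale]; only when the clamp range matches can the plain viewport range be used. A small worked check of that arithmetic (values illustrative, not PR code):

    #include <cassert>

    struct Range {
        float min, max;
    };

    Range TransformRange(float zoffset, float zscale, bool minus_w_to_w) {
        return minus_w_to_w ? Range{zoffset - zscale, zoffset + zscale}
                            : Range{zoffset, zoffset + zscale};
    }

    int main() {
        // A centered -1..1 depth viewport (zoffset 0.5, zscale 0.5) spans [0, 1],
        // so a [0, 1] clamp needs no user-defined range.
        const Range r = TransformRange(0.5f, 0.5f, true);
        assert(r.min == 0.0f && r.max == 1.0f);
        return 0;
    }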
@@ -113,6 +110,8 @@ RenderState Rasterizer::PrepareRenderState(u32 mrt_mask) {
    // Prefetch color and depth buffers to let texture cache handle possible overlaps with bound
    // textures (e.g. mipgen)
    RenderState state;
+    state.width = instance.GetMaxFramebufferWidth();
+    state.height = instance.GetMaxFramebufferHeight();

    cb_descs.clear();
    db_desc.reset();
@@ -272,6 +271,8 @@ void Rasterizer::EliminateFastClear() {

void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
    RENDERER_TRACE;

+    scheduler.PopPendingOperations();
+
    if (!FilterDraw()) {
        return;
    }
@@ -317,6 +318,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 stride,
                              u32 max_count, VAddr count_address) {
    RENDERER_TRACE;

+    scheduler.PopPendingOperations();
+
    if (!FilterDraw()) {
        return;
    }
@@ -380,6 +383,8 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u32 stride,
void Rasterizer::DispatchDirect() {
    RENDERER_TRACE;

+    scheduler.PopPendingOperations();
+
    const auto& cs_program = liverpool->GetCsRegs();
    const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline();
    if (!pipeline) {
@@ -407,6 +412,8 @@ void Rasterizer::DispatchDirect() {

void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) {
    RENDERER_TRACE;

+    scheduler.PopPendingOperations();
+
    const auto& cs_program = liverpool->GetCsRegs();
    const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline();
    if (!pipeline) {
@@ -439,11 +446,12 @@ void Rasterizer::Finish() {
    scheduler.Finish();
}

-void Rasterizer::ProcessFaults() {
+void Rasterizer::EndCommandList() {
    if (fault_process_pending) {
        fault_process_pending = false;
        buffer_cache.ProcessFaultBuffer();
    }
+    texture_cache.ProcessDownloadImages();
}

bool Rasterizer::BindResources(const Pipeline* pipeline) {
@@ -649,8 +657,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding,
        if (instance.IsNullDescriptorSupported()) {
            image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral);
        } else {
-            auto& null_image_view =
-                texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc.view_info);
+            auto& null_image_view = texture_cache.FindTexture(VideoCore::NULL_IMAGE_ID, desc);
            image_infos.emplace_back(VK_NULL_HANDLE, *null_image_view.image_view,
                                     vk::ImageLayout::eGeneral);
        }
@@ -664,7 +671,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding,
            bound_images.emplace_back(image_id);

            auto& image = texture_cache.GetImage(image_id);
-            auto& image_view = texture_cache.FindTexture(image_id, desc.view_info);
+            auto& image_view = texture_cache.FindTexture(image_id, desc);

            if (image.binding.force_general || image.binding.is_target) {
                image.Transit(vk::ImageLayout::eGeneral,
@@ -1007,9 +1014,10 @@ void Rasterizer::UpdateDynamicState(const GraphicsPipeline& pipeline) const {
    UpdateViewportScissorState();
    UpdateDepthStencilState();
    UpdatePrimitiveState();
+    UpdateRasterizationState();

    auto& dynamic_state = scheduler.GetDynamicState();
-    dynamic_state.SetBlendConstants(&liverpool->regs.blend_constants.red);
+    dynamic_state.SetBlendConstants(liverpool->regs.blend_constants);
    dynamic_state.SetColorWriteMasks(pipeline.GetWriteMasks());

    // Commit new dynamic state to the command buffer.
@@ -1079,12 +1087,6 @@ void Rasterizer::UpdateViewportScissorState() const {
        viewport.maxDepth = zoffset + zscale;
    }

-    if (!regs.depth_render_override.disable_viewport_clamp) {
-        // Apply depth clamp.
-        viewport.minDepth = std::max(viewport.minDepth, vp_d.zmin);
-        viewport.maxDepth = std::min(viewport.maxDepth, vp_d.zmax);
-    }
-
    if (!instance.IsDepthRangeUnrestrictedSupported()) {
        // Unrestricted depth range not supported by device. Restrict to valid range.
        viewport.minDepth = std::max(viewport.minDepth, 0.f);
@@ -1224,10 +1226,17 @@ void Rasterizer::UpdatePrimitiveState() const {
    const auto front_face = LiverpoolToVK::FrontFace(regs.polygon_control.front_face);

    dynamic_state.SetPrimitiveRestartEnabled(prim_restart);
+    dynamic_state.SetRasterizerDiscardEnabled(regs.clipper_control.dx_rasterization_kill);
    dynamic_state.SetCullMode(cull_mode);
    dynamic_state.SetFrontFace(front_face);
}

+void Rasterizer::UpdateRasterizationState() const {
+    const auto& regs = liverpool->regs;
+    auto& dynamic_state = scheduler.GetDynamicState();
+    dynamic_state.SetLineWidth(regs.line_control.Width());
+}
+
void Rasterizer::ScopeMarkerBegin(const std::string_view& str, bool from_guest) {
    if ((from_guest && !Config::getVkGuestMarkersEnabled()) ||
        (!from_guest && !Config::getVkHostMarkersEnabled())) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 4a978746c..79e7722b8 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -68,7 +68,7 @@ public:
    void CpSync();
    u64 Flush();
    void Finish();
-    void ProcessFaults();
+    void EndCommandList();

    PipelineCache& GetPipelineCache() {
        return pipeline_cache;
@@ -94,6 +94,7 @@ private:
    void UpdateViewportScissorState() const;
    void UpdateDepthStencilState() const;
    void UpdatePrimitiveState() const;
+    void UpdateRasterizationState() const;

    bool FilterDraw();

diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index e75a69924..7c3429297 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -34,16 +34,11 @@ void Scheduler::BeginRendering(const RenderState& new_state) {
    is_rendering = true;
    render_state = new_state;

-    const auto width =
-        render_state.width != std::numeric_limits<u32>::max() ? render_state.width : 1;
-    const auto height =
-        render_state.height != std::numeric_limits<u32>::max() ? render_state.height : 1;
-
    const vk::RenderingInfo rendering_info = {
        .renderArea =
            {
                .offset = {0, 0},
-                .extent = {width, height},
+                .extent = {render_state.width, render_state.height},
            },
        .layerCount = 1,
        .colorAttachmentCount = render_state.num_color_attachments,
@@ -101,6 +96,14 @@ void Scheduler::Wait(u64 tick) {
    }
}

+void Scheduler::PopPendingOperations() {
+    master_semaphore.Refresh();
+    while (!pending_ops.empty() && master_semaphore.IsFree(pending_ops.front().gpu_tick)) {
+        pending_ops.front().callback();
+        pending_ops.pop();
+    }
+}
+
void Scheduler::AllocateWorkerCommandBuffers() {
    const vk::CommandBufferBeginInfo begin_info = {
        .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
@@ -175,10 +178,7 @@ void Scheduler::SubmitExecution(SubmitInfo& info) {
    AllocateWorkerCommandBuffers();

    // Apply pending operations
-    while (!pending_ops.empty() && IsFree(pending_ops.front().gpu_tick)) {
-        pending_ops.front().callback();
-        pending_ops.pop();
-    }
+    PopPendingOperations();
}

void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) {
@@ -308,6 +308,10 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) {
            cmdbuf.setPrimitiveRestartEnable(primitive_restart_enable);
        }
    }
+    if (dirty_state.rasterizer_discard_enable) {
+        dirty_state.rasterizer_discard_enable = false;
+        cmdbuf.setRasterizerDiscardEnable(rasterizer_discard_enable);
+    }
    if (dirty_state.cull_mode) {
        dirty_state.cull_mode = false;
        cmdbuf.setCullMode(cull_mode);
@@ -318,7 +322,7 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) {
    }
    if (dirty_state.blend_constants) {
        dirty_state.blend_constants = false;
-        cmdbuf.setBlendConstants(blend_constants);
+        cmdbuf.setBlendConstants(blend_constants.data());
    }
    if (dirty_state.color_write_masks) {
        dirty_state.color_write_masks = false;
@@ -326,6 +330,10 @@ void DynamicState::Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf) {
            cmdbuf.setColorWriteMaskEXT(0, color_write_masks);
        }
    }
+    if (dirty_state.line_width) {
+        dirty_state.line_width = false;
+        cmdbuf.setLineWidth(line_width);
+    }
}

} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index 8ddf00f6a..3616d8478 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -26,8 +26,8 @@ struct RenderState {
    u32 num_color_attachments{};
    bool has_depth{};
    bool has_stencil{};
-    u32 width = std::numeric_limits<u32>::max();
-    u32 height = std::numeric_limits<u32>::max();
+    u32 width{};
+    u32 height{};

    bool operator==(const RenderState& other) const noexcept {
        return std::memcmp(this, &other, sizeof(RenderState)) == 0;
@@ -96,11 +96,13 @@ struct DynamicState {

        bool stencil_back_compare_mask : 1;
        bool primitive_restart_enable : 1;
+        bool rasterizer_discard_enable : 1;
        bool cull_mode : 1;
        bool front_face : 1;

        bool blend_constants : 1;
        bool color_write_masks : 1;
+        bool line_width : 1;
    } dirty_state{};

    Viewports viewports{};
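PopPendingOperations drains the deferred-operation queue at draw and dispatch entry points instead of only at submit time, so temporary resources (like the one-shot upload buffers above) are released as soon as the GPU tick passes. The queue discipline, reduced to a sketch (simplified types, not the scheduler's exact code):

    #include <cstdint>
    #include <functional>
    #include <queue>

    struct PendingOp {
        std::function<void()> callback;
        std::uint64_t gpu_tick;
    };

    // Ops are enqueued in tick order, so draining can stop at the first
    // entry whose tick the GPU has not reached yet.
    void PopPendingOps(std::queue<PendingOp>& ops, std::uint64_t completed_tick) {
        while (!ops.empty() && ops.front().gpu_tick <= completed_tick) {
            ops.front().callback();
            ops.pop();
        }
    }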
@@ -130,11 +132,13 @@ struct DynamicState {
    u32 stencil_back_compare_mask{};

    bool primitive_restart_enable{};
+    bool rasterizer_discard_enable{};
    vk::CullModeFlags cull_mode{};
    vk::FrontFace front_face{};

-    float blend_constants[4]{};
+    std::array<float, 4> blend_constants{};
    ColorWriteMasks color_write_masks{};
+    float line_width{};

    /// Commits the dynamic state to the provided command buffer.
    void Commit(const Instance& instance, const vk::CommandBuffer& cmdbuf);

@@ -283,19 +287,33 @@ struct DynamicState {
        }
    }

-    void SetBlendConstants(const float blend_constants_[4]) {
-        if (!std::equal(blend_constants, std::end(blend_constants), blend_constants_)) {
-            std::memcpy(blend_constants, blend_constants_, sizeof(blend_constants));
+    void SetBlendConstants(const std::array<float, 4> blend_constants_) {
+        if (blend_constants != blend_constants_) {
+            blend_constants = blend_constants_;
            dirty_state.blend_constants = true;
        }
    }

+    void SetRasterizerDiscardEnabled(const bool enabled) {
+        if (rasterizer_discard_enable != enabled) {
+            rasterizer_discard_enable = enabled;
+            dirty_state.rasterizer_discard_enable = true;
+        }
+    }
+
    void SetColorWriteMasks(const ColorWriteMasks& color_write_masks_) {
        if (!std::ranges::equal(color_write_masks, color_write_masks_)) {
            color_write_masks = color_write_masks_;
            dirty_state.color_write_masks = true;
        }
    }
+
+    void SetLineWidth(const float width) {
+        if (line_width != width) {
+            line_width = width;
+            dirty_state.line_width = true;
+        }
+    }
};

class Scheduler {
@@ -317,6 +335,9 @@ public:
    /// Waits for the given tick to trigger on the GPU.
    void Wait(u64 tick);

+    /// Attempts to execute operations whose tick the GPU has caught up with.
+    void PopPendingOperations();
+
    /// Starts a new rendering scope with provided state.
    void BeginRendering(const RenderState& new_state);

@@ -344,7 +365,11 @@ public:
    }

    /// Returns true when a tick has been triggered by the GPU.
-    [[nodiscard]] bool IsFree(u64 tick) const noexcept {
+    [[nodiscard]] bool IsFree(u64 tick) noexcept {
+        if (master_semaphore.IsFree(tick)) {
+            return true;
+        }
+        master_semaphore.Refresh();
        return master_semaphore.IsFree(tick);
    }

diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp
index a50601af6..723b95892 100644
--- a/src/video_core/texture_cache/texture_cache.cpp
+++ b/src/video_core/texture_cache/texture_cache.cpp
@@ -5,7 +5,9 @@
#include

#include "common/assert.h"
+#include "common/config.h"
#include "common/debug.h"
+#include "core/memory.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/page_manager.h"
#include "video_core/renderer_vulkan/vk_instance.h"
@@ -58,6 +60,50 @@ ImageId TextureCache::GetNullImage(const vk::Format format) {
    return null_id;
}

+void TextureCache::ProcessDownloadImages() {
+    for (const ImageId image_id : download_images) {
+        DownloadImageMemory(image_id);
+    }
+    download_images.clear();
+}
+
+void TextureCache::DownloadImageMemory(ImageId image_id) {
+    Image& image = slot_images[image_id];
+    if (False(image.flags & ImageFlagBits::GpuModified)) {
+        return;
+    }
+    auto& download_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::Download);
+    const u32 download_size = image.info.pitch * image.info.size.height *
+                              image.info.resources.layers * (image.info.num_bits / 8);
+    ASSERT(download_size <= image.info.guest_size);
+    const auto [download, offset] = download_buffer.Map(download_size);
+    download_buffer.Commit();
+    const vk::BufferImageCopy image_download = {
+        .bufferOffset = offset,
+        .bufferRowLength = image.info.pitch,
+        .bufferImageHeight = image.info.size.height,
+        .imageSubresource =
+            {
+                .aspectMask = image.info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth
+                                                          : vk::ImageAspectFlagBits::eColor,
+                .mipLevel = 0,
+                .baseArrayLayer = 0,
+                .layerCount = image.info.resources.layers,
+            },
+        .imageOffset = {0, 0, 0},
+        .imageExtent = {image.info.size.width, image.info.size.height, 1},
+    };
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {});
+    cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal,
+                             download_buffer.Handle(), image_download);
+    scheduler.DeferOperation([device_addr = image.info.guest_address, download, download_size] {
+        auto* memory = Core::Memory::Instance();
+        memory->TryWriteBacking(std::bit_cast<u8*>(device_addr), download, download_size);
+    });
+}
+
void TextureCache::MarkAsMaybeDirty(ImageId image_id, Image& image) {
    if (image.hash == 0) {
        // Initialize hash
@@ -169,7 +215,7 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, BindingType binding,

    if (recreate) {
        auto new_info = requested_info;
-        new_info.resources = std::min(requested_info.resources, cache_image.info.resources);
+        new_info.resources = std::max(requested_info.resources, cache_image.info.resources);
        const auto new_image_id = slot_images.insert(instance, scheduler, new_info);
        RegisterImage(new_image_id);
@@ -437,16 +483,27 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo& view_info) {
    return slot_image_views[view_id];
}

-ImageView& TextureCache::FindTexture(ImageId image_id, const ImageViewInfo& view_info) {
+ImageView& TextureCache::FindTexture(ImageId image_id, const BaseDesc& desc) {
    Image& image = slot_images[image_id];
+    if (desc.type == BindingType::Storage) {
+        image.flags |= ImageFlagBits::GpuModified;
+        if (Config::readbackLinearImages() &&
+            image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) {
+            download_images.emplace(image_id);
+        }
+    }
    UpdateImage(image_id);
-    return RegisterImageView(image_id, view_info);
+    return RegisterImageView(image_id, desc.view_info);
}

ImageView& TextureCache::FindRenderTarget(BaseDesc& desc) {
    const ImageId image_id = FindImage(desc);
    Image& image = slot_images[image_id];
    image.flags |= ImageFlagBits::GpuModified;
+    if (Config::readbackLinearImages() &&
+        image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) {
+        download_images.emplace(image_id);
+    }
    image.usage.render_target = 1u;
    UpdateImage(image_id);
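DownloadImageMemory sizes the readback as pitch x height x layers x bytes-per-texel, which is only meaningful for linearly tiled images; that is why the download set is gated on Display_Linear tiling. A standalone arithmetic check of the formula (numbers illustrative):

    #include <cstdint>

    // Bytes to read back from a linear image: row pitch in texels times rows,
    // layers, and bytes per texel.
    constexpr std::uint64_t DownloadSize(std::uint64_t pitch, std::uint64_t height,
                                         std::uint64_t layers, std::uint64_t num_bits) {
        return pitch * height * layers * (num_bits / 8);
    }

    static_assert(DownloadSize(1280, 720, 1, 32) == 1280ull * 720 * 4);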
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 87228b84f..ff8ffb61c 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -3,6 +3,7 @@

#pragma once

+#include <unordered_set>
#include
#include

@@ -105,11 +106,14 @@ public:
    /// Evicts any images that overlap the unmapped range.
    void UnmapMemory(VAddr cpu_addr, size_t size);

+    /// Schedules a copy of pending images for download back to CPU memory.
+    void ProcessDownloadImages();
+
    /// Retrieves the image handle of the image with the provided attributes.
    [[nodiscard]] ImageId FindImage(BaseDesc& desc, FindFlags flags = {});

    /// Retrieves an image view with the properties of the specified image id.
-    [[nodiscard]] ImageView& FindTexture(ImageId image_id, const ImageViewInfo& view_info);
+    [[nodiscard]] ImageView& FindTexture(ImageId image_id, const BaseDesc& desc);

    /// Retrieves the render target with specified properties
    [[nodiscard]] ImageView& FindRenderTarget(BaseDesc& desc);
@@ -252,6 +256,9 @@ private:
    /// Gets or creates a null image for a particular format.
    ImageId GetNullImage(vk::Format format);

+    /// Copies image memory back to CPU.
+    void DownloadImageMemory(ImageId image_id);
+
    /// Create an image from the given parameters
    [[nodiscard]] ImageId InsertImage(const ImageInfo& info, VAddr cpu_addr);

@@ -293,6 +300,7 @@ private:
    Common::SlotVector<ImageView> slot_image_views;
    tsl::robin_map<u64, Sampler> samplers;
    tsl::robin_map<vk::Format, ImageId> null_images;
+    std::unordered_set<ImageId> download_images;
    PageTable page_table;
    std::mutex mutex;