From 4d1a1ce9c2aae933a3d4e9a0a07e55eb5e5875e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?= Date: Thu, 5 Jun 2025 01:55:47 +0200 Subject: [PATCH 01/28] v_rcp_legacy_f32 (#3040) --- .../frontend/translate/translate.h | 1 + .../frontend/translate/vector_alu.cpp | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 7b4b03f27..2584d5c5e 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -204,6 +204,7 @@ public: void V_EXP_F32(const GcnInst& inst); void V_LOG_F32(const GcnInst& inst); void V_RCP_F32(const GcnInst& inst); + void V_RCP_LEGACY_F32(const GcnInst& inst); void V_RCP_F64(const GcnInst& inst); void V_RSQ_F32(const GcnInst& inst); void V_SQRT_F32(const GcnInst& inst); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index fb3f52c7f..3b88e4dec 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -158,6 +158,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { return V_LOG_F32(inst); case Opcode::V_RCP_F32: return V_RCP_F32(inst); + case Opcode::V_RCP_LEGACY_F32: + return V_RCP_LEGACY_F32(inst); case Opcode::V_RCP_F64: return V_RCP_F64(inst); case Opcode::V_RCP_IFLAG_F32: @@ -798,6 +800,20 @@ void Translator::V_RCP_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPRecip(src0)); } +void Translator::V_RCP_LEGACY_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0])}; + const auto result = ir.FPRecip(src0); + const auto inf = ir.FPIsInf(result); + + const auto raw_result = ir.ConvertFToU(32, result); + const auto sign_bit = ir.ShiftRightLogical(raw_result, ir.Imm32(31u)); + const auto sign_bit_set = ir.INotEqual(sign_bit, ir.Imm32(0u)); + const IR::F32 inf_result{ir.Select(sign_bit_set, ir.Imm32(-0.0f), ir.Imm32(0.0f))}; + const IR::F32 val{ir.Select(inf, inf_result, result)}; + + SetDst(inst.dst[0], val); +} + void Translator::V_RCP_F64(const GcnInst& inst) { const IR::F64 src0{GetSrc64(inst.src[0])}; SetDst64(inst.dst[0], ir.FPRecip(src0)); From d4fbeea085d704a94151e44a5dcf686e2d33a7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?= Date: Thu, 5 Jun 2025 02:00:11 +0200 Subject: [PATCH 02/28] Stub PM4 COPY_DATA opcode (#3032) --- src/video_core/amdgpu/liverpool.cpp | 15 ++++++-- src/video_core/amdgpu/pm4_cmds.h | 55 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 4db7648c6..118c43cef 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -394,7 +394,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span dcb, std::span(header); - LOG_DEBUG(Render_Vulkan, - "Encountered EventWrite: event_type = {}, event_index = {}", + LOG_DEBUG(Render, "Encountered EventWrite: event_type = {}, event_index = {}", magic_enum::enum_name(event->event_type.Value()), magic_enum::enum_name(event->event_index.Value())); if (event->event_type.Value() == EventType::SoVgtStreamoutFlush) { @@ -673,6 +672,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + LOG_WARNING(Render, + "unhandled IT_COPY_DATA src_sel = {}, dst_sel = {}, " + "count_sel = {}, wr_confirm = {}, engine_sel = {}", + 
u32(copy_data->src_sel.Value()), u32(copy_data->dst_sel.Value()), + copy_data->count_sel.Value(), copy_data->wr_confirm.Value(), + u32(copy_data->engine_sel.Value())); + break; + } case PM4ItOpcode::MemSemaphore: { const auto* mem_semaphore = reinterpret_cast(header); if (mem_semaphore->IsSignaling()) { diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index 58ecda93e..011e47bf0 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -554,6 +554,61 @@ struct PM4DmaData { } }; +enum class CopyDataSrc : u32 { + MappedRegister = 0, + Memory = 1, + TCL2 = 2, + Gds = 3, + // Reserved = 4, + Immediate = 5, + Atomic = 6, + GdsAtomic0 = 7, + GdsAtomic1 = 8, + GpuClock = 9, +}; + +enum class CopyDataDst : u32 { + MappedRegister = 0, + MemorySync = 1, + TCL2 = 2, + Gds = 3, + // Reserved = 4, + MemoryAsync = 5, +}; + +enum class CopyDataEngine : u32 { + Me = 0, + Pfp = 1, + Ce = 2, + // Reserved = 3 +}; + +struct PM4CmdCopyData { + PM4Type3Header header; + union { + BitField<0, 4, CopyDataSrc> src_sel; + BitField<8, 4, CopyDataDst> dst_sel; + BitField<16, 1, u32> count_sel; + BitField<20, 1, u32> wr_confirm; + BitField<30, 2, CopyDataEngine> engine_sel; + u32 control; + }; + u32 src_addr_lo; + u32 src_addr_hi; + u32 dst_addr_lo; + u32 dst_addr_hi; + + template + T SrcAddress() const { + return std::bit_cast(src_addr_lo | u64(src_addr_hi) << 32); + } + + template + T DstAddress() const { + return std::bit_cast(dst_addr_lo | u64(dst_addr_hi) << 32); + } +}; + struct PM4CmdRewind { PM4Type3Header header; union { From 285df1b5befcedb1287007ed992e1805b148025f Mon Sep 17 00:00:00 2001 From: DanielSvoboda Date: Thu, 5 Jun 2025 02:48:47 -0300 Subject: [PATCH 03/28] QT: AutoUpdate - Fix Changelog Error (#3042) --- .github/workflows/build.yml | 6 +++--- src/qt_gui/check_update.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ceb915f6a..bb3d157b7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -494,7 +494,7 @@ jobs: with: token: ${{ secrets.SHADPS4_TOKEN_REPO }} name: "Pre-release-shadPS4-${{ needs.get-info.outputs.date }}-${{ needs.get-info.outputs.shorthash }}" - tag: "Pre-release-shadPS4-${{ needs.get-info.outputs.date }}-${{ needs.get-info.outputs.shorthash }}" + tag: "Pre-release-shadPS4-${{ needs.get-info.outputs.date }}-${{ needs.get-info.outputs.fullhash }}" draft: false prerelease: true body: "Full Changelog: [${{ env.last_release_tag }}...${{ needs.get-info.outputs.shorthash }}](https://github.com/shadps4-emu/shadPS4/compare/${{ env.last_release_tag }}...${{ needs.get-info.outputs.fullhash }})" @@ -530,14 +530,14 @@ jobs: # Check if release already exists and get ID release_id=$(curl -s -H "Authorization: token $GITHUB_TOKEN" \ - "https://api.github.com/repos/$REPO/releases/tags/Pre-release-shadPS4-${{ needs.get-info.outputs.date }}-${{ needs.get-info.outputs.shorthash }}" | jq -r '.id') + "https://api.github.com/repos/$REPO/releases/tags/Pre-release-shadPS4-${{ needs.get-info.outputs.date }}-${{ needs.get-info.outputs.fullhash }}" | jq -r '.id') if [[ "$release_id" == "null" ]]; then echo "Creating release in $REPO for $filename" release_id=$(curl -s -X POST -H "Authorization: token $GITHUB_TOKEN" \ -H "Accept: application/vnd.github.v3+json" \ -d '{ - "tag_name": "Pre-release-shadPS4-${{ needs.get-info.outputs.date }}-${{ needs.get-info.outputs.shorthash }}", + "tag_name": "Pre-release-shadPS4-${{ 
needs.get-info.outputs.date }}-${{ needs.get-info.outputs.fullhash }}", "name": "Pre-release-shadPS4-${{ needs.get-info.outputs.date }}-${{ needs.get-info.outputs.shorthash }}", "draft": false, "prerelease": true, diff --git a/src/qt_gui/check_update.cpp b/src/qt_gui/check_update.cpp index 550fdddb5..b0858840a 100644 --- a/src/qt_gui/check_update.cpp +++ b/src/qt_gui/check_update.cpp @@ -137,7 +137,7 @@ tr("The Auto Updater allows up to 60 update checks per hour.\\nYou have reached } } - latestRev = latestVersion.right(7); + latestRev = latestVersion.right(40); latestDate = jsonObj["published_at"].toString(); QJsonArray assets = jsonObj["assets"].toArray(); @@ -167,7 +167,7 @@ tr("The Auto Updater allows up to 60 update checks per hour.\\nYou have reached QDateTime dateTime = QDateTime::fromString(latestDate, Qt::ISODate); latestDate = dateTime.isValid() ? dateTime.toString("yyyy-MM-dd HH:mm:ss") : "Unknown date"; - if (latestRev == currentRev.left(7)) { + if (latestRev == currentRev) { if (showMessage) { QMessageBox::information(this, tr("Auto Updater"), tr("Your version is already up to date!")); @@ -215,7 +215,7 @@ void CheckUpdate::setupUI(const QString& downloadUrl, const QString& latestDate, "%3" "(%4)" "

") - .arg(currentRev.left(7), currentDate, latestRev, latestDate); + .arg(currentRev.left(7), currentDate, latestRev.left(7), latestDate); QLabel* updateLabel = new QLabel(updateText, this); layout->addWidget(updateLabel); From 93222c6f9f01c15855b3cee23c7856b963b6b1e2 Mon Sep 17 00:00:00 2001 From: georgemoralis Date: Thu, 5 Jun 2025 08:49:32 +0300 Subject: [PATCH 04/28] New Crowdin updates (#3038) * New translations en_us.ts (Portuguese, Brazilian) * New translations en_us.ts (Turkish) --- src/qt_gui/translations/pt_BR.ts | 2 +- src/qt_gui/translations/tr_TR.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/qt_gui/translations/pt_BR.ts b/src/qt_gui/translations/pt_BR.ts index 34d31f240..9f254e272 100644 --- a/src/qt_gui/translations/pt_BR.ts +++ b/src/qt_gui/translations/pt_BR.ts @@ -2048,7 +2048,7 @@ * Unsupported Vulkan Version - * Unsupported Vulkan Version + * Versão do Vulkan não suportada diff --git a/src/qt_gui/translations/tr_TR.ts b/src/qt_gui/translations/tr_TR.ts index e61985e90..c6d641470 100644 --- a/src/qt_gui/translations/tr_TR.ts +++ b/src/qt_gui/translations/tr_TR.ts @@ -138,7 +138,7 @@ File Exists - Dosya mevcut + Dosya Mevcut File already exists. Do you want to replace it? @@ -1221,7 +1221,7 @@ Exit shadPS4 - shadPS4'ten Çık + shadPS4 Çıkış Exit the application. @@ -1381,7 +1381,7 @@ Game Boot - Oyun Başlatma + Oyun Başlat Only one file can be selected! From 0e9420a7b228f3e560ba154ad33e037358679638 Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Thu, 5 Jun 2025 08:43:39 -0500 Subject: [PATCH 05/28] Fix request queues in libSceZlib (#3041) Queues are a FIFO data structure, so pop() removes the front, not the end. --- src/core/libraries/zlib/zlib.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/libraries/zlib/zlib.cpp b/src/core/libraries/zlib/zlib.cpp index 899cb5bf6..b304992ad 100644 --- a/src/core/libraries/zlib/zlib.cpp +++ b/src/core/libraries/zlib/zlib.cpp @@ -51,7 +51,7 @@ void ZlibTaskThread(const std::stop_token& stop) { if (!task_queue_cv.wait(lock, stop, [&] { return !task_queue.empty(); })) { break; } - task = task_queue.back(); + task = task_queue.front(); task_queue.pop(); } @@ -136,7 +136,7 @@ s32 PS4_SYSV_ABI sceZlibWaitForDone(u64* request_id, const u32* timeout) { } else { done_queue_cv.wait(lock, pred); } - *request_id = done_queue.back(); + *request_id = done_queue.front(); done_queue.pop(); } return ORBIS_OK; From 3b3026ff1c98137e4f3051ec44c0eb4e1fa2f8ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=A5IGA?= <164882787+Xphalnos@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:44:02 +0200 Subject: [PATCH 06/28] [CI] Update Qt to 6.9.1 (#3037) --- .github/workflows/build.yml | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bb3d157b7..588236b14 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -76,18 +76,13 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{ runner.os }}-sdl-cache-cmake-build with: append-timestamp: false key: ${{ env.cache-name }}-${{ hashFiles('**/CMakeLists.txt', 'cmake/**') }} - - name: Setup VS Environment - uses: ilammy/msvc-dev-cmd@v1.13.0 - with: - arch: amd64 - - name: Configure CMake run: cmake --fresh -G Ninja -B ${{github.workspace}}/build 
-DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE=ON -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -111,7 +106,7 @@ jobs: - name: Setup Qt uses: jurplel/install-qt-action@v4 with: - version: 6.9.0 + version: 6.9.1 host: windows target: desktop arch: win64_msvc2022_64 @@ -130,18 +125,13 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{ runner.os }}-qt-cache-cmake-build with: append-timestamp: false key: ${{ env.cache-name }}-${{ hashFiles('**/CMakeLists.txt', 'cmake/**') }} - - name: Setup VS Environment - uses: ilammy/msvc-dev-cmd@v1.13.0 - with: - arch: amd64 - - name: Configure CMake run: cmake --fresh -G Ninja -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DENABLE_QT_GUI=ON -DENABLE_UPDATER=ON -DCMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE=ON -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -186,7 +176,7 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{runner.os}}-sdl-cache-cmake-build with: @@ -228,7 +218,7 @@ jobs: - name: Setup Qt uses: jurplel/install-qt-action@v4 with: - version: 6.9.0 + version: 6.9.1 host: mac target: desktop arch: clang_64 @@ -247,7 +237,7 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{runner.os}}-qt-cache-cmake-build with: @@ -301,7 +291,7 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{ runner.os }}-sdl-cache-cmake-build with: @@ -362,7 +352,7 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{ runner.os }}-qt-cache-cmake-build with: @@ -409,7 +399,7 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{ runner.os }}-sdl-gcc-cache-cmake-build with: @@ -445,7 +435,7 @@ jobs: ${{ env.cache-name }}- - name: Cache CMake Build - uses: hendrikmuhs/ccache-action@v1.2.17 + uses: hendrikmuhs/ccache-action@v1.2.18 env: cache-name: ${{ runner.os }}-qt-gcc-cache-cmake-build with: From 43bf4ed1bca1cd6442e97b04d9afb8383219bb86 Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:14:34 -0500 Subject: [PATCH 07/28] sceVideoOutGetResolutionStatus error behavior (#3044) --- src/core/libraries/videoout/video_out.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/core/libraries/videoout/video_out.cpp b/src/core/libraries/videoout/video_out.cpp index c5208b6dd..da715b3bf 100644 --- a/src/core/libraries/videoout/video_out.cpp +++ b/src/core/libraries/videoout/video_out.cpp @@ -282,7 +282,12 @@ s32 PS4_SYSV_ABI sceVideoOutGetVblankStatus(int handle, SceVideoOutVblankStatus* s32 PS4_SYSV_ABI sceVideoOutGetResolutionStatus(s32 handle, SceVideoOutResolutionStatus* status) { LOG_INFO(Lib_VideoOut, "called"); - *status = driver->GetPort(handle)->resolution; + auto* port = driver->GetPort(handle); + if 
(!port || !port->is_open) { + return ORBIS_VIDEO_OUT_ERROR_INVALID_HANDLE; + } + + *status = port->resolution; return ORBIS_OK; } From fff3bf9917faef7a185cae896efce87bea7b5b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?= Date: Thu, 5 Jun 2025 23:33:25 +0200 Subject: [PATCH 08/28] s_flbit_i32_b64 (#3033) * s_flbit_i32_b64 * Split FindUMsb64 into two 32bit ops --- .../backend/spirv/emit_spirv_instructions.h | 1 + .../backend/spirv/emit_spirv_integer.cpp | 14 ++++++++++++++ .../frontend/translate/scalar_alu.cpp | 13 +++++++++++++ .../frontend/translate/translate.h | 1 + src/shader_recompiler/ir/ir_emitter.cpp | 11 +++++++++-- src/shader_recompiler/ir/ir_emitter.h | 2 +- src/shader_recompiler/ir/opcodes.inc | 1 + 7 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 09f9732bf..172358866 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -372,6 +372,7 @@ Id EmitBitCount64(EmitContext& ctx, Id value); Id EmitBitwiseNot32(EmitContext& ctx, Id value); Id EmitFindSMsb32(EmitContext& ctx, Id value); Id EmitFindUMsb32(EmitContext& ctx, Id value); +Id EmitFindUMsb64(EmitContext& ctx, Id value); Id EmitFindILsb32(EmitContext& ctx, Id value); Id EmitFindILsb64(EmitContext& ctx, Id value); Id EmitSMin32(EmitContext& ctx, Id a, Id b); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp index 10bfbb2ab..1a995354d 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_integer.cpp @@ -229,6 +229,20 @@ Id EmitFindUMsb32(EmitContext& ctx, Id value) { return ctx.OpFindUMsb(ctx.U32[1], value); } +Id EmitFindUMsb64(EmitContext& ctx, Id value) { + // Vulkan restricts some bitwise operations to 32-bit only, so decompose into + // two 32-bit values and select the correct result. 
+ const Id unpacked{ctx.OpBitcast(ctx.U32[2], value)}; + const Id hi{ctx.OpCompositeExtract(ctx.U32[1], unpacked, 1U)}; + const Id lo{ctx.OpCompositeExtract(ctx.U32[1], unpacked, 0U)}; + const Id hi_msb{ctx.OpFindUMsb(ctx.U32[1], hi)}; + const Id lo_msb{ctx.OpFindUMsb(ctx.U32[1], lo)}; + const Id found_hi{ctx.OpINotEqual(ctx.U1[1], hi_msb, ctx.ConstU32(u32(-1)))}; + const Id shifted_hi{ctx.OpIAdd(ctx.U32[1], hi_msb, ctx.ConstU32(32u))}; + // value == 0 case is checked in IREmitter + return ctx.OpSelect(ctx.U32[1], found_hi, shifted_hi, lo_msb); +} + Id EmitFindILsb32(EmitContext& ctx, Id value) { return ctx.OpFindILsb(ctx.U32[1], value); } diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 3a8e894ae..7beb594c3 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -114,6 +114,8 @@ void Translator::EmitScalarAlu(const GcnInst& inst) { return S_FF1_I32_B64(inst); case Opcode::S_FLBIT_I32_B32: return S_FLBIT_I32_B32(inst); + case Opcode::S_FLBIT_I32_B64: + return S_FLBIT_I32_B64(inst); case Opcode::S_BITSET0_B32: return S_BITSET_B32(inst, 0); case Opcode::S_BITSET1_B32: @@ -686,6 +688,17 @@ void Translator::S_FLBIT_I32_B32(const GcnInst& inst) { SetDst(inst.dst[0], IR::U32{ir.Select(cond, pos_from_left, ir.Imm32(~0U))}); } +void Translator::S_FLBIT_I32_B64(const GcnInst& inst) { + const IR::U64 src0{GetSrc64(inst.src[0])}; + // Gcn wants the MSB position counting from the left, but SPIR-V counts from the rightmost (LSB) + // position + const IR::U32 msb_pos = ir.FindUMsb(src0); + const IR::U32 pos_from_left = ir.ISub(ir.Imm32(63), msb_pos); + // Select 0xFFFFFFFF if src0 was 0 + const IR::U1 cond = ir.INotEqual(src0, ir.Imm64(u64(0u))); + SetDst(inst.dst[0], IR::U32{ir.Select(cond, pos_from_left, ir.Imm32(~0U))}); +} + void Translator::S_BITSET_B32(const GcnInst& inst, u32 bit_value) { const IR::U32 old_value{GetSrc(inst.dst[0])}; const IR::U32 offset{ir.BitFieldExtract(GetSrc(inst.src[0]), ir.Imm32(0U), ir.Imm32(5U))}; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 2584d5c5e..15ba8c8d7 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -121,6 +121,7 @@ public: void S_FF1_I32_B32(const GcnInst& inst); void S_FF1_I32_B64(const GcnInst& inst); void S_FLBIT_I32_B32(const GcnInst& inst); + void S_FLBIT_I32_B64(const GcnInst& inst); void S_BITSET_B32(const GcnInst& inst, u32 bit_value); void S_GETPC_B64(u32 pc, const GcnInst& inst); void S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 01d945178..dcb734d01 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -1546,8 +1546,15 @@ U32 IREmitter::FindSMsb(const U32& value) { return Inst(Opcode::FindSMsb32, value); } -U32 IREmitter::FindUMsb(const U32& value) { - return Inst(Opcode::FindUMsb32, value); +U32 IREmitter::FindUMsb(const U32U64& value) { + switch (value.Type()) { + case Type::U32: + return Inst(Opcode::FindUMsb32, value); + case Type::U64: + return Inst(Opcode::FindUMsb64, value); + default: + ThrowInvalidType(value.Type()); + } } U32 IREmitter::FindILsb(const U32U64& value) { diff --git a/src/shader_recompiler/ir/ir_emitter.h 
b/src/shader_recompiler/ir/ir_emitter.h index 8f8a12736..da7adf42b 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -266,7 +266,7 @@ public: [[nodiscard]] U32 BitwiseNot(const U32& value); [[nodiscard]] U32 FindSMsb(const U32& value); - [[nodiscard]] U32 FindUMsb(const U32& value); + [[nodiscard]] U32 FindUMsb(const U32U64& value); [[nodiscard]] U32 FindILsb(const U32U64& value); [[nodiscard]] U32 SMin(const U32& a, const U32& b); [[nodiscard]] U32 UMin(const U32& a, const U32& b); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index ab6dbfde9..647432bcf 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -349,6 +349,7 @@ OPCODE(BitwiseNot32, U32, U32, OPCODE(FindSMsb32, U32, U32, ) OPCODE(FindUMsb32, U32, U32, ) +OPCODE(FindUMsb64, U32, U64, ) OPCODE(FindILsb32, U32, U32, ) OPCODE(FindILsb64, U32, U64, ) OPCODE(SMin32, U32, U32, U32, ) From 91d29459fb55cb0d28006639e7a38134c5a368ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?= Date: Fri, 6 Jun 2025 05:19:05 +0200 Subject: [PATCH 09/28] Implement PM4CondExec (#3046) --- src/video_core/amdgpu/liverpool.cpp | 13 +++++++++++++ src/video_core/amdgpu/pm4_cmds.h | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 118c43cef..e031d0ebc 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -765,6 +765,19 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + if (cond_exec->command.Value() != 0) { + LOG_WARNING(Render, "IT_COND_EXEC used a reserved command"); + } + const auto skip = *cond_exec->Address() == false; + if (skip) { + dcb = NextPacket(dcb, + header->type3.NumWords() + 1 + cond_exec->exec_count.Value()); + continue; + } + break; + } default: UNREACHABLE_MSG("Unknown PM4 type 3 opcode {:#x} with count {}", static_cast(opcode), count); diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index 011e47bf0..23c1b8f21 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -1159,4 +1159,25 @@ struct PM4CmdMemSemaphore { } }; +struct PM4CmdCondExec { + PM4Type3Header header; + union { + BitField<2, 30, u32> bool_addr_lo; ///< low 32 address bits for the block in memory from + ///< where the CP will fetch the condition + }; + union { + BitField<0, 16, u32> bool_addr_hi; ///< high address bits for the condition + BitField<28, 4, u32> command; + }; + union { + BitField<0, 14, u32> exec_count; ///< Number of DWords that the CP will skip + ///< if bool pointed to is zero + }; + + bool* Address() const { + return std::bit_cast(u64(bool_addr_hi.Value()) << 32 | u64(bool_addr_lo.Value()) + << 2); + } +}; + } // namespace AmdGpu From 5edd9ff54b6baca5025dff0f5491ddf0d0746a19 Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Sat, 7 Jun 2025 16:17:45 -0500 Subject: [PATCH 10/28] Improved sceKernelMapNamedFlexibleMemory logging (#3050) * More descriptive sceKernelMapNamedFlexibleMemory logging * Misc exports These functions are used by Overwatch: Origins Edition * Clang * Function parameter cleanup Changes the parameters on our sceKernelMapNamedFlexibleMemory and sceKernelMapFlexibleMemory functions to better align with our current standards. 
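* Usage sketch (editor's note, not part of the patch): with the cleaned-up
signature a caller passes a 16 KiB-multiple length and receives the mapped
address back through the in/out pointer. The snippet uses the project's
u64/s32 aliases; the protection bits and buffer name are illustrative
assumptions, not verified SDK values — only ORBIS_OK and
ORBIS_KERNEL_ERROR_EINVAL appear in the diff itself.

    // s32 sceKernelMapNamedFlexibleMemory(void** addr_in_out, u64 len, s32 prot, s32 flags, const char* name);
    void* addr = nullptr;             // nullptr lets the mapper pick a base address
    const u64 len = 16 * 1024;        // must be a non-zero multiple of 16 KiB
    const s32 prot = 0x01 | 0x02;     // assumed CPU read | write bits
    const s32 ret = sceKernelMapNamedFlexibleMemory(&addr, len, prot, 0, "my_heap");
    if (ret != ORBIS_OK) {
        // ORBIS_KERNEL_ERROR_EINVAL is returned when len is 0 or not 16 KiB aligned
    }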
--- src/core/libraries/kernel/kernel.cpp | 4 ++++ src/core/libraries/kernel/memory.cpp | 21 +++++++++------------ src/core/libraries/kernel/memory.h | 7 +++---- src/core/libraries/kernel/threads/mutex.cpp | 1 + 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/core/libraries/kernel/kernel.cpp b/src/core/libraries/kernel/kernel.cpp index 180850217..930640d0e 100644 --- a/src/core/libraries/kernel/kernel.cpp +++ b/src/core/libraries/kernel/kernel.cpp @@ -273,6 +273,10 @@ void RegisterKernel(Core::Loader::SymbolsResolver* sym) { Libraries::Net::sceNetInetNtop); // TODO fix it to sys_ ... LIB_FUNCTION("4n51s0zEf0c", "libScePosix", 1, "libkernel", 1, 1, Libraries::Net::sceNetInetPton); // TODO fix it to sys_ ... + LIB_FUNCTION("XVL8So3QJUk", "libScePosix", 1, "libkernel", 1, 1, Libraries::Net::sys_connect); + LIB_FUNCTION("3e+4Iv7IJ8U", "libScePosix", 1, "libkernel", 1, 1, Libraries::Net::sys_accept); + LIB_FUNCTION("aNeavPDNKzA", "libScePosix", 1, "libkernel", 1, 1, Libraries::Net::sys_sendmsg); + LIB_FUNCTION("pxnCmagrtao", "libScePosix", 1, "libkernel", 1, 1, Libraries::Net::sys_listen); } } // namespace Libraries::Kernel diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp index 18676cbdf..5e94199e1 100644 --- a/src/core/libraries/kernel/memory.cpp +++ b/src/core/libraries/kernel/memory.cpp @@ -222,9 +222,10 @@ s32 PS4_SYSV_ABI sceKernelMapDirectMemory2(void** addr, u64 len, s32 type, s32 p return ret; } -s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addr_in_out, std::size_t len, int prot, - int flags, const char* name) { - +s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addr_in_out, u64 len, s32 prot, s32 flags, + const char* name) { + LOG_INFO(Kernel_Vmm, "in_addr = {}, len = {:#x}, prot = {:#x}, flags = {:#x}, name = '{}'", + fmt::ptr(*addr_in_out), len, prot, flags, name); if (len == 0 || !Common::Is16KBAligned(len)) { LOG_ERROR(Kernel_Vmm, "len is 0 or not 16kb multiple"); return ORBIS_KERNEL_ERROR_EINVAL; @@ -243,18 +244,14 @@ s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addr_in_out, std::size_t const VAddr in_addr = reinterpret_cast(*addr_in_out); const auto mem_prot = static_cast(prot); const auto map_flags = static_cast(flags); - SCOPE_EXIT { - LOG_INFO(Kernel_Vmm, - "in_addr = {:#x}, out_addr = {}, len = {:#x}, prot = {:#x}, flags = {:#x}", - in_addr, fmt::ptr(*addr_in_out), len, prot, flags); - }; auto* memory = Core::Memory::Instance(); - return memory->MapMemory(addr_in_out, in_addr, len, mem_prot, map_flags, - Core::VMAType::Flexible, name); + const auto ret = memory->MapMemory(addr_in_out, in_addr, len, mem_prot, map_flags, + Core::VMAType::Flexible, name); + LOG_INFO(Kernel_Vmm, "out_addr = {}", fmt::ptr(*addr_in_out)); + return ret; } -s32 PS4_SYSV_ABI sceKernelMapFlexibleMemory(void** addr_in_out, std::size_t len, int prot, - int flags) { +s32 PS4_SYSV_ABI sceKernelMapFlexibleMemory(void** addr_in_out, u64 len, s32 prot, s32 flags) { return sceKernelMapNamedFlexibleMemory(addr_in_out, len, prot, flags, "anon"); } diff --git a/src/core/libraries/kernel/memory.h b/src/core/libraries/kernel/memory.h index 6cefe0d07..ea42e7546 100644 --- a/src/core/libraries/kernel/memory.h +++ b/src/core/libraries/kernel/memory.h @@ -141,10 +141,9 @@ s32 PS4_SYSV_ABI sceKernelAvailableDirectMemorySize(u64 searchStart, u64 searchE s32 PS4_SYSV_ABI sceKernelVirtualQuery(const void* addr, int flags, OrbisVirtualQueryInfo* info, size_t infoSize); s32 PS4_SYSV_ABI sceKernelReserveVirtualRange(void** addr, u64 len, 
int flags, u64 alignment); -s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addrInOut, std::size_t len, int prot, - int flags, const char* name); -s32 PS4_SYSV_ABI sceKernelMapFlexibleMemory(void** addr_in_out, std::size_t len, int prot, - int flags); +s32 PS4_SYSV_ABI sceKernelMapNamedFlexibleMemory(void** addr_in_out, u64 len, s32 prot, s32 flags, + const char* name); +s32 PS4_SYSV_ABI sceKernelMapFlexibleMemory(void** addr_in_out, u64 len, s32 prot, s32 flags); int PS4_SYSV_ABI sceKernelQueryMemoryProtection(void* addr, void** start, void** end, u32* prot); s32 PS4_SYSV_ABI sceKernelMprotect(const void* addr, u64 size, s32 prot); diff --git a/src/core/libraries/kernel/threads/mutex.cpp b/src/core/libraries/kernel/threads/mutex.cpp index 956e5ef65..3dbade96a 100644 --- a/src/core/libraries/kernel/threads/mutex.cpp +++ b/src/core/libraries/kernel/threads/mutex.cpp @@ -426,6 +426,7 @@ void RegisterMutex(Core::Loader::SymbolsResolver* sym) { // Posix LIB_FUNCTION("ttHNfU+qDBU", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_init); LIB_FUNCTION("7H0iTOciTLo", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_lock); + LIB_FUNCTION("Io9+nTKXZtA", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_timedlock); LIB_FUNCTION("2Z+PpY6CaJg", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_unlock); LIB_FUNCTION("ltCfaGr2JGE", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutex_destroy); LIB_FUNCTION("dQHWEsJtoE4", "libScePosix", 1, "libkernel", 1, 1, posix_pthread_mutexattr_init); From 2857ef34f037ac8e96730faabdf7dd3511c7d6af Mon Sep 17 00:00:00 2001 From: Stephen Miller <56742918+StevenMiller123@users.noreply.github.com> Date: Sun, 8 Jun 2025 13:04:43 -0500 Subject: [PATCH 11/28] Don't coalesce dmem pages (#3059) Looks like this change is what broke P.T. I'll need to look closer at this when I have a chance, clearly we're doing something wrong here. 
--- src/core/memory.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/memory.cpp b/src/core/memory.cpp index ba3640877..54cae910b 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -182,7 +182,6 @@ PAddr MemoryManager::Allocate(PAddr search_start, PAddr search_end, size_t size, auto& area = CarveDmemArea(mapping_start, size)->second; area.memory_type = memory_type; area.is_free = false; - MergeAdjacent(dmem_map, dmem_area); return mapping_start; } From 5d064dd89f17b73be3cae84bd0bf98a7b02d6997 Mon Sep 17 00:00:00 2001 From: Fire Cube Date: Sun, 8 Jun 2025 20:04:55 +0200 Subject: [PATCH 12/28] Dev Tools: Fix Module Viewer HLE detection (#3058) * fix * clang --- src/core/linker.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/core/linker.cpp b/src/core/linker.cpp index c50b03a8f..1f45caf12 100644 --- a/src/core/linker.cpp +++ b/src/core/linker.cpp @@ -332,21 +332,22 @@ bool Linker::Resolve(const std::string& name, Loader::SymbolType sym_type, Modul sr.type = sym_type; const auto* record = m_hle_symbols.FindSymbol(sr); - if (!record) { - // Check if it an export function - const auto* p = FindExportedModule(*module, *library); - if (p && p->export_sym.GetSize() > 0) { - record = p->export_sym.FindSymbol(sr); - } - } if (record) { *return_info = *record; - Core::Devtools::Widget::ModuleList::AddModule(sr.library); - return true; } + // Check if it an export function + const auto* p = FindExportedModule(*module, *library); + if (p && p->export_sym.GetSize() > 0) { + record = p->export_sym.FindSymbol(sr); + if (record) { + *return_info = *record; + return true; + } + } + const auto aeronid = AeroLib::FindByNid(sr.name.c_str()); if (aeronid) { return_info->name = aeronid->name; From 2bc199a41be994eaad6abd0db978808aac3cb2c0 Mon Sep 17 00:00:00 2001 From: Mahmoud Adel <94652220+AboMedoz@users.noreply.github.com> Date: Sun, 8 Jun 2025 21:33:08 +0300 Subject: [PATCH 13/28] black image error fix (#3051) --- src/video_core/texture_cache/texture_cache.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 63cfc4431..4b173c313 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -299,6 +299,7 @@ ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) { auto& new_image = slot_images[new_image_id]; src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + RefreshImage(new_image); new_image.CopyImage(src_image); if (src_image.binding.is_bound || src_image.binding.is_target) { From 952cef5a154c231d0b8879a6caca519cfaebb670 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Sun, 8 Jun 2025 21:38:58 +0300 Subject: [PATCH 14/28] shader_recompiler: Implement dual source blending (#3054) --- .../backend/spirv/spirv_emit_context.cpp | 15 +++++++++++++-- .../frontend/translate/export.cpp | 10 ++++++++-- src/shader_recompiler/runtime_info.h | 2 ++ .../renderer_vulkan/liverpool_to_vk.cpp | 13 +++++++++++++ src/video_core/renderer_vulkan/liverpool_to_vk.h | 2 ++ .../renderer_vulkan/vk_pipeline_cache.cpp | 9 +++++++++ 6 files changed, 47 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 68bfcc0d0..bd10fd3df 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ 
b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -634,7 +634,8 @@ void EmitContext::DefineOutputs() { } break; } - case LogicalStage::Fragment: + case LogicalStage::Fragment: { + u32 num_render_targets = 0; for (u32 i = 0; i < IR::NumRenderTargets; i++) { const IR::Attribute mrt{IR::Attribute::RenderTarget0 + i}; if (!info.stores.GetAny(mrt)) { @@ -643,11 +644,21 @@ void EmitContext::DefineOutputs() { const u32 num_components = info.stores.NumComponents(mrt); const AmdGpu::NumberFormat num_format{runtime_info.fs_info.color_buffers[i].num_format}; const Id type{GetAttributeType(*this, num_format)[num_components]}; - const Id id{DefineOutput(type, i)}; + Id id; + if (runtime_info.fs_info.dual_source_blending) { + id = DefineOutput(type, 0); + Decorate(id, spv::Decoration::Index, i); + } else { + id = DefineOutput(type, i); + } Name(id, fmt::format("frag_color{}", i)); frag_outputs[i] = GetAttributeInfo(num_format, id, num_components, true); + ++num_render_targets; } + ASSERT_MSG(!runtime_info.fs_info.dual_source_blending || num_render_targets == 2, + "Dual source blending enabled, there must be exactly two MRT exports"); break; + } case LogicalStage::Geometry: { output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); diff --git a/src/shader_recompiler/frontend/translate/export.cpp b/src/shader_recompiler/frontend/translate/export.cpp index 0abef2e81..8a99f38a9 100644 --- a/src/shader_recompiler/frontend/translate/export.cpp +++ b/src/shader_recompiler/frontend/translate/export.cpp @@ -26,8 +26,11 @@ void Translator::ExportMrtValue(IR::Attribute attribute, u32 comp, const IR::F32 } void Translator::ExportMrtCompressed(IR::Attribute attribute, u32 idx, const IR::U32& value) { - const u32 color_buffer_idx = + u32 color_buffer_idx = static_cast(attribute) - static_cast(IR::Attribute::RenderTarget0); + if (runtime_info.fs_info.dual_source_blending && attribute == IR::Attribute::RenderTarget1) { + color_buffer_idx = 0; + } const auto color_buffer = runtime_info.fs_info.color_buffers[color_buffer_idx]; AmdGpu::NumberFormat num_format; @@ -68,8 +71,11 @@ void Translator::ExportMrtCompressed(IR::Attribute attribute, u32 idx, const IR: } void Translator::ExportMrtUncompressed(IR::Attribute attribute, u32 comp, const IR::F32& value) { - const u32 color_buffer_idx = + u32 color_buffer_idx = static_cast(attribute) - static_cast(IR::Attribute::RenderTarget0); + if (runtime_info.fs_info.dual_source_blending && attribute == IR::Attribute::RenderTarget1) { + color_buffer_idx = 0; + } const auto color_buffer = runtime_info.fs_info.color_buffers[color_buffer_idx]; const auto swizzled_comp = SwizzleMrtComponent(color_buffer, comp); diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index b8ed42f5b..53d2d5303 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -196,11 +196,13 @@ struct FragmentRuntimeInfo { u32 num_inputs; std::array inputs; std::array color_buffers; + bool dual_source_blending; bool operator==(const FragmentRuntimeInfo& other) const noexcept { return std::ranges::equal(color_buffers, other.color_buffers) && en_flags.raw == other.en_flags.raw && addr_flags.raw == other.addr_flags.raw && num_inputs == other.num_inputs && + dual_source_blending == other.dual_source_blending && std::ranges::equal(inputs.begin(), inputs.begin() + num_inputs, other.inputs.begin(), other.inputs.begin() + num_inputs); } diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp 
b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index a6ae0c304..5972296c0 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -214,6 +214,19 @@ vk::BlendFactor BlendFactor(Liverpool::BlendControl::BlendFactor factor) { } } +bool IsDualSourceBlendFactor(Liverpool::BlendControl::BlendFactor factor) { + using BlendFactor = Liverpool::BlendControl::BlendFactor; + switch (factor) { + case BlendFactor::Src1Color: + case BlendFactor::Src1Alpha: + case BlendFactor::InvSrc1Color: + case BlendFactor::InvSrc1Alpha: + return true; + default: + return false; + } +} + vk::BlendOp BlendOp(Liverpool::BlendControl::BlendFunc func) { using BlendFunc = Liverpool::BlendControl::BlendFunc; switch (func) { diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.h b/src/video_core/renderer_vulkan/liverpool_to_vk.h index fca0a8378..61fd4a8c1 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.h +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.h @@ -30,6 +30,8 @@ vk::FrontFace FrontFace(Liverpool::FrontFace mode); vk::BlendFactor BlendFactor(Liverpool::BlendControl::BlendFactor factor); +bool IsDualSourceBlendFactor(Liverpool::BlendControl::BlendFactor factor); + vk::BlendOp BlendOp(Liverpool::BlendControl::BlendFunc func); vk::SamplerAddressMode ClampMode(AmdGpu::ClampMode mode); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index d7ad47a3c..b72f77e55 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -158,6 +158,15 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS info.fs_info.addr_flags = regs.ps_input_addr; const auto& ps_inputs = regs.ps_inputs; info.fs_info.num_inputs = regs.num_interp; + const auto& cb0_blend = regs.blend_control[0]; + info.fs_info.dual_source_blending = + LiverpoolToVK::IsDualSourceBlendFactor(cb0_blend.color_dst_factor) || + LiverpoolToVK::IsDualSourceBlendFactor(cb0_blend.color_src_factor); + if (cb0_blend.separate_alpha_blend) { + info.fs_info.dual_source_blending |= + LiverpoolToVK::IsDualSourceBlendFactor(cb0_blend.alpha_dst_factor) || + LiverpoolToVK::IsDualSourceBlendFactor(cb0_blend.alpha_src_factor); + } for (u32 i = 0; i < regs.num_interp; i++) { info.fs_info.inputs[i] = { .param_index = u8(ps_inputs[i].input_offset.Value()), From ce84e80f65745c9f00981e4dbcfe79ef1a11cfe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?= Date: Sun, 8 Jun 2025 20:43:58 +0200 Subject: [PATCH 15/28] BUFFER_ATOMIC_CMPSWAP (#3045) --- .../backend/spirv/emit_spirv_atomic.cpp | 22 +++++++++++++++++++ .../backend/spirv/emit_spirv_instructions.h | 2 ++ .../frontend/translate/vector_memory.cpp | 4 ++++ src/shader_recompiler/ir/ir_emitter.cpp | 5 +++++ src/shader_recompiler/ir/ir_emitter.h | 3 +++ src/shader_recompiler/ir/opcodes.inc | 1 + 6 files changed, 37 insertions(+) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index d7c73ca8f..a342b47b6 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -68,6 +68,22 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id }); } +Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, + Id cmp_value, + Id (Sirit::Module::*atomic_func)(Id, Id, 
Id, Id, Id, Id, Id)) { + const auto& buffer = ctx.buffers[handle]; + if (Sirit::ValidId(buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + } + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(2u)); + const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; + const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [scope, semantics]{AtomicArgs(ctx)}; + return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] { + return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, semantics, value, cmp_value); + }); +} + Id ImageAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& texture = ctx.images[handle & 0xFFFF]; @@ -175,6 +191,12 @@ Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicExchange); } +Id EmitBufferAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, + Id cmp_value) { + return BufferAtomicU32CmpSwap(ctx, inst, handle, address, value, cmp_value, + &Sirit::Module::OpAtomicCompareExchange); +} + Id EmitImageAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value) { return ImageAtomicU32(ctx, inst, handle, coords, value, &Sirit::Module::OpAtomicIAdd); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 172358866..b9707224c 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -96,6 +96,8 @@ Id EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addres Id EmitBufferAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicCmpSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, + Id cmp_value); Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp); diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 5c972c607..8c035f26c 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -331,6 +331,10 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) { switch (op) { case AtomicOp::Swap: return ir.BufferAtomicSwap(handle, address, vdata_val, buffer_info); + case AtomicOp::CmpSwap: { + IR::Value cmp_val = ir.GetVectorReg(vdata + 1); + return ir.BufferAtomicCmpSwap(handle, address, vdata_val, cmp_val, buffer_info); + } case AtomicOp::Add: return ir.BufferAtomicIAdd(handle, address, vdata_val, buffer_info); case AtomicOp::Smin: diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index dcb734d01..07249edfe 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -513,6 +513,11 @@ Value IREmitter::BufferAtomicSwap(const Value& handle, const 
Value& address, con return Inst(Opcode::BufferAtomicSwap32, Flags{info}, handle, address, value); } +Value IREmitter::BufferAtomicCmpSwap(const Value& handle, const Value& address, const Value& vdata, + const Value& cmp_value, BufferInstInfo info) { + return Inst(Opcode::BufferAtomicCmpSwap32, Flags{info}, handle, address, vdata, cmp_value); +} + U32 IREmitter::DataAppend(const U32& counter) { return Inst(Opcode::DataAppend, counter, Imm32(0)); } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index da7adf42b..7b9b81093 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -150,6 +150,9 @@ public: const Value& value, BufferInstInfo info); [[nodiscard]] Value BufferAtomicSwap(const Value& handle, const Value& address, const Value& value, BufferInstInfo info); + [[nodiscard]] Value BufferAtomicCmpSwap(const Value& handle, const Value& address, + const Value& value, const Value& cmp_value, + BufferInstInfo info); [[nodiscard]] U32 DataAppend(const U32& counter); [[nodiscard]] U32 DataConsume(const U32& counter); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 647432bcf..5b3216be6 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -126,6 +126,7 @@ OPCODE(BufferAtomicAnd32, U32, Opaq OPCODE(BufferAtomicOr32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicXor32, U32, Opaque, Opaque, U32, ) OPCODE(BufferAtomicSwap32, U32, Opaque, Opaque, U32, ) +OPCODE(BufferAtomicCmpSwap32, U32, Opaque, Opaque, U32, U32, ) // Vector utility OPCODE(CompositeConstructU32x2, U32x2, U32, U32, ) From f2bbb6847dc22037c50a5723acd70195d30245c9 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Sun, 8 Jun 2025 11:53:11 -0700 Subject: [PATCH 16/28] fix: Missing switch case for BUFFER_ATOMIC_CMPSWAP --- src/shader_recompiler/frontend/translate/vector_memory.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 8c035f26c..5eb2079a4 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -70,6 +70,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { return BUFFER_ATOMIC(AtomicOp::Add, inst); case Opcode::BUFFER_ATOMIC_SWAP: return BUFFER_ATOMIC(AtomicOp::Swap, inst); + case Opcode::BUFFER_ATOMIC_CMPSWAP: + return BUFFER_ATOMIC(AtomicOp::CmpSwap, inst); case Opcode::BUFFER_ATOMIC_SMIN: return BUFFER_ATOMIC(AtomicOp::Smin, inst); case Opcode::BUFFER_ATOMIC_UMIN: @@ -332,7 +334,7 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) { case AtomicOp::Swap: return ir.BufferAtomicSwap(handle, address, vdata_val, buffer_info); case AtomicOp::CmpSwap: { - IR::Value cmp_val = ir.GetVectorReg(vdata + 1); + const IR::Value cmp_val = ir.GetVectorReg(vdata + 1); return ir.BufferAtomicCmpSwap(handle, address, vdata_val, cmp_val, buffer_info); } case AtomicOp::Add: From a07a6bb9d3519fe128538a0c5d537e1e1e5bfbe1 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Sun, 8 Jun 2025 22:14:09 +0300 Subject: [PATCH 17/28] buffer_cache: Better image search for buffer validation (#3057) --- src/video_core/buffer_cache/buffer_cache.cpp | 51 ++++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.cpp 
b/src/video_core/buffer_cache/buffer_cache.cpp index 4717a5ff8..8a5283d83 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -798,24 +798,45 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, } bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) { - static constexpr FindFlags find_flags = - FindFlags::NoCreate | FindFlags::RelaxDim | FindFlags::RelaxFmt | FindFlags::RelaxSize; - TextureCache::BaseDesc desc{}; - desc.info.guest_address = device_addr; - desc.info.guest_size = size; - const ImageId image_id = texture_cache.FindImage(desc, find_flags); - if (!image_id) { + boost::container::small_vector image_ids; + texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) { + if (image.info.guest_address != device_addr) { + return; + } + // Only perform sync if image is: + // - GPU modified; otherwise there are no changes to synchronize. + // - Not CPU dirty; otherwise we could overwrite CPU changes with stale GPU changes. + // - Not GPU dirty; otherwise we could overwrite GPU changes with stale image data. + if (False(image.flags & ImageFlagBits::GpuModified) || + True(image.flags & ImageFlagBits::Dirty)) { + return; + } + image_ids.push_back(image_id); + }); + if (image_ids.empty()) { return false; } + ImageId image_id{}; + if (image_ids.size() == 1) { + // Sometimes image size might not exactly match with requested buffer size + // If we only found 1 candidate image use it without too many questions. + image_id = image_ids[0]; + } else { + for (s32 i = 0; i < image_ids.size(); ++i) { + Image& image = texture_cache.GetImage(image_ids[i]); + if (image.info.guest_size == size) { + image_id = image_ids[i]; + break; + } + } + if (!image_id) { + LOG_WARNING(Render_Vulkan, + "Failed to find exact image match for copy addr={:#x}, size={:#x}", + device_addr, size); + return false; + } + } Image& image = texture_cache.GetImage(image_id); - // Only perform sync if image is: - // - GPU modified; otherwise there are no changes to synchronize. - // - Not CPU dirty; otherwise we could overwrite CPU changes with stale GPU changes. - // - Not GPU dirty; otherwise we could overwrite GPU changes with stale image data. 
- if (False(image.flags & ImageFlagBits::GpuModified) || - True(image.flags & ImageFlagBits::Dirty)) { - return false; - } ASSERT_MSG(device_addr == image.info.guest_address, "Texel buffer aliases image subresources {:x} : {:x}", device_addr, image.info.guest_address); From 5004e41100d6749dec1775ce335d7de3d93b46a8 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios Date: Sun, 8 Jun 2025 22:29:33 +0300 Subject: [PATCH 18/28] Patch movntss and movntsd (#3049) * Patch movntss and movntsd * clang-format * Deduplication * Allow rep to be in other places --- src/core/cpu_patches.cpp | 53 +++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 8937ef04b..8512858e9 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -88,7 +88,8 @@ static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { dst_op.reg.value <= ZYDIS_REGISTER_R15; } -static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { +static void GenerateTcbAccess(void* /* address */, const ZydisDecodedOperand* operands, + Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); #if defined(_WIN32) @@ -126,7 +127,8 @@ static bool FilterNoSSE4a(const ZydisDecodedOperand*) { return !cpu.has(Cpu::tSSE4a); } -static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { +static void GenerateEXTRQ(void* /* address */, const ZydisDecodedOperand* operands, + Xbyak::CodeGenerator& c) { bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; @@ -245,7 +247,8 @@ static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenera } } -static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { +static void GenerateINSERTQ(void* /* address */, const ZydisDecodedOperand* operands, + Xbyak::CodeGenerator& c) { bool immediateForm = operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && operands[3].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; @@ -383,8 +386,44 @@ static void GenerateINSERTQ(const ZydisDecodedOperand* operands, Xbyak::CodeGene } } +static void ReplaceMOVNT(void* address, u8 rep_prefix) { + // Find the opcode byte + // There can be any amount of prefixes but the instruction can't be more than 15 bytes + // And we know for sure this is a MOVNTSS/MOVNTSD + bool found = false; + bool rep_prefix_found = false; + int index = 0; + u8* ptr = reinterpret_cast(address); + for (int i = 0; i < 15; i++) { + if (ptr[i] == rep_prefix) { + rep_prefix_found = true; + } else if (ptr[i] == 0x2B) { + index = i; + found = true; + break; + } + } + + // Some sanity checks + ASSERT(found); + ASSERT(index >= 2); + ASSERT(ptr[index - 1] == 0x0F); + ASSERT(rep_prefix_found); + + // This turns the MOVNTSS/MOVNTSD to a MOVSS/MOVSD m, xmm + ptr[index] = 0x11; +} + +static void ReplaceMOVNTSS(void* address, const ZydisDecodedOperand*, Xbyak::CodeGenerator&) { + ReplaceMOVNT(address, 0xF3); +} + +static void ReplaceMOVNTSD(void* address, const ZydisDecodedOperand*, Xbyak::CodeGenerator&) { + ReplaceMOVNT(address, 0xF2); +} + using PatchFilter = bool (*)(const ZydisDecodedOperand*); -using InstructionGenerator = void (*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&); +using InstructionGenerator = void (*)(void*, const ZydisDecodedOperand*, Xbyak::CodeGenerator&); struct PatchInfo { /// Filter for more granular patch conditions past just the instruction 
mnemonic. PatchFilter filter; @@ -400,6 +439,8 @@ static const std::unordered_map Patches = { // SSE4a {ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}}, {ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}}, + {ZYDIS_MNEMONIC_MOVNTSS, {FilterNoSSE4a, ReplaceMOVNTSS, false}}, + {ZYDIS_MNEMONIC_MOVNTSD, {FilterNoSSE4a, ReplaceMOVNTSD, false}}, #if defined(_WIN32) // Windows needs a trampoline. @@ -477,7 +518,7 @@ static std::pair TryPatch(u8* code, PatchModule* module) { auto& trampoline_gen = module->trampoline_gen; const auto trampoline_ptr = trampoline_gen.getCurr(); - patch_info.generator(operands, trampoline_gen); + patch_info.generator(code, operands, trampoline_gen); // Return to the following instruction at the end of the trampoline. trampoline_gen.jmp(code + instruction.length); @@ -485,7 +526,7 @@ static std::pair TryPatch(u8* code, PatchModule* module) { // Replace instruction with near jump to the trampoline. patch_gen.jmp(trampoline_ptr, Xbyak::CodeGenerator::LabelType::T_NEAR); } else { - patch_info.generator(operands, patch_gen); + patch_info.generator(code, operands, patch_gen); } const auto patch_size = patch_gen.getCurr() - code; From 8ffcfc87bd9bcd8396cde82eec9daf2a250fd018 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Sun, 8 Jun 2025 22:46:34 +0300 Subject: [PATCH 19/28] shader_recompiler: Implement linear interpolation support (#3055) --- .../backend/spirv/spirv_emit_context.cpp | 23 +- .../backend/spirv/spirv_emit_context.h | 4 +- .../frontend/structured_control_flow.cpp | 12 +- .../frontend/translate/translate.cpp | 135 ++++---- .../frontend/translate/translate.h | 16 +- .../translate/vector_interpolation.cpp | 5 +- src/shader_recompiler/info.h | 4 + src/shader_recompiler/ir/attribute.h | 19 ++ src/shader_recompiler/ir/ir_emitter.cpp | 1 - src/shader_recompiler/ir/ir_emitter.h | 2 +- src/shader_recompiler/ir/reg.h | 2 +- src/video_core/amdgpu/pixel_format.h | 300 +++++++++++++++++- src/video_core/amdgpu/resource.h | 1 - src/video_core/amdgpu/types.h | 276 ---------------- 14 files changed, 425 insertions(+), 375 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index bd10fd3df..9e51f8e60 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -307,7 +307,9 @@ void EmitContext::DefineInterpolatedAttribs() { const Id p2{OpCompositeExtract(F32[4], p_array, 2U)}; const Id p10{OpFSub(F32[4], p1, p0)}; const Id p20{OpFSub(F32[4], p2, p0)}; - const Id bary_coord{OpLoad(F32[3], gl_bary_coord_id)}; + const Id bary_coord{OpLoad(F32[3], IsLinear(info.interp_qualifiers[i]) + ? 
bary_coord_linear_id + : bary_coord_persp_id)}; const Id bary_coord_y{OpCompositeExtract(F32[1], bary_coord, 1)}; const Id bary_coord_z{OpCompositeExtract(F32[1], bary_coord, 2)}; const Id p10_y{OpVectorTimesScalar(F32[4], p10, bary_coord_y)}; @@ -411,8 +413,14 @@ void EmitContext::DefineInputs() { DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); } if (profile.needs_manual_interpolation) { - gl_bary_coord_id = - DefineVariable(F32[3], spv::BuiltIn::BaryCoordKHR, spv::StorageClass::Input); + if (info.has_perspective_interp) { + bary_coord_persp_id = + DefineVariable(F32[3], spv::BuiltIn::BaryCoordKHR, spv::StorageClass::Input); + } + if (info.has_linear_interp) { + bary_coord_linear_id = DefineVariable(F32[3], spv::BuiltIn::BaryCoordNoPerspKHR, + spv::StorageClass::Input); + } } for (s32 i = 0; i < runtime_info.fs_info.num_inputs; i++) { const auto& input = runtime_info.fs_info.inputs[i]; @@ -435,9 +443,12 @@ void EmitContext::DefineInputs() { } else { attr_id = DefineInput(type, semantic); Name(attr_id, fmt::format("fs_in_attr{}", semantic)); - } - if (input.is_flat) { - Decorate(attr_id, spv::Decoration::Flat); + + if (input.is_flat) { + Decorate(attr_id, spv::Decoration::Flat); + } else if (IsLinear(info.interp_qualifiers[i])) { + Decorate(attr_id, spv::Decoration::NoPerspective); + } } input_params[semantic] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components, false); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index a2e0d2f47..20d936cf0 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -293,8 +293,8 @@ public: Id shared_memory_u32_type{}; - Id interpolate_func{}; - Id gl_bary_coord_id{}; + Id bary_coord_persp_id{}; + Id bary_coord_linear_id{}; struct TextureDefinition { const VectorIds* data_types; diff --git a/src/shader_recompiler/frontend/structured_control_flow.cpp b/src/shader_recompiler/frontend/structured_control_flow.cpp index 11b40d07c..1a7a43f4d 100644 --- a/src/shader_recompiler/frontend/structured_control_flow.cpp +++ b/src/shader_recompiler/frontend/structured_control_flow.cpp @@ -605,11 +605,12 @@ public: Info& info_, const RuntimeInfo& runtime_info_, const Profile& profile_) : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, syntax_list{syntax_list_}, inst_list{inst_list_}, info{info_}, - runtime_info{runtime_info_}, profile{profile_} { + runtime_info{runtime_info_}, profile{profile_}, + translator{info_, runtime_info_, profile_} { Visit(root_stmt, nullptr, nullptr); - IR::Block& first_block{*syntax_list.front().data.block}; - Translator{&first_block, info, runtime_info, profile}.EmitPrologue(); + IR::Block* first_block = syntax_list.front().data.block; + translator.EmitPrologue(first_block); } private: @@ -637,8 +638,8 @@ private: current_block->has_multiple_predecessors = stmt.block->num_predecessors > 1; const u32 start = stmt.block->begin_index; const u32 size = stmt.block->end_index - start + 1; - Translate(current_block, stmt.block->begin, inst_list.subspan(start, size), - info, runtime_info, profile); + translator.Translate(current_block, stmt.block->begin, + inst_list.subspan(start, size)); } break; } @@ -820,6 +821,7 @@ private: Info& info; const RuntimeInfo& runtime_info; const Profile& profile; + Translator translator; }; } // Anonymous namespace diff --git a/src/shader_recompiler/frontend/translate/translate.cpp 
b/src/shader_recompiler/frontend/translate/translate.cpp index 5675adf3c..5853f3e72 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -21,16 +21,60 @@ namespace Shader::Gcn { -static u32 next_vgpr_num; -static std::unordered_map vgpr_map; - -Translator::Translator(IR::Block* block_, Info& info_, const RuntimeInfo& runtime_info_, - const Profile& profile_) - : ir{*block_, block_->begin()}, info{info_}, runtime_info{runtime_info_}, profile{profile_} { - next_vgpr_num = vgpr_map.empty() ? runtime_info.num_allocated_vgprs : next_vgpr_num; +Translator::Translator(Info& info_, const RuntimeInfo& runtime_info_, const Profile& profile_) + : info{info_}, runtime_info{runtime_info_}, profile{profile_}, + next_vgpr_num{runtime_info.num_allocated_vgprs} { + if (info.l_stage == LogicalStage::Fragment) { + dst_frag_vreg = GatherInterpQualifiers(); + } } -void Translator::EmitPrologue() { +IR::VectorReg Translator::GatherInterpQualifiers() { + u32 dst_vreg{}; + if (runtime_info.fs_info.addr_flags.persp_sample_ena) { + vgpr_to_interp[dst_vreg++] = IR::Interpolation::PerspectiveSample; // I + vgpr_to_interp[dst_vreg++] = IR::Interpolation::PerspectiveSample; // J + info.has_perspective_interp = true; + } + if (runtime_info.fs_info.addr_flags.persp_center_ena) { + vgpr_to_interp[dst_vreg++] = IR::Interpolation::PerspectiveCenter; // I + vgpr_to_interp[dst_vreg++] = IR::Interpolation::PerspectiveCenter; // J + info.has_perspective_interp = true; + } + if (runtime_info.fs_info.addr_flags.persp_centroid_ena) { + vgpr_to_interp[dst_vreg++] = IR::Interpolation::PerspectiveCentroid; // I + vgpr_to_interp[dst_vreg++] = IR::Interpolation::PerspectiveCentroid; // J + info.has_perspective_interp = true; + } + if (runtime_info.fs_info.addr_flags.persp_pull_model_ena) { + ++dst_vreg; // I/W + ++dst_vreg; // J/W + ++dst_vreg; // 1/W + } + if (runtime_info.fs_info.addr_flags.linear_sample_ena) { + vgpr_to_interp[dst_vreg++] = IR::Interpolation::LinearSample; // I + vgpr_to_interp[dst_vreg++] = IR::Interpolation::LinearSample; // J + info.has_linear_interp = true; + } + if (runtime_info.fs_info.addr_flags.linear_center_ena) { + vgpr_to_interp[dst_vreg++] = IR::Interpolation::LinearCenter; // I + vgpr_to_interp[dst_vreg++] = IR::Interpolation::LinearCenter; // J + info.has_linear_interp = true; + } + if (runtime_info.fs_info.addr_flags.linear_centroid_ena) { + vgpr_to_interp[dst_vreg++] = IR::Interpolation::LinearCentroid; // I + vgpr_to_interp[dst_vreg++] = IR::Interpolation::LinearCentroid; // J + info.has_linear_interp = true; + } + if (runtime_info.fs_info.addr_flags.line_stipple_tex_ena) { + ++dst_vreg; + } + return IR::VectorReg(dst_vreg); +} + +void Translator::EmitPrologue(IR::Block* first_block) { + ir = IR::IREmitter(*first_block, first_block->begin()); + ir.Prologue(); ir.SetExec(ir.Imm1(true)); @@ -60,39 +104,7 @@ void Translator::EmitPrologue() { } break; case LogicalStage::Fragment: - dst_vreg = IR::VectorReg::V0; - if (runtime_info.fs_info.addr_flags.persp_sample_ena) { - ++dst_vreg; // I - ++dst_vreg; // J - } - if (runtime_info.fs_info.addr_flags.persp_center_ena) { - ++dst_vreg; // I - ++dst_vreg; // J - } - if (runtime_info.fs_info.addr_flags.persp_centroid_ena) { - ++dst_vreg; // I - ++dst_vreg; // J - } - if (runtime_info.fs_info.addr_flags.persp_pull_model_ena) { - ++dst_vreg; // I/W - ++dst_vreg; // J/W - ++dst_vreg; // 1/W - } - if (runtime_info.fs_info.addr_flags.linear_sample_ena) { - ++dst_vreg; // I - ++dst_vreg; 
// J - } - if (runtime_info.fs_info.addr_flags.linear_center_ena) { - ++dst_vreg; // I - ++dst_vreg; // J - } - if (runtime_info.fs_info.addr_flags.linear_centroid_ena) { - ++dst_vreg; // I - ++dst_vreg; // J - } - if (runtime_info.fs_info.addr_flags.line_stipple_tex_ena) { - ++dst_vreg; - } + dst_vreg = dst_frag_vreg; if (runtime_info.fs_info.addr_flags.pos_x_float_ena) { if (runtime_info.fs_info.en_flags.pos_x_float_ena) { ir.SetVectorReg(dst_vreg++, ir.GetAttribute(IR::Attribute::FragCoord, 0)); @@ -543,6 +555,26 @@ void Translator::LogMissingOpcode(const GcnInst& inst) { info.translation_failed = true; } +void Translator::Translate(IR::Block* block, u32 pc, std::span inst_list) { + if (inst_list.empty()) { + return; + } + ir = IR::IREmitter{*block, block->begin()}; + for (const auto& inst : inst_list) { + pc += inst.length; + + // Special case for emitting fetch shader. + if (inst.opcode == Opcode::S_SWAPPC_B64) { + ASSERT(info.stage == Stage::Vertex || info.stage == Stage::Export || + info.stage == Stage::Local); + EmitFetch(inst); + continue; + } + + TranslateInstruction(inst, pc); + } +} + void Translator::TranslateInstruction(const GcnInst& inst, const u32 pc) { // Emit instructions for each category. switch (inst.category) { @@ -577,25 +609,4 @@ void Translator::TranslateInstruction(const GcnInst& inst, const u32 pc) { } } -void Translate(IR::Block* block, u32 pc, std::span inst_list, Info& info, - const RuntimeInfo& runtime_info, const Profile& profile) { - if (inst_list.empty()) { - return; - } - Translator translator{block, info, runtime_info, profile}; - for (const auto& inst : inst_list) { - pc += inst.length; - - // Special case for emitting fetch shader. - if (inst.opcode == Opcode::S_SWAPPC_B64) { - ASSERT(info.stage == Stage::Vertex || info.stage == Stage::Export || - info.stage == Stage::Local); - translator.EmitFetch(inst); - continue; - } - - translator.TranslateInstruction(inst, pc); - } -} - } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 15ba8c8d7..f8ffb9638 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -53,15 +53,17 @@ enum class NegateMode : u32 { Result, }; +static constexpr size_t MaxInterpVgpr = 16; + class Translator { public: - explicit Translator(IR::Block* block_, Info& info, const RuntimeInfo& runtime_info, - const Profile& profile); + explicit Translator(Info& info, const RuntimeInfo& runtime_info, const Profile& profile); + void Translate(IR::Block* block, u32 pc, std::span inst_list); void TranslateInstruction(const GcnInst& inst, u32 pc); // Instruction categories - void EmitPrologue(); + void EmitPrologue(IR::Block* first_block); void EmitFetch(const GcnInst& inst); void EmitExport(const GcnInst& inst); void EmitFlowControl(u32 pc, const GcnInst& inst); @@ -326,16 +328,18 @@ private: void LogMissingOpcode(const GcnInst& inst); IR::VectorReg GetScratchVgpr(u32 offset); + IR::VectorReg GatherInterpQualifiers(); private: IR::IREmitter ir; Info& info; const RuntimeInfo& runtime_info; const Profile& profile; + u32 next_vgpr_num; + std::unordered_map vgpr_map; + std::array vgpr_to_interp{}; + IR::VectorReg dst_frag_vreg{}; bool opcode_missing = false; }; -void Translate(IR::Block* block, u32 block_base, std::span inst_list, Info& info, - const RuntimeInfo& runtime_info, const Profile& profile); - } // namespace Shader::Gcn diff --git 
a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp index 431cb2f04..2d7297c12 100644 --- a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp +++ b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp @@ -22,13 +22,14 @@ void Translator::EmitVectorInterpolation(const GcnInst& inst) { // VINTRP void Translator::V_INTERP_P2_F32(const GcnInst& inst) { - auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); + const auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); + info.interp_qualifiers[attr.param_index] = vgpr_to_interp[inst.src[0].code]; const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; SetDst(inst.dst[0], ir.GetAttribute(attrib, inst.control.vintrp.chan)); } void Translator::V_INTERP_MOV_F32(const GcnInst& inst) { - auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); + const auto& attr = runtime_info.fs_info.inputs.at(inst.control.vintrp.attr); const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; SetDst(inst.dst[0], ir.GetAttribute(attrib, inst.control.vintrp.chan)); } diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 24e0741c1..e14c7988d 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -193,6 +193,8 @@ struct Info { PersistentSrtInfo srt_info; std::vector flattened_ud_buf; + std::array interp_qualifiers{}; + IR::ScalarReg tess_consts_ptr_base = IR::ScalarReg::Max; s32 tess_consts_dword_offset = -1; @@ -206,6 +208,8 @@ struct Info { bool has_discard{}; bool has_image_gather{}; bool has_image_query{}; + bool has_perspective_interp{}; + bool has_linear_interp{}; bool uses_atomic_float_min_max{}; bool uses_lane_id{}; bool uses_group_quad{}; diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h index 5117f5650..68472f052 100644 --- a/src/shader_recompiler/ir/attribute.h +++ b/src/shader_recompiler/ir/attribute.h @@ -83,6 +83,16 @@ enum class Attribute : u64 { Max, }; +enum class Interpolation { + Invalid = 0, + PerspectiveSample = 1, + PerspectiveCenter = 2, + PerspectiveCentroid = 3, + LinearSample = 4, + LinearCenter = 5, + LinearCentroid = 6, +}; + constexpr size_t NumAttributes = static_cast(Attribute::Max); constexpr size_t NumRenderTargets = 8; constexpr size_t NumParams = 32; @@ -104,6 +114,15 @@ constexpr bool IsMrt(Attribute attribute) noexcept { return attribute >= Attribute::RenderTarget0 && attribute <= Attribute::RenderTarget7; } +constexpr bool IsLinear(Interpolation interp) noexcept { + return interp >= Interpolation::LinearSample && interp <= Interpolation::LinearCentroid; +} + +constexpr bool IsPerspective(Interpolation interp) noexcept { + return interp >= Interpolation::PerspectiveSample && + interp <= Interpolation::PerspectiveCentroid; +} + [[nodiscard]] std::string NameOf(Attribute attribute); [[nodiscard]] constexpr Attribute operator+(Attribute attr, int num) { diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 07249edfe..e6cc32829 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include -#include #include #include #include "common/assert.h" diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 7b9b81093..0e41f4b2d 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ 
b/src/shader_recompiler/ir/ir_emitter.h @@ -6,7 +6,6 @@ #include #include -#include "shader_recompiler/info.h" #include "shader_recompiler/ir/attribute.h" #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/condition.h" @@ -17,6 +16,7 @@ namespace Shader::IR { class IREmitter { public: + explicit IREmitter() = default; explicit IREmitter(Block& block_) : block{&block_}, insertion_point{block->end()} {} explicit IREmitter(Block& block_, Block::iterator insertion_point_) : block{&block_}, insertion_point{insertion_point_} {} diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h index 82aa436a7..c534eecd8 100644 --- a/src/shader_recompiler/ir/reg.h +++ b/src/shader_recompiler/ir/reg.h @@ -7,7 +7,7 @@ #include "common/bit_field.h" #include "common/enum.h" #include "common/types.h" -#include "video_core/amdgpu/types.h" +#include "video_core/amdgpu/pixel_format.h" namespace Shader::IR { diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 38c81ba5f..faba8e285 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -5,34 +5,310 @@ #include #include +#include "common/assert.h" #include "common/types.h" -#include "video_core/amdgpu/types.h" namespace AmdGpu { -enum NumberClass { +// Table 8.13 Data and Image Formats [Sea Islands Series Instruction Set Architecture] +enum class DataFormat : u32 { + FormatInvalid = 0, + Format8 = 1, + Format16 = 2, + Format8_8 = 3, + Format32 = 4, + Format16_16 = 5, + Format10_11_11 = 6, + Format11_11_10 = 7, + Format10_10_10_2 = 8, + Format2_10_10_10 = 9, + Format8_8_8_8 = 10, + Format32_32 = 11, + Format16_16_16_16 = 12, + Format32_32_32 = 13, + Format32_32_32_32 = 14, + Format5_6_5 = 16, + Format1_5_5_5 = 17, + Format5_5_5_1 = 18, + Format4_4_4_4 = 19, + Format8_24 = 20, + Format24_8 = 21, + FormatX24_8_32 = 22, + FormatGB_GR = 32, + FormatBG_RG = 33, + Format5_9_9_9 = 34, + FormatBc1 = 35, + FormatBc2 = 36, + FormatBc3 = 37, + FormatBc4 = 38, + FormatBc5 = 39, + FormatBc6 = 40, + FormatBc7 = 41, + FormatFmask8_1 = 47, + FormatFmask8_2 = 48, + FormatFmask8_4 = 49, + FormatFmask16_1 = 50, + FormatFmask16_2 = 51, + FormatFmask32_2 = 52, + FormatFmask32_4 = 53, + FormatFmask32_8 = 54, + FormatFmask64_4 = 55, + FormatFmask64_8 = 56, + Format4_4 = 57, + Format6_5_5 = 58, + Format1 = 59, + Format1_Reversed = 60, + Format32_As_8 = 61, + Format32_As_8_8 = 62, + Format32_As_32_32_32_32 = 63, +}; + +enum class NumberFormat : u32 { + Unorm = 0, + Snorm = 1, + Uscaled = 2, + Sscaled = 3, + Uint = 4, + Sint = 5, + SnormNz = 6, + Float = 7, + Srgb = 9, + Ubnorm = 10, + UbnormNz = 11, + Ubint = 12, + Ubscaled = 13, +}; + +enum class NumberClass { Float, Sint, Uint, }; -[[nodiscard]] constexpr NumberClass GetNumberClass(const NumberFormat nfmt) { - switch (nfmt) { - case NumberFormat::Sint: - return Sint; - case NumberFormat::Uint: - return Uint; +enum class CompSwizzle : u8 { + Zero = 0, + One = 1, + Red = 4, + Green = 5, + Blue = 6, + Alpha = 7, +}; + +enum class NumberConversion : u32 { + None = 0, + UintToUscaled = 1, + SintToSscaled = 2, + UnormToUbnorm = 3, + Sint8ToSnormNz = 4, + Sint16ToSnormNz = 5, + Uint32ToUnorm = 6, +}; + +struct CompMapping { + CompSwizzle r; + CompSwizzle g; + CompSwizzle b; + CompSwizzle a; + + auto operator<=>(const CompMapping& other) const = default; + + template + [[nodiscard]] std::array Apply(const std::array& data) const { + return { + ApplySingle(data, r), + ApplySingle(data, g), + ApplySingle(data, b), + 
ApplySingle(data, a), + }; + } + + [[nodiscard]] CompMapping Inverse() const { + CompMapping result{}; + InverseSingle(result.r, CompSwizzle::Red); + InverseSingle(result.g, CompSwizzle::Green); + InverseSingle(result.b, CompSwizzle::Blue); + InverseSingle(result.a, CompSwizzle::Alpha); + return result; + } + +private: + template + T ApplySingle(const std::array& data, const CompSwizzle swizzle) const { + switch (swizzle) { + case CompSwizzle::Zero: + return T(0); + case CompSwizzle::One: + return T(1); + case CompSwizzle::Red: + return data[0]; + case CompSwizzle::Green: + return data[1]; + case CompSwizzle::Blue: + return data[2]; + case CompSwizzle::Alpha: + return data[3]; + default: + UNREACHABLE(); + } + } + + void InverseSingle(CompSwizzle& dst, const CompSwizzle target) const { + if (r == target) { + dst = CompSwizzle::Red; + } else if (g == target) { + dst = CompSwizzle::Green; + } else if (b == target) { + dst = CompSwizzle::Blue; + } else if (a == target) { + dst = CompSwizzle::Alpha; + } else { + dst = CompSwizzle::Zero; + } + } +}; + +static constexpr CompMapping IdentityMapping = { + .r = CompSwizzle::Red, + .g = CompSwizzle::Green, + .b = CompSwizzle::Blue, + .a = CompSwizzle::Alpha, +}; + +constexpr DataFormat RemapDataFormat(const DataFormat format) { + switch (format) { + case DataFormat::Format11_11_10: + return DataFormat::Format10_11_11; + case DataFormat::Format10_10_10_2: + return DataFormat::Format2_10_10_10; + case DataFormat::Format5_5_5_1: + return DataFormat::Format1_5_5_5; default: - return Float; + return format; } } -[[nodiscard]] constexpr bool IsInteger(const NumberFormat nfmt) { +constexpr NumberFormat RemapNumberFormat(const NumberFormat format, const DataFormat data_format) { + switch (format) { + case NumberFormat::Unorm: { + switch (data_format) { + case DataFormat::Format32: + case DataFormat::Format32_32: + case DataFormat::Format32_32_32: + case DataFormat::Format32_32_32_32: + return NumberFormat::Uint; + default: + return format; + } + } + case NumberFormat::Uscaled: + return NumberFormat::Uint; + case NumberFormat::Sscaled: + case NumberFormat::SnormNz: + return NumberFormat::Sint; + case NumberFormat::Ubnorm: + return NumberFormat::Unorm; + case NumberFormat::Float: + if (data_format == DataFormat::Format8) { + // Games may ask for 8-bit float when they want to access the stencil component + // of a depth-stencil image. Change to unsigned int to match the stencil format. + // This is also the closest approximation to pass the bits through unconverted. + return NumberFormat::Uint; + } + [[fallthrough]]; + default: + return format; + } +} + +constexpr CompMapping RemapSwizzle(const DataFormat format, const CompMapping swizzle) { + switch (format) { + case DataFormat::Format1_5_5_5: + case DataFormat::Format11_11_10: { + CompMapping result; + result.r = swizzle.b; + result.g = swizzle.g; + result.b = swizzle.r; + result.a = swizzle.a; + return result; + } + case DataFormat::Format10_10_10_2: { + CompMapping result; + result.r = swizzle.a; + result.g = swizzle.b; + result.b = swizzle.g; + result.a = swizzle.r; + return result; + } + case DataFormat::Format4_4_4_4: { + // Remap to a more supported component order. 
+ CompMapping result; + result.r = swizzle.g; + result.g = swizzle.b; + result.b = swizzle.a; + result.a = swizzle.r; + return result; + } + default: + return swizzle; + } +} + +constexpr NumberConversion MapNumberConversion(const NumberFormat num_fmt, + const DataFormat data_fmt) { + switch (num_fmt) { + case NumberFormat::Unorm: { + switch (data_fmt) { + case DataFormat::Format32: + case DataFormat::Format32_32: + case DataFormat::Format32_32_32: + case DataFormat::Format32_32_32_32: + return NumberConversion::Uint32ToUnorm; + default: + return NumberConversion::None; + } + } + case NumberFormat::Uscaled: + return NumberConversion::UintToUscaled; + case NumberFormat::Sscaled: + return NumberConversion::SintToSscaled; + case NumberFormat::Ubnorm: + return NumberConversion::UnormToUbnorm; + case NumberFormat::SnormNz: { + switch (data_fmt) { + case DataFormat::Format8: + case DataFormat::Format8_8: + case DataFormat::Format8_8_8_8: + return NumberConversion::Sint8ToSnormNz; + case DataFormat::Format16: + case DataFormat::Format16_16: + case DataFormat::Format16_16_16_16: + return NumberConversion::Sint16ToSnormNz; + default: + UNREACHABLE_MSG("data_fmt = {}", u32(data_fmt)); + } + } + default: + return NumberConversion::None; + } +} + +constexpr NumberClass GetNumberClass(const NumberFormat nfmt) { + switch (nfmt) { + case NumberFormat::Sint: + return NumberClass::Sint; + case NumberFormat::Uint: + return NumberClass::Uint; + default: + return NumberClass::Float; + } +} + +constexpr bool IsInteger(const NumberFormat nfmt) { return nfmt == AmdGpu::NumberFormat::Sint || nfmt == AmdGpu::NumberFormat::Uint; } -[[nodiscard]] std::string_view NameOf(DataFormat fmt); -[[nodiscard]] std::string_view NameOf(NumberFormat fmt); +std::string_view NameOf(DataFormat fmt); +std::string_view NameOf(NumberFormat fmt); int NumComponents(DataFormat format); int NumBits(DataFormat format); diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index 89ac04f9a..5ede90200 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -6,7 +6,6 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/bit_field.h" -#include "common/types.h" #include "video_core/amdgpu/pixel_format.h" namespace AmdGpu { diff --git a/src/video_core/amdgpu/types.h b/src/video_core/amdgpu/types.h index f7536f7e2..009fbbbb2 100644 --- a/src/video_core/amdgpu/types.h +++ b/src/video_core/amdgpu/types.h @@ -5,7 +5,6 @@ #include #include -#include "common/assert.h" #include "common/types.h" namespace AmdGpu { @@ -114,281 +113,6 @@ enum class GsOutputPrimitiveType : u32 { TriangleStrip = 2, }; -// Table 8.13 Data and Image Formats [Sea Islands Series Instruction Set Architecture] -enum class DataFormat : u32 { - FormatInvalid = 0, - Format8 = 1, - Format16 = 2, - Format8_8 = 3, - Format32 = 4, - Format16_16 = 5, - Format10_11_11 = 6, - Format11_11_10 = 7, - Format10_10_10_2 = 8, - Format2_10_10_10 = 9, - Format8_8_8_8 = 10, - Format32_32 = 11, - Format16_16_16_16 = 12, - Format32_32_32 = 13, - Format32_32_32_32 = 14, - Format5_6_5 = 16, - Format1_5_5_5 = 17, - Format5_5_5_1 = 18, - Format4_4_4_4 = 19, - Format8_24 = 20, - Format24_8 = 21, - FormatX24_8_32 = 22, - FormatGB_GR = 32, - FormatBG_RG = 33, - Format5_9_9_9 = 34, - FormatBc1 = 35, - FormatBc2 = 36, - FormatBc3 = 37, - FormatBc4 = 38, - FormatBc5 = 39, - FormatBc6 = 40, - FormatBc7 = 41, - FormatFmask8_1 = 47, - FormatFmask8_2 = 48, - FormatFmask8_4 = 49, - FormatFmask16_1 = 50, - FormatFmask16_2 = 51, - 
FormatFmask32_2 = 52, - FormatFmask32_4 = 53, - FormatFmask32_8 = 54, - FormatFmask64_4 = 55, - FormatFmask64_8 = 56, - Format4_4 = 57, - Format6_5_5 = 58, - Format1 = 59, - Format1_Reversed = 60, - Format32_As_8 = 61, - Format32_As_8_8 = 62, - Format32_As_32_32_32_32 = 63, -}; - -enum class NumberFormat : u32 { - Unorm = 0, - Snorm = 1, - Uscaled = 2, - Sscaled = 3, - Uint = 4, - Sint = 5, - SnormNz = 6, - Float = 7, - Srgb = 9, - Ubnorm = 10, - UbnormNz = 11, - Ubint = 12, - Ubscaled = 13, -}; - -enum class CompSwizzle : u8 { - Zero = 0, - One = 1, - Red = 4, - Green = 5, - Blue = 6, - Alpha = 7, -}; - -enum class NumberConversion : u32 { - None = 0, - UintToUscaled = 1, - SintToSscaled = 2, - UnormToUbnorm = 3, - Sint8ToSnormNz = 4, - Sint16ToSnormNz = 5, - Uint32ToUnorm = 6, -}; - -struct CompMapping { - CompSwizzle r; - CompSwizzle g; - CompSwizzle b; - CompSwizzle a; - - auto operator<=>(const CompMapping& other) const = default; - - template - [[nodiscard]] std::array Apply(const std::array& data) const { - return { - ApplySingle(data, r), - ApplySingle(data, g), - ApplySingle(data, b), - ApplySingle(data, a), - }; - } - - [[nodiscard]] CompMapping Inverse() const { - CompMapping result{}; - InverseSingle(result.r, CompSwizzle::Red); - InverseSingle(result.g, CompSwizzle::Green); - InverseSingle(result.b, CompSwizzle::Blue); - InverseSingle(result.a, CompSwizzle::Alpha); - return result; - } - -private: - template - T ApplySingle(const std::array& data, const CompSwizzle swizzle) const { - switch (swizzle) { - case CompSwizzle::Zero: - return T(0); - case CompSwizzle::One: - return T(1); - case CompSwizzle::Red: - return data[0]; - case CompSwizzle::Green: - return data[1]; - case CompSwizzle::Blue: - return data[2]; - case CompSwizzle::Alpha: - return data[3]; - default: - UNREACHABLE(); - } - } - - void InverseSingle(CompSwizzle& dst, const CompSwizzle target) const { - if (r == target) { - dst = CompSwizzle::Red; - } else if (g == target) { - dst = CompSwizzle::Green; - } else if (b == target) { - dst = CompSwizzle::Blue; - } else if (a == target) { - dst = CompSwizzle::Alpha; - } else { - dst = CompSwizzle::Zero; - } - } -}; - -static constexpr CompMapping IdentityMapping = { - .r = CompSwizzle::Red, - .g = CompSwizzle::Green, - .b = CompSwizzle::Blue, - .a = CompSwizzle::Alpha, -}; - -inline DataFormat RemapDataFormat(const DataFormat format) { - switch (format) { - case DataFormat::Format11_11_10: - return DataFormat::Format10_11_11; - case DataFormat::Format10_10_10_2: - return DataFormat::Format2_10_10_10; - case DataFormat::Format5_5_5_1: - return DataFormat::Format1_5_5_5; - default: - return format; - } -} - -inline NumberFormat RemapNumberFormat(const NumberFormat format, const DataFormat data_format) { - switch (format) { - case NumberFormat::Unorm: { - switch (data_format) { - case DataFormat::Format32: - case DataFormat::Format32_32: - case DataFormat::Format32_32_32: - case DataFormat::Format32_32_32_32: - return NumberFormat::Uint; - default: - return format; - } - } - case NumberFormat::Uscaled: - return NumberFormat::Uint; - case NumberFormat::Sscaled: - case NumberFormat::SnormNz: - return NumberFormat::Sint; - case NumberFormat::Ubnorm: - return NumberFormat::Unorm; - case NumberFormat::Float: - if (data_format == DataFormat::Format8) { - // Games may ask for 8-bit float when they want to access the stencil component - // of a depth-stencil image. Change to unsigned int to match the stencil format. 
- // This is also the closest approximation to pass the bits through unconverted. - return NumberFormat::Uint; - } - [[fallthrough]]; - default: - return format; - } -} - -inline CompMapping RemapSwizzle(const DataFormat format, const CompMapping swizzle) { - switch (format) { - case DataFormat::Format1_5_5_5: - case DataFormat::Format11_11_10: { - CompMapping result; - result.r = swizzle.b; - result.g = swizzle.g; - result.b = swizzle.r; - result.a = swizzle.a; - return result; - } - case DataFormat::Format10_10_10_2: { - CompMapping result; - result.r = swizzle.a; - result.g = swizzle.b; - result.b = swizzle.g; - result.a = swizzle.r; - return result; - } - case DataFormat::Format4_4_4_4: { - // Remap to a more supported component order. - CompMapping result; - result.r = swizzle.g; - result.g = swizzle.b; - result.b = swizzle.a; - result.a = swizzle.r; - return result; - } - default: - return swizzle; - } -} - -inline NumberConversion MapNumberConversion(const NumberFormat num_fmt, const DataFormat data_fmt) { - switch (num_fmt) { - case NumberFormat::Unorm: { - switch (data_fmt) { - case DataFormat::Format32: - case DataFormat::Format32_32: - case DataFormat::Format32_32_32: - case DataFormat::Format32_32_32_32: - return NumberConversion::Uint32ToUnorm; - default: - return NumberConversion::None; - } - } - case NumberFormat::Uscaled: - return NumberConversion::UintToUscaled; - case NumberFormat::Sscaled: - return NumberConversion::SintToSscaled; - case NumberFormat::Ubnorm: - return NumberConversion::UnormToUbnorm; - case NumberFormat::SnormNz: { - switch (data_fmt) { - case DataFormat::Format8: - case DataFormat::Format8_8: - case DataFormat::Format8_8_8_8: - return NumberConversion::Sint8ToSnormNz; - case DataFormat::Format16: - case DataFormat::Format16_16: - case DataFormat::Format16_16_16_16: - return NumberConversion::Sint16ToSnormNz; - default: - UNREACHABLE_MSG("data_fmt = {}", u32(data_fmt)); - } - } - default: - return NumberConversion::None; - } -} - } // namespace AmdGpu template <> From ce42eccc9d9b629b2e25296d42f7d686d53cfb25 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Sun, 8 Jun 2025 23:09:08 +0300 Subject: [PATCH 20/28] texture_cache: Handle compressed views of uncompressed images (#3056) * pixel_format: Remove unused tables, refactor * host_compatibilty: Cleanup and support uncompressed views of compressed formats * texture_cache: Handle compressed views of uncompressed images * tile_manager: Bump max supported mips to 16 Fixes a crash during start * oops * texture_cache: Fix order of format compat check --- CMakeLists.txt | 1 + .../ir/passes/lower_buffer_format_to_raw.cpp | 2 +- src/video_core/amdgpu/liverpool.h | 2 +- src/video_core/amdgpu/pixel_format.cpp | 214 +++++----- src/video_core/amdgpu/pixel_format.h | 6 +- .../host_shaders/detilers/micro_128bpp.comp | 2 +- .../host_shaders/detilers/micro_16bpp.comp | 2 +- .../host_shaders/detilers/micro_32bpp.comp | 2 +- .../host_shaders/detilers/micro_64bpp.comp | 2 +- .../host_shaders/detilers/micro_8bpp.comp | 2 +- .../texture_cache/host_compatibility.cpp | 220 ++++++++++ .../texture_cache/host_compatibility.h | 380 +----------------- src/video_core/texture_cache/image.cpp | 65 +-- src/video_core/texture_cache/image_info.cpp | 79 +++- src/video_core/texture_cache/image_info.h | 25 +- .../texture_cache/texture_cache.cpp | 25 +- src/video_core/texture_cache/tile_manager.cpp | 14 +- 17 files changed, 434 insertions(+), 609 deletions(-) create mode 100644 src/video_core/texture_cache/host_compatibility.cpp diff --git 
a/CMakeLists.txt b/CMakeLists.txt index 20d33ac95..6dfe9348a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -964,6 +964,7 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/texture_cache/tile_manager.cpp src/video_core/texture_cache/tile_manager.h src/video_core/texture_cache/types.h + src/video_core/texture_cache/host_compatibility.cpp src/video_core/texture_cache/host_compatibility.h src/video_core/page_manager.cpp src/video_core/page_manager.h diff --git a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp index 65be02541..fcb86e3fb 100644 --- a/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp +++ b/src/shader_recompiler/ir/passes/lower_buffer_format_to_raw.cpp @@ -15,7 +15,7 @@ struct FormatInfo { AmdGpu::NumberFormat num_format; AmdGpu::CompMapping swizzle; AmdGpu::NumberConversion num_conversion; - int num_components; + u32 num_components; }; static bool IsBufferFormatLoad(const IR::Inst& inst) { diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 245e34d35..2f33c7302 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -914,7 +914,7 @@ struct Liverpool { } size_t GetColorSliceSize() const { - const auto num_bytes_per_element = NumBits(info.format) / 8u; + const auto num_bytes_per_element = NumBitsPerBlock(info.format) / 8u; const auto slice_size = num_bytes_per_element * (slice.tile_max + 1) * 64u * NumSamples(); return slice_size; diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp index 881c33e44..682cdf357 100644 --- a/src/video_core/amdgpu/pixel_format.cpp +++ b/src/video_core/amdgpu/pixel_format.cpp @@ -111,136 +111,106 @@ std::string_view NameOf(NumberFormat fmt) { } } -int NumComponents(DataFormat format) { - constexpr std::array num_components_per_element = { - 0, 1, 1, 2, 1, 2, 3, 3, 4, 4, 4, 2, 4, 3, 4, -1, 3, 4, 4, 4, 2, - 2, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, 3, 4, 4, 4, 1, 2, 3, 4, - -1, -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1, 1}; - - const u32 index = static_cast(format); - if (index >= num_components_per_element.size()) { - return 0; - } - return num_components_per_element[index]; -} - -int NumBits(DataFormat format) { - const std::array num_bits_per_element = { - 0, 8, 16, 16, 32, 32, 32, 32, 32, 32, 32, 64, 64, 96, 128, -1, 16, 16, 16, 16, 32, - 32, 64, -1, -1, -1, -1, -1, -1, -1, -1, -1, 16, 16, 32, 4, 8, 8, 4, 8, 8, 8, - -1, -1, 8, 8, 8, 8, 8, 8, 16, 16, 32, 32, 32, 64, 64, 8, 16, 1, 1}; - - const u32 index = static_cast(format); - if (index >= num_bits_per_element.size()) { - return 0; - } - return num_bits_per_element[index]; -} - -static constexpr std::array component_bits = { - std::array{0, 0, 0, 0}, // 0 FormatInvalid - std::array{8, 0, 0, 0}, // 1 Format8 - std::array{16, 0, 0, 0}, // 2 Format16 - std::array{8, 8, 0, 0}, // 3 Format8_8 - std::array{32, 0, 0, 0}, // 4 Format32 - std::array{16, 16, 0, 0}, // 5 Format16_16 - std::array{11, 11, 10, 0}, // 6 Format10_11_11 - std::array{10, 11, 11, 0}, // 7 Format11_11_10 - std::array{2, 10, 10, 10}, // 8 Format10_10_10_2 - std::array{10, 10, 10, 2}, // 9 Format2_10_10_10 - std::array{8, 8, 8, 8}, // 10 Format8_8_8_8 - std::array{32, 32, 0, 0}, // 11 Format32_32 - std::array{16, 16, 16, 16}, // 12 Format16_16_16_16 - std::array{32, 32, 32, 0}, // 13 Format32_32_32 - std::array{32, 32, 32, 32}, // 14 Format32_32_32_32 - std::array{0, 0, 0, 0}, // 15 - std::array{5, 6, 
5, 0}, // 16 Format5_6_5 - std::array{5, 5, 5, 1}, // 17 Format1_5_5_5 - std::array{1, 5, 5, 5}, // 18 Format5_5_5_1 - std::array{4, 4, 4, 4}, // 19 Format4_4_4_4 - std::array{24, 8, 0, 0}, // 20 Format8_24 - std::array{8, 24, 0, 0}, // 21 Format24_8 - std::array{8, 24, 0, 0}, // 22 FormatX24_8_32 - std::array{0, 0, 0, 0}, // 23 - std::array{0, 0, 0, 0}, // 24 - std::array{0, 0, 0, 0}, // 25 - std::array{0, 0, 0, 0}, // 26 - std::array{0, 0, 0, 0}, // 27 - std::array{0, 0, 0, 0}, // 28 - std::array{0, 0, 0, 0}, // 29 - std::array{0, 0, 0, 0}, // 30 - std::array{0, 0, 0, 0}, // 31 - std::array{0, 0, 0, 0}, // 32 FormatGB_GR - std::array{0, 0, 0, 0}, // 33 FormatBG_RG - std::array{0, 0, 0, 0}, // 34 Format5_9_9_9 - std::array{0, 0, 0, 0}, // 35 FormatBc1 - std::array{0, 0, 0, 0}, // 36 FormatBc2 - std::array{0, 0, 0, 0}, // 37 FormatBc3 - std::array{0, 0, 0, 0}, // 38 FormatBc4 - std::array{0, 0, 0, 0}, // 39 FormatBc5 - std::array{0, 0, 0, 0}, // 40 FormatBc6 - std::array{0, 0, 0, 0}, // 41 FormatBc7 +static constexpr std::array NUM_COMPONENTS = { + 0, // 0 FormatInvalid + 1, // 1 Format8 + 1, // 2 Format16 + 2, // 3 Format8_8 + 1, // 4 Format32 + 2, // 5 Format16_16 + 3, // 6 Format10_11_11 + 3, // 7 Format11_11_10 + 4, // 8 Format10_10_10_2 + 4, // 9 Format2_10_10_10 + 4, // 10 Format8_8_8_8 + 2, // 11 Format32_32 + 4, // 12 Format16_16_16_16 + 3, // 13 Format32_32_32 + 4, // 14 Format32_32_32_32 + 0, // 15 + 3, // 16 Format5_6_5 + 4, // 17 Format1_5_5_5 + 4, // 18 Format5_5_5_1 + 4, // 19 Format4_4_4_4 + 2, // 20 Format8_24 + 2, // 21 Format24_8 + 2, // 22 FormatX24_8_32 + 0, // 23 + 0, // 24 + 0, // 25 + 0, // 26 + 0, // 27 + 0, // 28 + 0, // 29 + 0, // 30 + 0, // 31 + 3, // 32 FormatGB_GR + 3, // 33 FormatBG_RG + 4, // 34 Format5_9_9_9 + 4, // 35 FormatBc1 + 4, // 36 FormatBc2 + 4, // 37 FormatBc3 + 1, // 38 FormatBc4 + 2, // 39 FormatBc5 + 3, // 40 FormatBc6 + 4, // 41 FormatBc7 }; -u32 ComponentBits(DataFormat format, u32 comp) { +u32 NumComponents(DataFormat format) { const u32 index = static_cast(format); - if (index >= component_bits.size() || comp >= 4) { - return 0; - } - return component_bits[index][comp]; + ASSERT_MSG(index < NUM_COMPONENTS.size(), "Invalid data format = {}", format); + return NUM_COMPONENTS[index]; } -static constexpr std::array component_offset = { - std::array{-1, -1, -1, -1}, // 0 FormatInvalid - std::array{0, -1, -1, -1}, // 1 Format8 - std::array{0, -1, -1, -1}, // 2 Format16 - std::array{0, 8, -1, -1}, // 3 Format8_8 - std::array{0, -1, -1, -1}, // 4 Format32 - std::array{0, 16, -1, -1}, // 5 Format16_16 - std::array{0, 11, 22, -1}, // 6 Format10_11_11 - std::array{0, 10, 21, -1}, // 7 Format11_11_10 - std::array{0, 2, 12, 22}, // 8 Format10_10_10_2 - std::array{0, 10, 20, 30}, // 9 Format2_10_10_10 - std::array{0, 8, 16, 24}, // 10 Format8_8_8_8 - std::array{0, 32, -1, -1}, // 11 Format32_32 - std::array{0, 16, 32, 48}, // 12 Format16_16_16_16 - std::array{0, 32, 64, -1}, // 13 Format32_32_32 - std::array{0, 32, 64, 96}, // 14 Format32_32_32_32 - std::array{-1, -1, -1, -1}, // 15 - std::array{0, 5, 11, -1}, // 16 Format5_6_5 - std::array{0, 5, 10, 15}, // 17 Format1_5_5_5 - std::array{0, 1, 6, 11}, // 18 Format5_5_5_1 - std::array{0, 4, 8, 12}, // 19 Format4_4_4_4 - std::array{0, 24, -1, -1}, // 20 Format8_24 - std::array{0, 8, -1, -1}, // 21 Format24_8 - std::array{0, 8, -1, -1}, // 22 FormatX24_8_32 - std::array{-1, -1, -1, -1}, // 23 - std::array{-1, -1, -1, -1}, // 24 - std::array{-1, -1, -1, -1}, // 25 - std::array{-1, -1, -1, -1}, // 26 - 
std::array{-1, -1, -1, -1}, // 27 - std::array{-1, -1, -1, -1}, // 28 - std::array{-1, -1, -1, -1}, // 29 - std::array{-1, -1, -1, -1}, // 30 - std::array{-1, -1, -1, -1}, // 31 - std::array{-1, -1, -1, -1}, // 32 FormatGB_GR - std::array{-1, -1, -1, -1}, // 33 FormatBG_RG - std::array{-1, -1, -1, -1}, // 34 Format5_9_9_9 - std::array{-1, -1, -1, -1}, // 35 FormatBc1 - std::array{-1, -1, -1, -1}, // 36 FormatBc2 - std::array{-1, -1, -1, -1}, // 37 FormatBc3 - std::array{-1, -1, -1, -1}, // 38 FormatBc4 - std::array{-1, -1, -1, -1}, // 39 FormatBc5 - std::array{-1, -1, -1, -1}, // 40 FormatBc6 - std::array{-1, -1, -1, -1}, // 41 FormatBc7 +static constexpr std::array BITS_PER_BLOCK = { + 0, // 0 FormatInvalid + 8, // 1 Format8 + 16, // 2 Format16 + 16, // 3 Format8_8 + 32, // 4 Format32 + 32, // 5 Format16_16 + 32, // 6 Format10_11_11 + 32, // 7 Format11_11_10 + 32, // 8 Format10_10_10_2 + 32, // 9 Format2_10_10_10 + 32, // 10 Format8_8_8_8 + 64, // 11 Format32_32 + 64, // 12 Format16_16_16_16 + 96, // 13 Format32_32_32 + 128, // 14 Format32_32_32_32 + 0, // 15 + 16, // 16 Format5_6_5 + 16, // 17 Format1_5_5_5 + 16, // 18 Format5_5_5_1 + 16, // 19 Format4_4_4_4 + 32, // 20 Format8_24 + 32, // 21 Format24_8 + 64, // 22 FormatX24_8_32 + 0, // 23 + 0, // 24 + 0, // 25 + 0, // 26 + 0, // 27 + 0, // 28 + 0, // 29 + 0, // 30 + 0, // 31 + 16, // 32 FormatGB_GR + 16, // 33 FormatBG_RG + 32, // 34 Format5_9_9_9 + 64, // 35 FormatBc1 + 128, // 36 FormatBc2 + 128, // 37 FormatBc3 + 64, // 38 FormatBc4 + 128, // 39 FormatBc5 + 128, // 40 FormatBc6 + 128, // 41 FormatBc7 }; -s32 ComponentOffset(DataFormat format, u32 comp) { +u32 NumBitsPerBlock(DataFormat format) { const u32 index = static_cast(format); - if (index >= component_offset.size() || comp >= 4) { - return -1; - } - return component_offset[index][comp]; + ASSERT_MSG(index < BITS_PER_BLOCK.size(), "Invalid data format = {}", format); + return BITS_PER_BLOCK[index]; } } // namespace AmdGpu diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index faba8e285..bd0f778f4 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -310,10 +310,8 @@ constexpr bool IsInteger(const NumberFormat nfmt) { std::string_view NameOf(DataFormat fmt); std::string_view NameOf(NumberFormat fmt); -int NumComponents(DataFormat format); -int NumBits(DataFormat format); -u32 ComponentBits(DataFormat format, u32 comp); -s32 ComponentOffset(DataFormat format, u32 comp); +u32 NumComponents(DataFormat format); +u32 NumBitsPerBlock(DataFormat format); } // namespace AmdGpu diff --git a/src/video_core/host_shaders/detilers/micro_128bpp.comp b/src/video_core/host_shaders/detilers/micro_128bpp.comp index a09a0b4c4..a43073a8b 100644 --- a/src/video_core/host_shaders/detilers/micro_128bpp.comp +++ b/src/video_core/host_shaders/detilers/micro_128bpp.comp @@ -16,7 +16,7 @@ layout(push_constant) uniform image_info { uint num_levels; uint pitch; uint height; - uint sizes[14]; + uint sizes[16]; } info; // Inverse morton LUT, small enough to fit into K$ diff --git a/src/video_core/host_shaders/detilers/micro_16bpp.comp b/src/video_core/host_shaders/detilers/micro_16bpp.comp index 909a14acc..5f1240d64 100644 --- a/src/video_core/host_shaders/detilers/micro_16bpp.comp +++ b/src/video_core/host_shaders/detilers/micro_16bpp.comp @@ -18,7 +18,7 @@ layout(push_constant) uniform image_info { uint num_levels; uint pitch; uint height; - uint sizes[14]; + uint sizes[16]; } info; #define MICRO_TILE_DIM 8 diff --git 
a/src/video_core/host_shaders/detilers/micro_32bpp.comp b/src/video_core/host_shaders/detilers/micro_32bpp.comp index cdc8d0018..605523e4d 100644 --- a/src/video_core/host_shaders/detilers/micro_32bpp.comp +++ b/src/video_core/host_shaders/detilers/micro_32bpp.comp @@ -16,7 +16,7 @@ layout(push_constant) uniform image_info { uint num_levels; uint pitch; uint height; - uint sizes[14]; + uint sizes[16]; } info; // Inverse morton LUT, small enough to fit into K$ diff --git a/src/video_core/host_shaders/detilers/micro_64bpp.comp b/src/video_core/host_shaders/detilers/micro_64bpp.comp index c128ba5a1..1bca44067 100644 --- a/src/video_core/host_shaders/detilers/micro_64bpp.comp +++ b/src/video_core/host_shaders/detilers/micro_64bpp.comp @@ -16,7 +16,7 @@ layout(push_constant) uniform image_info { uint num_levels; uint pitch; uint height; - uint sizes[14]; + uint sizes[16]; } info; // Inverse morton LUT, small enough to fit into K$ diff --git a/src/video_core/host_shaders/detilers/micro_8bpp.comp b/src/video_core/host_shaders/detilers/micro_8bpp.comp index ecf706450..1d9b48daa 100644 --- a/src/video_core/host_shaders/detilers/micro_8bpp.comp +++ b/src/video_core/host_shaders/detilers/micro_8bpp.comp @@ -19,7 +19,7 @@ layout(push_constant) uniform image_info { uint num_levels; uint pitch; uint height; - uint sizes[14]; + uint sizes[16]; } info; #define MICRO_TILE_DIM 8 diff --git a/src/video_core/texture_cache/host_compatibility.cpp b/src/video_core/texture_cache/host_compatibility.cpp new file mode 100644 index 000000000..327709e64 --- /dev/null +++ b/src/video_core/texture_cache/host_compatibility.cpp @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright © 2023 Skyline Team and Contributors (https://github.com/skyline-emu/) +// Copyright © 2015-2023 The Khronos Group Inc. +// Copyright © 2015-2023 Valve Corporation +// Copyright © 2015-2023 LunarG, Inc. 
+ +#include +#include "common/enum.h" +#include "video_core/texture_cache/host_compatibility.h" + +namespace VideoCore { + +/** + * @brief All classes of format compatibility according to the Vulkan specification + * @url + * https://github.com/KhronosGroup/Vulkan-ValidationLayers/blob/d37c676f/layers/generated/vk_format_utils.h#L47-L131 + */ +enum class CompatibilityClass { + NONE = 0, + _128BIT = 1 << 0, + _16BIT = 1 << 1, + _192BIT = 1 << 2, + _24BIT = 1 << 3, + _256BIT = 1 << 4, + _32BIT = 1 << 5, + _48BIT = 1 << 6, + _64BIT = 1 << 7, + _8BIT = 1 << 8, + _96BIT = 1 << 9, + BC1_RGB = 1 << 10, + BC1_RGBA = 1 << 11, + BC2 = 1 << 12, + BC3 = 1 << 13, + BC4 = 1 << 14, + BC5 = 1 << 15, + BC6H = 1 << 16, + BC7 = 1 << 17, + D16 = 1 << 18, + D16S8 = 1 << 19, + D24 = 1 << 20, + D24S8 = 1 << 21, + D32 = 1 << 22, + D32S8 = 1 << 23, + S8 = 1 << 24, +}; +DECLARE_ENUM_FLAG_OPERATORS(CompatibilityClass) + +/** + * @brief The format compatibility class according to the Vulkan specification + * @url + * https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#formats-compatibility-classes + * @url + * https://github.com/KhronosGroup/Vulkan-ValidationLayers/blob/d37c676f/layers/generated/vk_format_utils.cpp#L70-L812 + */ +static const std::unordered_map FORMAT_TABLE = { + {vk::Format::eA1R5G5B5UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eA2B10G10R10SintPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2B10G10R10SnormPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2B10G10R10SscaledPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2B10G10R10UintPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2B10G10R10UnormPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2B10G10R10UscaledPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2R10G10B10SintPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2R10G10B10SnormPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2R10G10B10SscaledPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2R10G10B10UintPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2R10G10B10UnormPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA2R10G10B10UscaledPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA4B4G4R4UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eA4R4G4B4UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eA8B8G8R8SintPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA8B8G8R8SnormPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA8B8G8R8SrgbPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA8B8G8R8SscaledPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA8B8G8R8UintPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA8B8G8R8UnormPack32, CompatibilityClass::_32BIT}, + {vk::Format::eA8B8G8R8UscaledPack32, CompatibilityClass::_32BIT}, + {vk::Format::eB10G11R11UfloatPack32, CompatibilityClass::_32BIT}, + {vk::Format::eB4G4R4A4UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eB5G5R5A1UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eB5G6R5UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eB8G8R8A8Sint, CompatibilityClass::_32BIT}, + {vk::Format::eB8G8R8A8Snorm, CompatibilityClass::_32BIT}, + {vk::Format::eB8G8R8A8Srgb, CompatibilityClass::_32BIT}, + {vk::Format::eB8G8R8A8Sscaled, CompatibilityClass::_32BIT}, + {vk::Format::eB8G8R8A8Uint, CompatibilityClass::_32BIT}, + {vk::Format::eB8G8R8A8Unorm, CompatibilityClass::_32BIT}, + {vk::Format::eB8G8R8A8Uscaled, CompatibilityClass::_32BIT}, + {vk::Format::eB8G8R8Sint, CompatibilityClass::_24BIT}, + 
{vk::Format::eB8G8R8Snorm, CompatibilityClass::_24BIT}, + {vk::Format::eB8G8R8Srgb, CompatibilityClass::_24BIT}, + {vk::Format::eB8G8R8Sscaled, CompatibilityClass::_24BIT}, + {vk::Format::eB8G8R8Uint, CompatibilityClass::_24BIT}, + {vk::Format::eB8G8R8Unorm, CompatibilityClass::_24BIT}, + {vk::Format::eB8G8R8Uscaled, CompatibilityClass::_24BIT}, + {vk::Format::eBc1RgbaSrgbBlock, CompatibilityClass::BC1_RGBA | CompatibilityClass::_64BIT}, + {vk::Format::eBc1RgbaUnormBlock, CompatibilityClass::BC1_RGBA | CompatibilityClass::_64BIT}, + {vk::Format::eBc1RgbSrgbBlock, CompatibilityClass::BC1_RGB | CompatibilityClass::_64BIT}, + {vk::Format::eBc1RgbUnormBlock, CompatibilityClass::BC1_RGB | CompatibilityClass::_64BIT}, + {vk::Format::eBc2SrgbBlock, CompatibilityClass::BC2 | CompatibilityClass::_128BIT}, + {vk::Format::eBc2UnormBlock, CompatibilityClass::BC2 | CompatibilityClass::_128BIT}, + {vk::Format::eBc3SrgbBlock, CompatibilityClass::BC3 | CompatibilityClass::_128BIT}, + {vk::Format::eBc3UnormBlock, CompatibilityClass::BC3 | CompatibilityClass::_128BIT}, + {vk::Format::eBc4SnormBlock, CompatibilityClass::BC4 | CompatibilityClass::_64BIT}, + {vk::Format::eBc4UnormBlock, CompatibilityClass::BC4 | CompatibilityClass::_64BIT}, + {vk::Format::eBc5SnormBlock, CompatibilityClass::BC5 | CompatibilityClass::_128BIT}, + {vk::Format::eBc5UnormBlock, CompatibilityClass::BC5 | CompatibilityClass::_128BIT}, + {vk::Format::eBc6HSfloatBlock, CompatibilityClass::BC6H | CompatibilityClass::_128BIT}, + {vk::Format::eBc6HUfloatBlock, CompatibilityClass::BC6H | CompatibilityClass::_128BIT}, + {vk::Format::eBc7SrgbBlock, CompatibilityClass::BC7 | CompatibilityClass::_128BIT}, + {vk::Format::eBc7UnormBlock, CompatibilityClass::BC7 | CompatibilityClass::_128BIT}, + {vk::Format::eD16Unorm, CompatibilityClass::D16}, + {vk::Format::eD16UnormS8Uint, CompatibilityClass::D16S8}, + {vk::Format::eD24UnormS8Uint, CompatibilityClass::D24S8}, + {vk::Format::eD32Sfloat, CompatibilityClass::D32}, + {vk::Format::eD32SfloatS8Uint, CompatibilityClass::D32S8}, + {vk::Format::eE5B9G9R9UfloatPack32, CompatibilityClass::_32BIT}, + {vk::Format::eR10X6G10X6Unorm2Pack16, CompatibilityClass::_32BIT}, + {vk::Format::eR10X6UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eR12X4G12X4Unorm2Pack16, CompatibilityClass::_32BIT}, + {vk::Format::eR12X4UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eR16G16B16A16Sfloat, CompatibilityClass::_64BIT}, + {vk::Format::eR16G16B16A16Sint, CompatibilityClass::_64BIT}, + {vk::Format::eR16G16B16A16Snorm, CompatibilityClass::_64BIT}, + {vk::Format::eR16G16B16A16Sscaled, CompatibilityClass::_64BIT}, + {vk::Format::eR16G16B16A16Uint, CompatibilityClass::_64BIT}, + {vk::Format::eR16G16B16A16Unorm, CompatibilityClass::_64BIT}, + {vk::Format::eR16G16B16A16Uscaled, CompatibilityClass::_64BIT}, + {vk::Format::eR16G16B16Sfloat, CompatibilityClass::_48BIT}, + {vk::Format::eR16G16B16Sint, CompatibilityClass::_48BIT}, + {vk::Format::eR16G16B16Snorm, CompatibilityClass::_48BIT}, + {vk::Format::eR16G16B16Sscaled, CompatibilityClass::_48BIT}, + {vk::Format::eR16G16B16Uint, CompatibilityClass::_48BIT}, + {vk::Format::eR16G16B16Unorm, CompatibilityClass::_48BIT}, + {vk::Format::eR16G16B16Uscaled, CompatibilityClass::_48BIT}, + {vk::Format::eR16G16Sfloat, CompatibilityClass::_32BIT}, + {vk::Format::eR16G16Sint, CompatibilityClass::_32BIT}, + {vk::Format::eR16G16Snorm, CompatibilityClass::_32BIT}, + {vk::Format::eR16G16Sscaled, CompatibilityClass::_32BIT}, + {vk::Format::eR16G16Uint, 
CompatibilityClass::_32BIT}, + {vk::Format::eR16G16Unorm, CompatibilityClass::_32BIT}, + {vk::Format::eR16G16Uscaled, CompatibilityClass::_32BIT}, + {vk::Format::eR16Sfloat, CompatibilityClass::_16BIT}, + {vk::Format::eR16Sint, CompatibilityClass::_16BIT}, + {vk::Format::eR16Snorm, CompatibilityClass::_16BIT}, + {vk::Format::eR16Sscaled, CompatibilityClass::_16BIT}, + {vk::Format::eR16Uint, CompatibilityClass::_16BIT}, + {vk::Format::eR16Unorm, CompatibilityClass::_16BIT}, + {vk::Format::eR16Uscaled, CompatibilityClass::_16BIT}, + {vk::Format::eR32G32B32A32Sfloat, CompatibilityClass::_128BIT}, + {vk::Format::eR32G32B32A32Sint, CompatibilityClass::_128BIT}, + {vk::Format::eR32G32B32A32Uint, CompatibilityClass::_128BIT}, + {vk::Format::eR32G32B32Sfloat, CompatibilityClass::_96BIT}, + {vk::Format::eR32G32B32Sint, CompatibilityClass::_96BIT}, + {vk::Format::eR32G32B32Uint, CompatibilityClass::_96BIT}, + {vk::Format::eR32G32Sfloat, CompatibilityClass::_64BIT}, + {vk::Format::eR32G32Sint, CompatibilityClass::_64BIT}, + {vk::Format::eR32G32Uint, CompatibilityClass::_64BIT}, + {vk::Format::eR32Sfloat, CompatibilityClass::_32BIT}, + {vk::Format::eR32Sint, CompatibilityClass::_32BIT}, + {vk::Format::eR32Uint, CompatibilityClass::_32BIT}, + {vk::Format::eR4G4B4A4UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eR4G4UnormPack8, CompatibilityClass::_8BIT}, + {vk::Format::eR5G5B5A1UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eR5G6B5UnormPack16, CompatibilityClass::_16BIT}, + {vk::Format::eR64G64B64A64Sfloat, CompatibilityClass::_256BIT}, + {vk::Format::eR64G64B64A64Sint, CompatibilityClass::_256BIT}, + {vk::Format::eR64G64B64A64Uint, CompatibilityClass::_256BIT}, + {vk::Format::eR64G64B64Sfloat, CompatibilityClass::_192BIT}, + {vk::Format::eR64G64B64Sint, CompatibilityClass::_192BIT}, + {vk::Format::eR64G64B64Uint, CompatibilityClass::_192BIT}, + {vk::Format::eR64G64Sfloat, CompatibilityClass::_128BIT}, + {vk::Format::eR64G64Sint, CompatibilityClass::_128BIT}, + {vk::Format::eR64G64Uint, CompatibilityClass::_128BIT}, + {vk::Format::eR64Sfloat, CompatibilityClass::_64BIT}, + {vk::Format::eR64Sint, CompatibilityClass::_64BIT}, + {vk::Format::eR64Uint, CompatibilityClass::_64BIT}, + {vk::Format::eR8G8B8A8Sint, CompatibilityClass::_32BIT}, + {vk::Format::eR8G8B8A8Snorm, CompatibilityClass::_32BIT}, + {vk::Format::eR8G8B8A8Srgb, CompatibilityClass::_32BIT}, + {vk::Format::eR8G8B8A8Sscaled, CompatibilityClass::_32BIT}, + {vk::Format::eR8G8B8A8Uint, CompatibilityClass::_32BIT}, + {vk::Format::eR8G8B8A8Unorm, CompatibilityClass::_32BIT}, + {vk::Format::eR8G8B8A8Uscaled, CompatibilityClass::_32BIT}, + {vk::Format::eR8G8B8Sint, CompatibilityClass::_24BIT}, + {vk::Format::eR8G8B8Snorm, CompatibilityClass::_24BIT}, + {vk::Format::eR8G8B8Srgb, CompatibilityClass::_24BIT}, + {vk::Format::eR8G8B8Sscaled, CompatibilityClass::_24BIT}, + {vk::Format::eR8G8B8Uint, CompatibilityClass::_24BIT}, + {vk::Format::eR8G8B8Unorm, CompatibilityClass::_24BIT}, + {vk::Format::eR8G8B8Uscaled, CompatibilityClass::_24BIT}, + {vk::Format::eR8G8Sint, CompatibilityClass::_16BIT}, + {vk::Format::eR8G8Snorm, CompatibilityClass::_16BIT}, + {vk::Format::eR8G8Srgb, CompatibilityClass::_16BIT}, + {vk::Format::eR8G8Sscaled, CompatibilityClass::_16BIT}, + {vk::Format::eR8G8Uint, CompatibilityClass::_16BIT}, + {vk::Format::eR8G8Unorm, CompatibilityClass::_16BIT}, + {vk::Format::eR8G8Uscaled, CompatibilityClass::_16BIT}, + {vk::Format::eR8Sint, CompatibilityClass::_8BIT}, + {vk::Format::eR8Snorm, CompatibilityClass::_8BIT}, + 
{vk::Format::eR8Srgb, CompatibilityClass::_8BIT}, + {vk::Format::eR8Sscaled, CompatibilityClass::_8BIT}, + {vk::Format::eR8Uint, CompatibilityClass::_8BIT}, + {vk::Format::eR8Unorm, CompatibilityClass::_8BIT}, + {vk::Format::eR8Uscaled, CompatibilityClass::_8BIT}, + {vk::Format::eS8Uint, CompatibilityClass::S8}, + {vk::Format::eX8D24UnormPack32, CompatibilityClass::D24}, + {vk::Format::eUndefined, CompatibilityClass::NONE}, +}; + +bool IsVulkanFormatCompatible(vk::Format base, vk::Format view) { + if (base == view) { + return true; + } + const auto base_comp = FORMAT_TABLE.at(base); + const auto view_comp = FORMAT_TABLE.at(view); + return (base_comp & view_comp) == view_comp; +} + +} // namespace VideoCore diff --git a/src/video_core/texture_cache/host_compatibility.h b/src/video_core/texture_cache/host_compatibility.h index a73f7e6be..b0579137b 100644 --- a/src/video_core/texture_cache/host_compatibility.h +++ b/src/video_core/texture_cache/host_compatibility.h @@ -6,387 +6,11 @@ #pragma once -#include #include "video_core/renderer_vulkan/vk_common.h" namespace VideoCore { -/** - * @brief All classes of format compatibility according to the Vulkan specification - * @url - * https://github.com/KhronosGroup/Vulkan-ValidationLayers/blob/d37c676f75f545a3e5a98d7dfb89864391a1db1e/layers/generated/vk_format_utils.h#L47-L131 - * @note This is copied directly from Vulkan Validation Layers and doesn't follow the Skyline naming - * conventions - */ -enum class FORMAT_COMPATIBILITY_CLASS { - NONE = 0, - _10BIT_2PLANE_420, - _10BIT_2PLANE_422, - _10BIT_2PLANE_444, - _10BIT_3PLANE_420, - _10BIT_3PLANE_422, - _10BIT_3PLANE_444, - _12BIT_2PLANE_420, - _12BIT_2PLANE_422, - _12BIT_2PLANE_444, - _12BIT_3PLANE_420, - _12BIT_3PLANE_422, - _12BIT_3PLANE_444, - _128BIT, - _16BIT, - _16BIT_2PLANE_420, - _16BIT_2PLANE_422, - _16BIT_2PLANE_444, - _16BIT_3PLANE_420, - _16BIT_3PLANE_422, - _16BIT_3PLANE_444, - _192BIT, - _24BIT, - _256BIT, - _32BIT, - _32BIT_B8G8R8G8, - _32BIT_G8B8G8R8, - _48BIT, - _64BIT, - _64BIT_B10G10R10G10, - _64BIT_B12G12R12G12, - _64BIT_B16G16R16G16, - _64BIT_G10B10G10R10, - _64BIT_G12B12G12R12, - _64BIT_G16B16G16R16, - _64BIT_R10G10B10A10, - _64BIT_R12G12B12A12, - _8BIT, - _8BIT_2PLANE_420, - _8BIT_2PLANE_422, - _8BIT_2PLANE_444, - _8BIT_3PLANE_420, - _8BIT_3PLANE_422, - _8BIT_3PLANE_444, - _96BIT, - ASTC_10X10, - ASTC_10X5, - ASTC_10X6, - ASTC_10X8, - ASTC_12X10, - ASTC_12X12, - ASTC_4X4, - ASTC_5X4, - ASTC_5X5, - ASTC_6X5, - ASTC_6X6, - ASTC_8X5, - ASTC_8X6, - ASTC_8X8, - BC1_RGB, - BC1_RGBA, - BC2, - BC3, - BC4, - BC5, - BC6H, - BC7, - D16, - D16S8, - D24, - D24S8, - D32, - D32S8, - EAC_R, - EAC_RG, - ETC2_EAC_RGBA, - ETC2_RGB, - ETC2_RGBA, - PVRTC1_2BPP, - PVRTC1_4BPP, - PVRTC2_2BPP, - PVRTC2_4BPP, - S8 -}; -/** - * @brief The format compatibility class according to the Vulkan specification - * @url - * https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#formats-compatibility-classes - * @url - * https://github.com/KhronosGroup/Vulkan-ValidationLayers/blob/d37c676f75f545a3e5a98d7dfb89864391a1db1e/layers/generated/vk_format_utils.cpp#L70-L812 - * @note This is copied directly from Vulkan Validation Layers and doesn't follow the Skyline naming - * conventions - */ -static const std::unordered_map vkFormatClassTable{ - {VK_FORMAT_A1R5G5B5_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_A2B10G10R10_SINT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2B10G10R10_SNORM_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - 
{VK_FORMAT_A2B10G10R10_SSCALED_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2B10G10R10_UINT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2B10G10R10_UNORM_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2B10G10R10_USCALED_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2R10G10B10_SINT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2R10G10B10_SNORM_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2R10G10B10_SSCALED_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2R10G10B10_UINT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2R10G10B10_UNORM_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A2R10G10B10_USCALED_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_A8B8G8R8_SINT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A8B8G8R8_SNORM_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A8B8G8R8_SRGB_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A8B8G8R8_SSCALED_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A8B8G8R8_UINT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A8B8G8R8_UNORM_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_A8B8G8R8_USCALED_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_ASTC_10x10_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_10X10}, - {VK_FORMAT_ASTC_10x10_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X10}, - {VK_FORMAT_ASTC_10x10_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X10}, - {VK_FORMAT_ASTC_10x5_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_10X5}, - {VK_FORMAT_ASTC_10x5_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X5}, - {VK_FORMAT_ASTC_10x5_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X5}, - {VK_FORMAT_ASTC_10x6_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_10X6}, - {VK_FORMAT_ASTC_10x6_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X6}, - {VK_FORMAT_ASTC_10x6_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X6}, - {VK_FORMAT_ASTC_10x8_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_10X8}, - {VK_FORMAT_ASTC_10x8_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X8}, - {VK_FORMAT_ASTC_10x8_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_10X8}, - {VK_FORMAT_ASTC_12x10_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_12X10}, - {VK_FORMAT_ASTC_12x10_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_12X10}, - {VK_FORMAT_ASTC_12x10_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_12X10}, - {VK_FORMAT_ASTC_12x12_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_12X12}, - {VK_FORMAT_ASTC_12x12_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_12X12}, - {VK_FORMAT_ASTC_12x12_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_12X12}, - {VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_4X4}, - {VK_FORMAT_ASTC_4x4_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_4X4}, - {VK_FORMAT_ASTC_4x4_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_4X4}, - {VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_5X4}, - {VK_FORMAT_ASTC_5x4_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_5X4}, - {VK_FORMAT_ASTC_5x4_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_5X4}, - {VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_5X5}, - {VK_FORMAT_ASTC_5x5_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_5X5}, - {VK_FORMAT_ASTC_5x5_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_5X5}, - 
{VK_FORMAT_ASTC_6x5_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_6X5}, - {VK_FORMAT_ASTC_6x5_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_6X5}, - {VK_FORMAT_ASTC_6x5_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_6X5}, - {VK_FORMAT_ASTC_6x6_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_6X6}, - {VK_FORMAT_ASTC_6x6_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_6X6}, - {VK_FORMAT_ASTC_6x6_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_6X6}, - {VK_FORMAT_ASTC_8x5_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_8X5}, - {VK_FORMAT_ASTC_8x5_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_8X5}, - {VK_FORMAT_ASTC_8x5_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_8X5}, - {VK_FORMAT_ASTC_8x6_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_8X6}, - {VK_FORMAT_ASTC_8x6_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_8X6}, - {VK_FORMAT_ASTC_8x6_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_8X6}, - {VK_FORMAT_ASTC_8x8_SFLOAT_BLOCK_EXT, FORMAT_COMPATIBILITY_CLASS::ASTC_8X8}, - {VK_FORMAT_ASTC_8x8_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_8X8}, - {VK_FORMAT_ASTC_8x8_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ASTC_8X8}, - {VK_FORMAT_B10G11R11_UFLOAT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16, - FORMAT_COMPATIBILITY_CLASS::_64BIT_B10G10R10G10}, - {VK_FORMAT_B12X4G12X4R12X4G12X4_422_UNORM_4PACK16, - FORMAT_COMPATIBILITY_CLASS::_64BIT_B12G12R12G12}, - {VK_FORMAT_B16G16R16G16_422_UNORM, FORMAT_COMPATIBILITY_CLASS::_64BIT_B16G16R16G16}, - {VK_FORMAT_B4G4R4A4_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_B5G5R5A1_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_B5G6R5_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_B8G8R8A8_SINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B8G8R8A8_SNORM, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B8G8R8A8_SRGB, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B8G8R8A8_SSCALED, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B8G8R8A8_UINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B8G8R8A8_UNORM, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B8G8R8A8_USCALED, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_B8G8R8G8_422_UNORM, FORMAT_COMPATIBILITY_CLASS::_32BIT_B8G8R8G8}, - {VK_FORMAT_B8G8R8_SINT, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_B8G8R8_SNORM, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_B8G8R8_SRGB, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_B8G8R8_SSCALED, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_B8G8R8_UINT, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_B8G8R8_UNORM, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_B8G8R8_USCALED, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_BC1_RGBA_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC1_RGBA}, - {VK_FORMAT_BC1_RGBA_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC1_RGBA}, - {VK_FORMAT_BC1_RGB_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC1_RGB}, - {VK_FORMAT_BC1_RGB_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC1_RGB}, - {VK_FORMAT_BC2_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC2}, - {VK_FORMAT_BC2_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC2}, - {VK_FORMAT_BC3_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC3}, - {VK_FORMAT_BC3_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC3}, - {VK_FORMAT_BC4_SNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC4}, - {VK_FORMAT_BC4_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC4}, - {VK_FORMAT_BC5_SNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC5}, - {VK_FORMAT_BC5_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC5}, - {VK_FORMAT_BC6H_SFLOAT_BLOCK, 
FORMAT_COMPATIBILITY_CLASS::BC6H}, - {VK_FORMAT_BC6H_UFLOAT_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC6H}, - {VK_FORMAT_BC7_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC7}, - {VK_FORMAT_BC7_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::BC7}, - {VK_FORMAT_D16_UNORM, FORMAT_COMPATIBILITY_CLASS::D16}, - {VK_FORMAT_D16_UNORM_S8_UINT, FORMAT_COMPATIBILITY_CLASS::D16S8}, - {VK_FORMAT_D24_UNORM_S8_UINT, FORMAT_COMPATIBILITY_CLASS::D24S8}, - {VK_FORMAT_D32_SFLOAT, FORMAT_COMPATIBILITY_CLASS::D32}, - {VK_FORMAT_D32_SFLOAT_S8_UINT, FORMAT_COMPATIBILITY_CLASS::D32S8}, - {VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_EAC_R11G11_SNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::EAC_RG}, - {VK_FORMAT_EAC_R11G11_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::EAC_RG}, - {VK_FORMAT_EAC_R11_SNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::EAC_R}, - {VK_FORMAT_EAC_R11_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::EAC_R}, - {VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ETC2_RGBA}, - {VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ETC2_RGBA}, - {VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ETC2_EAC_RGBA}, - {VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ETC2_EAC_RGBA}, - {VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK, FORMAT_COMPATIBILITY_CLASS::ETC2_RGB}, - {VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, FORMAT_COMPATIBILITY_CLASS::ETC2_RGB}, - {VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16, - FORMAT_COMPATIBILITY_CLASS::_64BIT_G10B10G10R10}, - {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_10BIT_2PLANE_420}, - {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_10BIT_2PLANE_422}, - {VK_FORMAT_G10X6_B10X6R10X6_2PLANE_444_UNORM_3PACK16_EXT, - FORMAT_COMPATIBILITY_CLASS::_10BIT_2PLANE_444}, - {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_10BIT_3PLANE_420}, - {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_10BIT_3PLANE_422}, - {VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_10BIT_3PLANE_444}, - {VK_FORMAT_G12X4B12X4G12X4R12X4_422_UNORM_4PACK16, - FORMAT_COMPATIBILITY_CLASS::_64BIT_G12B12G12R12}, - {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_420_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_12BIT_2PLANE_420}, - {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_422_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_12BIT_2PLANE_422}, - {VK_FORMAT_G12X4_B12X4R12X4_2PLANE_444_UNORM_3PACK16_EXT, - FORMAT_COMPATIBILITY_CLASS::_12BIT_2PLANE_444}, - {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_420_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_12BIT_3PLANE_420}, - {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_422_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_12BIT_3PLANE_422}, - {VK_FORMAT_G12X4_B12X4_R12X4_3PLANE_444_UNORM_3PACK16, - FORMAT_COMPATIBILITY_CLASS::_12BIT_3PLANE_444}, - {VK_FORMAT_G16B16G16R16_422_UNORM, FORMAT_COMPATIBILITY_CLASS::_64BIT_G16B16G16R16}, - {VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT_2PLANE_420}, - {VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT_2PLANE_422}, - {VK_FORMAT_G16_B16R16_2PLANE_444_UNORM_EXT, FORMAT_COMPATIBILITY_CLASS::_16BIT_2PLANE_444}, - {VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT_3PLANE_420}, - {VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT_3PLANE_422}, - {VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT_3PLANE_444}, - {VK_FORMAT_G8B8G8R8_422_UNORM, 
FORMAT_COMPATIBILITY_CLASS::_32BIT_G8B8G8R8}, - {VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, FORMAT_COMPATIBILITY_CLASS::_8BIT_2PLANE_420}, - {VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, FORMAT_COMPATIBILITY_CLASS::_8BIT_2PLANE_422}, - {VK_FORMAT_G8_B8R8_2PLANE_444_UNORM_EXT, FORMAT_COMPATIBILITY_CLASS::_8BIT_2PLANE_444}, - {VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, FORMAT_COMPATIBILITY_CLASS::_8BIT_3PLANE_420}, - {VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, FORMAT_COMPATIBILITY_CLASS::_8BIT_3PLANE_422}, - {VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, FORMAT_COMPATIBILITY_CLASS::_8BIT_3PLANE_444}, - {VK_FORMAT_PVRTC1_2BPP_SRGB_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC1_2BPP}, - {VK_FORMAT_PVRTC1_2BPP_UNORM_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC1_2BPP}, - {VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC1_4BPP}, - {VK_FORMAT_PVRTC1_4BPP_UNORM_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC1_4BPP}, - {VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC2_2BPP}, - {VK_FORMAT_PVRTC2_2BPP_UNORM_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC2_2BPP}, - {VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC2_4BPP}, - {VK_FORMAT_PVRTC2_4BPP_UNORM_BLOCK_IMG, FORMAT_COMPATIBILITY_CLASS::PVRTC2_4BPP}, - {VK_FORMAT_R10X6G10X6B10X6A10X6_UNORM_4PACK16, FORMAT_COMPATIBILITY_CLASS::_64BIT_R10G10B10A10}, - {VK_FORMAT_R10X6G10X6_UNORM_2PACK16, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R10X6_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R12X4G12X4B12X4A12X4_UNORM_4PACK16, FORMAT_COMPATIBILITY_CLASS::_64BIT_R12G12B12A12}, - {VK_FORMAT_R12X4G12X4_UNORM_2PACK16, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R12X4_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R16G16B16A16_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R16G16B16A16_SINT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R16G16B16A16_SNORM, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R16G16B16A16_SSCALED, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R16G16B16A16_UINT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R16G16B16A16_UNORM, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R16G16B16A16_USCALED, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R16G16B16_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_48BIT}, - {VK_FORMAT_R16G16B16_SINT, FORMAT_COMPATIBILITY_CLASS::_48BIT}, - {VK_FORMAT_R16G16B16_SNORM, FORMAT_COMPATIBILITY_CLASS::_48BIT}, - {VK_FORMAT_R16G16B16_SSCALED, FORMAT_COMPATIBILITY_CLASS::_48BIT}, - {VK_FORMAT_R16G16B16_UINT, FORMAT_COMPATIBILITY_CLASS::_48BIT}, - {VK_FORMAT_R16G16B16_UNORM, FORMAT_COMPATIBILITY_CLASS::_48BIT}, - {VK_FORMAT_R16G16B16_USCALED, FORMAT_COMPATIBILITY_CLASS::_48BIT}, - {VK_FORMAT_R16G16_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R16G16_SINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R16G16_SNORM, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R16G16_SSCALED, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R16G16_UINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R16G16_UNORM, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R16G16_USCALED, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R16_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R16_SINT, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R16_SNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R16_SSCALED, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R16_UINT, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R16_UNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R16_USCALED, 
FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R32G32B32A32_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_128BIT}, - {VK_FORMAT_R32G32B32A32_SINT, FORMAT_COMPATIBILITY_CLASS::_128BIT}, - {VK_FORMAT_R32G32B32A32_UINT, FORMAT_COMPATIBILITY_CLASS::_128BIT}, - {VK_FORMAT_R32G32B32_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_96BIT}, - {VK_FORMAT_R32G32B32_SINT, FORMAT_COMPATIBILITY_CLASS::_96BIT}, - {VK_FORMAT_R32G32B32_UINT, FORMAT_COMPATIBILITY_CLASS::_96BIT}, - {VK_FORMAT_R32G32_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R32G32_SINT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R32G32_UINT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R32_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R32_SINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R32_UINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R4G4B4A4_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R4G4_UNORM_PACK8, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - {VK_FORMAT_R5G5B5A1_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R5G6B5_UNORM_PACK16, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R64G64B64A64_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_256BIT}, - {VK_FORMAT_R64G64B64A64_SINT, FORMAT_COMPATIBILITY_CLASS::_256BIT}, - {VK_FORMAT_R64G64B64A64_UINT, FORMAT_COMPATIBILITY_CLASS::_256BIT}, - {VK_FORMAT_R64G64B64_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_192BIT}, - {VK_FORMAT_R64G64B64_SINT, FORMAT_COMPATIBILITY_CLASS::_192BIT}, - {VK_FORMAT_R64G64B64_UINT, FORMAT_COMPATIBILITY_CLASS::_192BIT}, - {VK_FORMAT_R64G64_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_128BIT}, - {VK_FORMAT_R64G64_SINT, FORMAT_COMPATIBILITY_CLASS::_128BIT}, - {VK_FORMAT_R64G64_UINT, FORMAT_COMPATIBILITY_CLASS::_128BIT}, - {VK_FORMAT_R64_SFLOAT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R64_SINT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R64_UINT, FORMAT_COMPATIBILITY_CLASS::_64BIT}, - {VK_FORMAT_R8G8B8A8_SINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R8G8B8A8_SNORM, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R8G8B8A8_SRGB, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R8G8B8A8_SSCALED, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R8G8B8A8_UINT, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R8G8B8A8_UNORM, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R8G8B8A8_USCALED, FORMAT_COMPATIBILITY_CLASS::_32BIT}, - {VK_FORMAT_R8G8B8_SINT, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_R8G8B8_SNORM, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_R8G8B8_SRGB, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_R8G8B8_SSCALED, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_R8G8B8_UINT, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_R8G8B8_UNORM, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_R8G8B8_USCALED, FORMAT_COMPATIBILITY_CLASS::_24BIT}, - {VK_FORMAT_R8G8_SINT, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R8G8_SNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R8G8_SRGB, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R8G8_SSCALED, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R8G8_UINT, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R8G8_UNORM, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R8G8_USCALED, FORMAT_COMPATIBILITY_CLASS::_16BIT}, - {VK_FORMAT_R8_SINT, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - {VK_FORMAT_R8_SNORM, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - {VK_FORMAT_R8_SRGB, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - {VK_FORMAT_R8_SSCALED, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - {VK_FORMAT_R8_UINT, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - 
{VK_FORMAT_R8_UNORM, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - {VK_FORMAT_R8_USCALED, FORMAT_COMPATIBILITY_CLASS::_8BIT}, - {VK_FORMAT_S8_UINT, FORMAT_COMPATIBILITY_CLASS::S8}, - {VK_FORMAT_X8_D24_UNORM_PACK32, FORMAT_COMPATIBILITY_CLASS::D24}, - {VK_FORMAT_UNDEFINED, FORMAT_COMPATIBILITY_CLASS::NONE}, -}; +/// Returns true if the two formats are compatible according to Vulkan's format compatibility rules +bool IsVulkanFormatCompatible(vk::Format base, vk::Format view); -/** - * @return If the two formats are compatible according to Vulkan's format compatibility rules - * @url - * https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#formats-compatibility - */ -static bool IsVulkanFormatCompatible(vk::Format lhs, vk::Format rhs) { - if (lhs == rhs) { - return true; - } - return vkFormatClassTable.at(VkFormat(lhs)) == vkFormatClassTable.at(VkFormat(rhs)); -} } // namespace VideoCore diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 522e6fd5b..8522f1307 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -14,62 +14,6 @@ namespace VideoCore { using namespace Vulkan; -bool ImageInfo::IsBlockCoded() const { - switch (pixel_format) { - case vk::Format::eBc1RgbaSrgbBlock: - case vk::Format::eBc1RgbaUnormBlock: - case vk::Format::eBc1RgbSrgbBlock: - case vk::Format::eBc1RgbUnormBlock: - case vk::Format::eBc2SrgbBlock: - case vk::Format::eBc2UnormBlock: - case vk::Format::eBc3SrgbBlock: - case vk::Format::eBc3UnormBlock: - case vk::Format::eBc4SnormBlock: - case vk::Format::eBc4UnormBlock: - case vk::Format::eBc5SnormBlock: - case vk::Format::eBc5UnormBlock: - case vk::Format::eBc6HSfloatBlock: - case vk::Format::eBc6HUfloatBlock: - case vk::Format::eBc7SrgbBlock: - case vk::Format::eBc7UnormBlock: - return true; - default: - return false; - } -} - -bool ImageInfo::IsPacked() const { - switch (pixel_format) { - case vk::Format::eB5G5R5A1UnormPack16: - [[fallthrough]]; - case vk::Format::eB5G6R5UnormPack16: - return true; - default: - return false; - } -} - -bool ImageInfo::IsDepthStencil() const { - switch (pixel_format) { - case vk::Format::eD16Unorm: - case vk::Format::eD16UnormS8Uint: - case vk::Format::eD32Sfloat: - case vk::Format::eD32SfloatS8Uint: - return true; - default: - return false; - } -} - -bool ImageInfo::HasStencil() const { - if (pixel_format == vk::Format::eD32SfloatS8Uint || - pixel_format == vk::Format::eD24UnormS8Uint || - pixel_format == vk::Format::eD16UnormS8Uint) { - return true; - } - return false; -} - static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst | @@ -161,6 +105,9 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, if (info.props.is_volume) { flags |= vk::ImageCreateFlagBits::e2DArrayCompatible; } + if (info.props.is_block) { + flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible; + } usage_flags = ImageUsageFlags(info); format_features = FormatFeatureFlags(usage_flags); @@ -372,9 +319,9 @@ void Image::CopyImage(const Image& image) { boost::container::small_vector image_copy{}; for (u32 m = 0; m < image.info.resources.levels; ++m) { - const auto mip_w = std::max(info.size.width >> m, 1u); - const auto mip_h = std::max(info.size.height >> m, 1u); - const auto mip_d = std::max(info.size.depth >> m, 1u); + const auto mip_w = std::max(image.info.size.width >> m, 1u); + const auto mip_h = 
std::max(image.info.size.height >> m, 1u); + const auto mip_d = std::max(image.info.size.depth >> m, 1u); image_copy.emplace_back(vk::ImageCopy{ .srcSubresource{ diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 39322f449..769c4211f 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -81,7 +81,7 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, tiling_mode = buffer.GetTilingMode(); pixel_format = LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()); num_samples = buffer.NumSamples(); - num_bits = NumBits(buffer.GetDataFmt()); + num_bits = NumBitsPerBlock(buffer.GetDataFmt()); type = vk::ImageType::e2D; size.width = hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? hint.height : buffer.Height(); @@ -142,7 +142,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& de resources.levels = image.NumLevels(); resources.layers = image.NumLayers(); num_samples = image.NumSamples(); - num_bits = NumBits(image.GetDataFmt()); + num_bits = NumBitsPerBlock(image.GetDataFmt()); guest_address = image.Address(); @@ -152,6 +152,80 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& de UpdateSize(); } +bool ImageInfo::IsBlockCoded() const { + switch (pixel_format) { + case vk::Format::eBc1RgbaSrgbBlock: + case vk::Format::eBc1RgbaUnormBlock: + case vk::Format::eBc1RgbSrgbBlock: + case vk::Format::eBc1RgbUnormBlock: + case vk::Format::eBc2SrgbBlock: + case vk::Format::eBc2UnormBlock: + case vk::Format::eBc3SrgbBlock: + case vk::Format::eBc3UnormBlock: + case vk::Format::eBc4SnormBlock: + case vk::Format::eBc4UnormBlock: + case vk::Format::eBc5SnormBlock: + case vk::Format::eBc5UnormBlock: + case vk::Format::eBc6HSfloatBlock: + case vk::Format::eBc6HUfloatBlock: + case vk::Format::eBc7SrgbBlock: + case vk::Format::eBc7UnormBlock: + return true; + default: + return false; + } +} + +bool ImageInfo::IsPacked() const { + switch (pixel_format) { + case vk::Format::eB5G5R5A1UnormPack16: + [[fallthrough]]; + case vk::Format::eB5G6R5UnormPack16: + return true; + default: + return false; + } +} + +bool ImageInfo::IsDepthStencil() const { + switch (pixel_format) { + case vk::Format::eD16Unorm: + case vk::Format::eD16UnormS8Uint: + case vk::Format::eD32Sfloat: + case vk::Format::eD32SfloatS8Uint: + return true; + default: + return false; + } +} + +bool ImageInfo::HasStencil() const { + if (pixel_format == vk::Format::eD32SfloatS8Uint || + pixel_format == vk::Format::eD24UnormS8Uint || + pixel_format == vk::Format::eD16UnormS8Uint) { + return true; + } + return false; +} + +bool ImageInfo::IsCompatible(const ImageInfo& info) const { + return (pixel_format == info.pixel_format && num_samples == info.num_samples && + num_bits == info.num_bits); +} + +bool ImageInfo::IsTilingCompatible(u32 lhs, u32 rhs) const { + if (lhs == rhs) { + return true; + } + if (lhs == 0x0e && rhs == 0x0d) { + return true; + } + if (lhs == 0x0d && rhs == 0x0e) { + return true; + } + return false; +} + void ImageInfo::UpdateSize() { mips_layout.clear(); MipInfo mip_info{}; @@ -163,7 +237,6 @@ void ImageInfo::UpdateSize() { if (props.is_block) { mip_w = (mip_w + 3) / 4; mip_h = (mip_h + 3) / 4; - bpp *= 16; } mip_w = std::max(mip_w, 1u); mip_h = std::max(mip_h, 1u); diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index ca4d9f5e9..47718a095 100644 --- 
a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -25,6 +25,11 @@ struct ImageInfo { bool IsTiled() const { return tiling_mode != AmdGpu::TilingMode::Display_Linear; } + Extent3D BlockDim() const { + const u32 shift = props.is_block ? 2 : 0; + return Extent3D{size.width >> shift, size.height >> shift, size.depth}; + } + bool IsBlockCoded() const; bool IsPacked() const; bool IsDepthStencil() const; @@ -33,24 +38,8 @@ struct ImageInfo { s32 MipOf(const ImageInfo& info) const; s32 SliceOf(const ImageInfo& info, s32 mip) const; - /// Verifies if images are compatible for subresource merging. - bool IsCompatible(const ImageInfo& info) const { - return (pixel_format == info.pixel_format && num_samples == info.num_samples && - num_bits == info.num_bits); - } - - bool IsTilingCompatible(u32 lhs, u32 rhs) const { - if (lhs == rhs) { - return true; - } - if (lhs == 0x0e && rhs == 0x0d) { - return true; - } - if (lhs == 0x0d && rhs == 0x0e) { - return true; - } - return false; - } + bool IsCompatible(const ImageInfo& info) const; + bool IsTilingCompatible(u32 lhs, u32 rhs) const; void UpdateSize(); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 4b173c313..f070b9132 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -199,7 +199,8 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag scheduler.CurrentTick() - tex_cache_image.tick_accessed_last > NumFramesBeforeRemoval; if (image_info.guest_address == tex_cache_image.info.guest_address) { // Equal address - if (image_info.size != tex_cache_image.info.size) { + if (image_info.BlockDim() != tex_cache_image.info.BlockDim() || + image_info.num_bits != tex_cache_image.info.num_bits) { // Very likely this kind of overlap is caused by allocation from a pool. if (safe_to_delete) { FreeImage(cache_image_id); @@ -211,15 +212,19 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag return {depth_image_id, -1, -1}; } + if (image_info.IsBlockCoded() && !tex_cache_image.info.IsBlockCoded()) { + // Compressed view of uncompressed image with same block size. + // We need to recreate the image with compressed format and copy. + return {ExpandImage(image_info, cache_image_id), -1, -1}; + } + if (image_info.pixel_format != tex_cache_image.info.pixel_format || image_info.guest_size <= tex_cache_image.info.guest_size) { auto result_id = merged_image_id ? merged_image_id : cache_image_id; const auto& result_image = slot_images[result_id]; - return { - IsVulkanFormatCompatible(image_info.pixel_format, result_image.info.pixel_format) - ? result_id - : ImageId{}, - -1, -1}; + const bool is_compatible = + IsVulkanFormatCompatible(result_image.info.pixel_format, image_info.pixel_format); + return {is_compatible ? 
result_id : ImageId{}, -1, -1}; } if (image_info.type == tex_cache_image.info.type && @@ -340,7 +345,7 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { continue; } if (False(flags & FindFlags::RelaxFmt) && - (!IsVulkanFormatCompatible(info.pixel_format, cache_image.info.pixel_format) || + (!IsVulkanFormatCompatible(cache_image.info.pixel_format, info.pixel_format) || (cache_image.info.type != info.type && info.size != Extent3D{1, 1, 1}))) { continue; } @@ -512,9 +517,9 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule // So this calculation should be very uncommon and reasonably fast // For now we'll just check up to 64 first pixels const auto addr = std::bit_cast(image.info.guest_address); - const auto w = std::min(image.info.size.width, u32(8)); - const auto h = std::min(image.info.size.height, u32(8)); - const auto size = w * h * image.info.num_bits / 8; + const u32 w = std::min(image.info.size.width, u32(8)); + const u32 h = std::min(image.info.size.height, u32(8)); + const u32 size = w * h * image.info.num_bits >> (3 + image.info.props.is_block ? 4 : 0); const u64 hash = XXH3_64bits(addr, size); if (image.hash == hash) { image.flags &= ~ImageFlagBits::MaybeCpuDirty; diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index d7fc54338..683ac08db 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -25,10 +25,9 @@ namespace VideoCore { const DetilerContext* TileManager::GetDetiler(const ImageInfo& info) const { - const auto bpp = info.num_bits * (info.props.is_block ? 16 : 1); switch (info.tiling_mode) { case AmdGpu::TilingMode::Texture_MicroTiled: - switch (bpp) { + switch (info.num_bits) { case 8: return &detilers[DetilerType::Micro8]; case 16: @@ -43,7 +42,7 @@ const DetilerContext* TileManager::GetDetiler(const ImageInfo& info) const { return nullptr; } case AmdGpu::TilingMode::Texture_Volume: - switch (bpp) { + switch (info.num_bits) { case 8: return &detilers[DetilerType::Macro8]; case 32: @@ -55,7 +54,7 @@ const DetilerContext* TileManager::GetDetiler(const ImageInfo& info) const { } break; case AmdGpu::TilingMode::Display_MicroTiled: - switch (bpp) { + switch (info.num_bits) { case 64: return &detilers[DetilerType::Display_Micro64]; default: @@ -71,7 +70,7 @@ struct DetilerParams { u32 num_levels; u32 pitch0; u32 height; - u32 sizes[14]; + std::array sizes; }; TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler) @@ -276,7 +275,7 @@ std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o params.sizes[0] = tiles_per_row; params.sizes[1] = tiles_per_slice; } else { - ASSERT(info.resources.levels <= 14); + ASSERT(info.resources.levels <= params.sizes.size()); std::memset(¶ms.sizes, 0, sizeof(params.sizes)); for (int m = 0; m < info.resources.levels; ++m) { params.sizes[m] = info.mips_layout[m].size + (m > 0 ? params.sizes[m - 1] : 0); @@ -287,8 +286,7 @@ std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o ¶ms); ASSERT((image_size % 64) == 0); - const auto bpp = info.num_bits * (info.props.is_block ? 
16u : 1u); - const auto num_tiles = image_size / (64 * (bpp / 8)); + const auto num_tiles = image_size / (64 * (info.num_bits / 8)); cmdbuf.dispatch(num_tiles, 1, 1); return {out_buffer.first, 0}; } From 14b082f5ea1ac520a6b1c5ce5c930e5c7ada5bb3 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 9 Jun 2025 01:28:00 +0300 Subject: [PATCH 21/28] buffer_cache: Inline data to cpu unless gpu modified (#3061) --- src/video_core/buffer_cache/buffer_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 8a5283d83..f53c111e9 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -293,7 +293,7 @@ void BufferCache::BindIndexBuffer(u32 index_offset) { void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); - if (!is_gds && !IsRegionRegistered(address, num_bytes)) { + if (!is_gds && !IsRegionGpuModified(address, num_bytes)) { memcpy(std::bit_cast(address), value, num_bytes); return; } From ae2053c487e67e16a1f33bf5af9280714bc435fd Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Sun, 8 Jun 2025 15:41:58 -0700 Subject: [PATCH 22/28] fix: Disable eBlockTexelViewCompatible on MoltenVK --- src/video_core/texture_cache/image.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 8522f1307..d8070da61 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -105,7 +105,8 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, if (info.props.is_volume) { flags |= vk::ImageCreateFlagBits::e2DArrayCompatible; } - if (info.props.is_block) { + // Not supported by MoltenVK. 
+ if (info.props.is_block && instance->GetDriverID() != vk::DriverId::eMoltenvk) { flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible; } From c20d02dd4094dbab2e990fcd27987e8de1bb4522 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 9 Jun 2025 03:31:51 +0300 Subject: [PATCH 23/28] shader_recompiler: Better handling of geometry shader scenario G (#3064) --- src/shader_recompiler/frontend/copy_shader.cpp | 3 +++ src/shader_recompiler/frontend/copy_shader.h | 5 +++-- .../frontend/translate/translate.h | 1 + .../ir/passes/readlane_elimination_pass.cpp | 1 + .../ir/passes/ring_access_elimination.cpp | 15 ++++++++++++++- src/shader_recompiler/runtime_info.h | 1 + src/video_core/amdgpu/liverpool.h | 10 +++++++++- .../renderer_vulkan/vk_pipeline_cache.cpp | 1 + 8 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/shader_recompiler/frontend/copy_shader.cpp b/src/shader_recompiler/frontend/copy_shader.cpp index 8750e2b18..4b5869e1d 100644 --- a/src/shader_recompiler/frontend/copy_shader.cpp +++ b/src/shader_recompiler/frontend/copy_shader.cpp @@ -67,6 +67,9 @@ CopyShaderData ParseCopyShader(std::span code) { if (last_attr != IR::Attribute::Position0) { data.num_attrs = static_cast(last_attr) - static_cast(IR::Attribute::Param0) + 1; + const auto it = data.attr_map.begin(); + const u32 comp_stride = std::next(it)->first - it->first; + data.output_vertices = comp_stride / 64; } return data; diff --git a/src/shader_recompiler/frontend/copy_shader.h b/src/shader_recompiler/frontend/copy_shader.h index 55cc31ebd..24c7060ed 100644 --- a/src/shader_recompiler/frontend/copy_shader.h +++ b/src/shader_recompiler/frontend/copy_shader.h @@ -3,8 +3,8 @@ #pragma once +#include #include -#include #include "common/types.h" #include "shader_recompiler/ir/attribute.h" @@ -12,8 +12,9 @@ namespace Shader { struct CopyShaderData { - std::unordered_map> attr_map; + std::map> attr_map; u32 num_attrs{0}; + u32 output_vertices{0}; }; CopyShaderData ParseCopyShader(std::span code); diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index f8ffb9638..96ca924a3 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include "shader_recompiler/frontend/instruction.h" #include "shader_recompiler/info.h" #include "shader_recompiler/ir/basic_block.h" diff --git a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp index fbe382d41..9c5f64f84 100644 --- a/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp +++ b/src/shader_recompiler/ir/passes/readlane_elimination_pass.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include #include "shader_recompiler/ir/program.h" namespace Shader::Optimization { diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index 071b94ac0..02745bf9a 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -91,6 +91,19 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim const auto& gs_info = runtime_info.gs_info; info.gs_copy_data = Shader::ParseCopyShader(gs_info.vs_copy); + u32 output_vertices = gs_info.output_vertices; + if 
(info.gs_copy_data.output_vertices && + info.gs_copy_data.output_vertices != output_vertices) { + ASSERT_MSG(output_vertices > info.gs_copy_data.output_vertices && + gs_info.mode == AmdGpu::Liverpool::GsMode::Mode::ScenarioG, + "Invalid geometry shader vertex configuration scenario = {}, max_vert_out = " + "{}, output_vertices = {}", + u32(gs_info.mode), output_vertices, info.gs_copy_data.output_vertices); + LOG_WARNING(Render_Vulkan, "MAX_VERT_OUT {} is larger than actual output vertices {}", + output_vertices, info.gs_copy_data.output_vertices); + output_vertices = info.gs_copy_data.output_vertices; + } + ForEachInstruction([&](IR::IREmitter& ir, IR::Inst& inst) { const auto opcode = inst.GetOpcode(); switch (opcode) { @@ -122,7 +135,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim const auto offset = inst.Flags().inst_offset.Value(); const auto data = ir.BitCast(IR::U32{inst.Arg(2)}); - const auto comp_ofs = gs_info.output_vertices * 4u; + const auto comp_ofs = output_vertices * 4u; const auto output_size = comp_ofs * gs_info.out_vertex_data_size; const auto vc_read_ofs = (((offset / comp_ofs) * comp_ofs) % output_size) * 16u; diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 53d2d5303..5a0408e2c 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -149,6 +149,7 @@ struct GeometryRuntimeInfo { u32 out_vertex_data_size{}; AmdGpu::PrimitiveType in_primitive; GsOutputPrimTypes out_primitive; + AmdGpu::Liverpool::GsMode::Mode mode; std::span vs_copy; u64 vs_copy_hash; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 2f33c7302..d88a44375 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -1179,8 +1179,16 @@ struct Liverpool { }; union GsMode { + enum class Mode : u32 { + Off = 0, + ScenarioA = 1, + ScenarioB = 2, + ScenarioG = 3, + ScenarioC = 4, + }; + u32 raw; - BitField<0, 3, u32> mode; + BitField<0, 3, Mode> mode; BitField<3, 2, u32> cut_mode; BitField<22, 2, u32> onchip; }; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index b72f77e55..cd8552515 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -146,6 +146,7 @@ const Shader::RuntimeInfo& PipelineCache::BuildRuntimeInfo(Stage stage, LogicalS } gs_info.in_vertex_data_size = regs.vgt_esgs_ring_itemsize; gs_info.out_vertex_data_size = regs.vgt_gs_vert_itemsize[0]; + gs_info.mode = regs.vgt_gs_mode.mode; const auto params_vc = Liverpool::GetParams(regs.vs_program); gs_info.vs_copy = params_vc.code; gs_info.vs_copy_hash = params_vc.hash; From 12f4a8f073804454132c1da38b79636782134705 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 9 Jun 2025 09:35:43 +0300 Subject: [PATCH 24/28] tile_manager: Downgrade assert to error (#3067) --- src/video_core/texture_cache/tile_manager.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index 683ac08db..dd6fae457 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -269,7 +269,10 @@ std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o params.height = info.size.height; if (info.tiling_mode == AmdGpu::TilingMode::Texture_Volume || info.tiling_mode == 
AmdGpu::TilingMode::Display_MicroTiled) { - ASSERT(info.resources.levels == 1); + if (info.resources.levels != 1) { + LOG_ERROR(Render_Vulkan, "Unexpected mipmaps for volume and display tilings {}", + info.resources.levels); + } const auto tiles_per_row = info.pitch / 8u; const auto tiles_per_slice = tiles_per_row * ((info.size.height + 7u) / 8u); params.sizes[0] = tiles_per_row; From d7051f15f4c97ed093be64c5d743e7af37bf4557 Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Mon, 9 Jun 2025 11:26:10 +0300 Subject: [PATCH 25/28] texture_cache: Basic handling of partially resident images (#3066) * texture_cache: Avoid gpu tracking assert on sparse image At the moment just take the easy way of creating the entire image normally and uploading unmapped subresources are zero * tile_manager: Downgrade assert to error * fix macos --- src/core/libraries/kernel/memory.cpp | 3 ++ src/core/memory.cpp | 40 ++++++++++++++++++++ src/core/memory.h | 16 ++++++++ src/video_core/buffer_cache/buffer_cache.cpp | 7 +++- src/video_core/buffer_cache/buffer_cache.h | 5 +++ 5 files changed, 69 insertions(+), 2 deletions(-) diff --git a/src/core/libraries/kernel/memory.cpp b/src/core/libraries/kernel/memory.cpp index 5e94199e1..f02ddafdc 100644 --- a/src/core/libraries/kernel/memory.cpp +++ b/src/core/libraries/kernel/memory.cpp @@ -660,6 +660,9 @@ int PS4_SYSV_ABI sceKernelSetPrtAperture(int id, VAddr address, size_t size) { "PRT aperture id = {}, address = {:#x}, size = {:#x} is set but not used", id, address, size); + auto* memory = Core::Memory::Instance(); + memory->SetPrtArea(id, address, size); + PrtApertures[id] = {address, size}; return ORBIS_OK; } diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 54cae910b..e738f85a1 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -95,6 +95,46 @@ u64 MemoryManager::ClampRangeSize(VAddr virtual_addr, u64 size) { return clamped_size; } +void MemoryManager::SetPrtArea(u32 id, VAddr address, u64 size) { + PrtArea& area = prt_areas[id]; + if (area.mapped) { + rasterizer->UnmapMemory(area.start, area.end - area.start); + } + + area.start = address; + area.end = address + size; + area.mapped = true; + + // Pretend the entire PRT area is mapped to avoid GPU tracking errors. + // The caches will use CopySparseMemory to fetch data which avoids unmapped areas. 
+ rasterizer->MapMemory(address, size); +} + +void MemoryManager::CopySparseMemory(VAddr virtual_addr, u8* dest, u64 size) { + const bool is_sparse = std::ranges::any_of( + prt_areas, [&](const PrtArea& area) { return area.Overlaps(virtual_addr, size); }); + if (!is_sparse) { + std::memcpy(dest, std::bit_cast(virtual_addr), size); + return; + } + + auto vma = FindVMA(virtual_addr); + ASSERT_MSG(vma->second.Contains(virtual_addr, 0), + "Attempted to access invalid GPU address {:#x}", virtual_addr); + while (size) { + u64 copy_size = std::min(vma->second.size - (virtual_addr - vma->first), size); + if (vma->second.IsFree()) { + std::memset(dest, 0, copy_size); + } else { + std::memcpy(dest, std::bit_cast(virtual_addr), copy_size); + } + size -= copy_size; + virtual_addr += copy_size; + dest += copy_size; + ++vma; + } +} + bool MemoryManager::TryWriteBacking(void* address, const void* data, u32 num_bytes) { const VAddr virtual_addr = std::bit_cast(address); const auto& vma = FindVMA(virtual_addr)->second; diff --git a/src/core/memory.h b/src/core/memory.h index b3ebe3c27..68f9c26c4 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -172,6 +172,10 @@ public: u64 ClampRangeSize(VAddr virtual_addr, u64 size); + void SetPrtArea(u32 id, VAddr address, u64 size); + + void CopySparseMemory(VAddr source, u8* dest, u64 size); + bool TryWriteBacking(void* address, const void* data, u32 num_bytes); void SetupMemoryRegions(u64 flexible_size, bool use_extended_mem1, bool use_extended_mem2); @@ -275,6 +279,18 @@ private: size_t pool_budget{}; Vulkan::Rasterizer* rasterizer{}; + struct PrtArea { + VAddr start; + VAddr end; + bool mapped; + + bool Overlaps(VAddr test_address, u64 test_size) const { + const VAddr overlap_end = test_address + test_size; + return start < overlap_end && test_address < end; + } + }; + std::array prt_areas{}; + friend class ::Core::Devtools::Widget::MemoryMapViewer; }; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index f53c111e9..e470f8e77 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -6,6 +6,7 @@ #include "common/debug.h" #include "common/scope_exit.h" #include "common/types.h" +#include "core/memory.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/host_shaders/fault_buffer_process_comp.h" @@ -28,7 +29,7 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, rasterizer{rasterizer_}, liverpool{liverpool_}, - texture_cache{texture_cache_}, tracker{tracker_}, + memory{Core::Memory::Instance()}, texture_cache{texture_cache_}, tracker{tracker_}, staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize}, stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize}, download_buffer(instance, scheduler, MemoryUsage::Download, DownloadBufferSize), @@ -365,7 +366,9 @@ std::pair BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, return ObtainBuffer(gpu_addr, size, false, false); } // In all other cases, just do a CPU copy to the staging buffer. 
- const u32 offset = staging_buffer.Copy(gpu_addr, size, 16); + const auto [data, offset] = staging_buffer.Map(size, 16); + memory->CopySparseMemory(gpu_addr, data, size); + staging_buffer.Commit(); return {&staging_buffer, offset}; } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 2d6551a7f..c2faf12c8 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -17,6 +17,10 @@ namespace AmdGpu { struct Liverpool; } +namespace Core { +class MemoryManager; +} + namespace Shader { namespace Gcn { struct FetchShaderData; @@ -183,6 +187,7 @@ private: Vulkan::Scheduler& scheduler; Vulkan::Rasterizer& rasterizer; AmdGpu::Liverpool* liverpool; + Core::MemoryManager* memory; TextureCache& texture_cache; PageManager& tracker; StreamBuffer staging_buffer; From 046cf5041235cbd413ca9d0ada28ffcdd0651cb3 Mon Sep 17 00:00:00 2001 From: mailwl Date: Mon, 9 Jun 2025 11:27:37 +0300 Subject: [PATCH 26/28] PM4 type 2 in acb (#3047) * Stub PM4 type 0 * fix command size * revert command size to actual * return unreachable for PM4t0 * remove skipping command body --- src/video_core/amdgpu/liverpool.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index e031d0ebc..464f02e3a 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -228,9 +228,12 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spantype; switch (type) { + default: + UNREACHABLE_MSG("Wrong PM4 type {}", type); + break; case 0: - case 1: - UNREACHABLE_MSG("Unsupported PM4 type {}", type); + UNREACHABLE_MSG("Unimplemented PM4 type 0, base reg: {}, size: {}", + header->type0.base.Value(), header->type0.NumWords()); break; case 2: // Type-2 packet are used for padding purposes @@ -826,6 +829,19 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq break; } + if (header->type == 2) { + // Type-2 packet are used for padding purposes + next_dw_off = 1; + acb += next_dw_off; + acb_dwords -= next_dw_off; + + if constexpr (!is_indirect) { + *queue.read_addr += next_dw_off; + *queue.read_addr %= queue.ring_size_dw; + } + continue; + } + if (header->type != 3) { // No other types of packets were spotted so far UNREACHABLE_MSG("Invalid PM4 type {}", header->type.Value()); From a71bfb30a2002eafd8d9bf1a2742875fc1ab4eb0 Mon Sep 17 00:00:00 2001 From: Lander Gallastegi Date: Mon, 9 Jun 2025 12:04:21 +0200 Subject: [PATCH 27/28] shader_recompiler: Patch SRT walker on segfault (#2991) * Patch srt walker access violations * Fix range * clang-format lolz * Lower log from warning to debug --- src/core/signals.h | 1 + .../passes/flatten_extended_userdata_pass.cpp | 63 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/src/core/signals.h b/src/core/signals.h index 6ee525e10..0409b73ae 100644 --- a/src/core/signals.h +++ b/src/core/signals.h @@ -5,6 +5,7 @@ #include #include "common/singleton.h" +#include "common/types.h" namespace Core { diff --git a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp index bbf3fe8fb..7253e18c1 100644 --- a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp +++ b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp @@ -10,6 +10,8 @@ #include "common/io_file.h" #include "common/logging/log.h" #include 
"common/path_util.h" +#include "common/signal_context.h" +#include "core/signals.h" #include "shader_recompiler/info.h" #include "shader_recompiler/ir/breadth_first_search.h" #include "shader_recompiler/ir/opcodes.h" @@ -24,6 +26,7 @@ using namespace Xbyak::util; static Xbyak::CodeGenerator g_srt_codegen(32_MB); +static const u8* g_srt_codegen_start = nullptr; namespace { @@ -54,6 +57,57 @@ static void DumpSrtProgram(const Shader::Info& info, const u8* code, size_t code #endif } +static bool SrtWalkerSignalHandler(void* context, void* fault_address) { + // Only handle if the fault address is within the SRT code range + const u8* code_start = g_srt_codegen_start; + const u8* code_end = code_start + g_srt_codegen.getSize(); + const void* code = Common::GetRip(context); + if (code < code_start || code >= code_end) { + return false; // Not in SRT code range + } + + // Patch instruction to zero register + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + ZyanStatus status = Common::Decoder::Instance()->decodeInstruction(instruction, operands, + const_cast(code), 15); + + ASSERT(ZYAN_SUCCESS(status) && instruction.mnemonic == ZYDIS_MNEMONIC_MOV && + operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER && + operands[1].type == ZYDIS_OPERAND_TYPE_MEMORY); + + size_t len = instruction.length; + const size_t patch_size = 3; + u8* code_patch = const_cast(reinterpret_cast(code)); + + // We can only encounter rdi or r10d as the first operand in a + // fault memory access for SRT walker. + switch (operands[0].reg.value) { + case ZYDIS_REGISTER_RDI: + // mov rdi, [rdi + (off_dw << 2)] -> xor rdi, rdi + code_patch[0] = 0x48; + code_patch[1] = 0x31; + code_patch[2] = 0xFF; + break; + case ZYDIS_REGISTER_R10D: + // mov r10d, [rdi + (off_dw << 2)] -> xor r10d, r10d + code_patch[0] = 0x45; + code_patch[1] = 0x31; + code_patch[2] = 0xD2; + break; + default: + UNREACHABLE_MSG("Unsupported register for SRT walker patch"); + return false; + } + + // Fill nops + memset(code_patch + patch_size, 0x90, len - patch_size); + + LOG_DEBUG(Render_Recompiler, "Patched SRT walker at {}", code); + + return true; +} + using namespace Shader; struct PassInfo { @@ -141,6 +195,15 @@ static void GenerateSrtProgram(Info& info, PassInfo& pass_info) { return; } + // Register the signal handler for SRT walker, if not already registered + if (g_srt_codegen_start == nullptr) { + g_srt_codegen_start = c.getCurr(); + auto* signals = Core::Signals::Instance(); + // Call after the memory invalidation handler + constexpr u32 priority = 1; + signals->RegisterAccessViolationHandler(SrtWalkerSignalHandler, priority); + } + info.srt_info.walker_func = c.getCurr(); pass_info.dst_off_dw = NumUserDataRegs; From 217d32b5024be79d77de1d541d3c757210e33990 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?= Date: Mon, 9 Jun 2025 21:03:38 +0200 Subject: [PATCH 28/28] Handle DS_READ_U16, DS_WRITE_B16, DS_ADD_U64 (#3007) * Handle DS_READ_U16 & DS_WRITE_B16 * Refactor DS translation * Translate DS_ADD_U64 * format * Fix RingAccessElimination after changing WriteShared64 type * Simplify bounds checking in generated SPIR-V --- .../backend/spirv/emit_spirv.cpp | 5 ++ .../backend/spirv/emit_spirv_atomic.cpp | 85 ++++++++++++------- .../backend/spirv/emit_spirv_bounds.h | 48 +++++++++++ .../backend/spirv/emit_spirv_instructions.h | 4 + .../spirv/emit_spirv_shared_memory.cpp | 83 +++++++++++++----- .../backend/spirv/spirv_emit_context.cpp | 31 +++++-- .../backend/spirv/spirv_emit_context.h | 12 +-- 
.../frontend/translate/data_share.cpp | 56 +++++++++--- .../frontend/translate/translate.h | 1 + src/shader_recompiler/ir/ir_emitter.cpp | 11 ++- src/shader_recompiler/ir/ir_emitter.h | 2 +- src/shader_recompiler/ir/opcodes.inc | 8 +- .../ir/passes/ring_access_elimination.cpp | 6 +- .../ir/passes/shader_info_collection_pass.cpp | 2 + .../passes/shared_memory_to_storage_pass.cpp | 15 +++- src/shader_recompiler/profile.h | 2 +- .../renderer_vulkan/vk_instance.cpp | 29 ++++++- src/video_core/renderer_vulkan/vk_instance.h | 9 ++ .../renderer_vulkan/vk_pipeline_cache.cpp | 3 +- 19 files changed, 323 insertions(+), 89 deletions(-) create mode 100644 src/shader_recompiler/backend/spirv/emit_spirv_bounds.h diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index f2e6279f4..37d7eea35 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -303,6 +303,11 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct ctx.AddCapability(spv::Capability::PhysicalStorageBufferAddresses); ctx.AddExtension("SPV_KHR_physical_storage_buffer"); } + if (info.uses_shared && profile.supports_workgroup_explicit_memory_layout) { + ctx.AddExtension("SPV_KHR_workgroup_memory_explicit_layout"); + ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayoutKHR); + ctx.AddCapability(spv::Capability::WorkgroupMemoryExplicitLayout16BitAccessKHR); + } } void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp index a342b47b6..13fd8e180 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_atomic.cpp @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/div_ceil.h" +#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" @@ -15,42 +17,40 @@ std::pair AtomicArgs(EmitContext& ctx) { Id SharedAtomicU32(EmitContext& ctx, Id offset, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; const auto [scope, semantics]{AtomicArgs(ctx)}; - return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value); + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics, value); + }); +} + +Id SharedAtomicU64(EmitContext& ctx, Id offset, Id value, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { + const Id shift_id{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, 
ctx.u32_zero_value, index)}; + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics, value); + }); } Id SharedAtomicU32_IncDec(EmitContext& ctx, Id offset, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id)) { const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index)}; const auto [scope, semantics]{AtomicArgs(ctx)}; - return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); -} - -Id BufferAtomicU32BoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { - if (Sirit::ValidId(buffer_size)) { - // Bounds checking enabled, wrap in a conditional branch to make sure that - // the atomic is not mistakenly executed when the index is out of bounds. - const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size); - const Id ib_label = ctx.OpLabel(); - const Id oob_label = ctx.OpLabel(); - const Id end_label = ctx.OpLabel(); - ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); - ctx.OpBranchConditional(in_bounds, ib_label, oob_label); - ctx.AddLabel(ib_label); - const Id ib_result = emit_func(); - ctx.OpBranch(end_label); - ctx.AddLabel(oob_label); - const Id oob_result = ctx.u32_zero_value; - ctx.OpBranch(end_label); - ctx.AddLabel(end_label); - return ctx.OpPhi(ctx.U32[1], ib_result, ib_label, oob_result, oob_label); - } - // Bounds checking not enabled, just perform the atomic operation. 
- return emit_func(); + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + return (ctx.*atomic_func)(ctx.U32[1], pointer, scope, semantics); + }); } Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, @@ -63,7 +63,7 @@ Id BufferAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); const auto [scope, semantics]{AtomicArgs(ctx)}; - return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] { + return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, value); }); } @@ -79,11 +79,26 @@ Id BufferAtomicU32CmpSwap(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre const auto [id, pointer_type] = buffer[EmitContext::PointerType::U32]; const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); const auto [scope, semantics]{AtomicArgs(ctx)}; - return BufferAtomicU32BoundsCheck(ctx, index, buffer.size_dwords, [&] { + return AccessBoundsCheck<32>(ctx, index, buffer.size_dwords, [&] { return (ctx.*atomic_func)(ctx.U32[1], ptr, scope, semantics, semantics, value, cmp_value); }); } +Id BufferAtomicU64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value, + Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { + const auto& buffer = ctx.buffers[handle]; + if (Sirit::ValidId(buffer.offset)) { + address = ctx.OpIAdd(ctx.U32[1], address, buffer.offset); + } + const Id index = ctx.OpShiftRightLogical(ctx.U32[1], address, ctx.ConstU32(3u)); + const auto [id, pointer_type] = buffer[EmitContext::PointerType::U64]; + const Id ptr = ctx.OpAccessChain(pointer_type, id, ctx.u32_zero_value, index); + const auto [scope, semantics]{AtomicArgs(ctx)}; + return AccessBoundsCheck<64>(ctx, index, buffer.size_qwords, [&] { + return (ctx.*atomic_func)(ctx.U64, ptr, scope, semantics, value); + }); +} + Id ImageAtomicU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id value, Id (Sirit::Module::*atomic_func)(Id, Id, Id, Id, Id)) { const auto& texture = ctx.images[handle & 0xFFFF]; @@ -105,6 +120,10 @@ Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicIAdd); } +Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value) { + return SharedAtomicU64(ctx, offset, value, &Sirit::Module::OpAtomicIAdd); +} + Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value) { return SharedAtomicU32(ctx, offset, value, &Sirit::Module::OpAtomicUMax); } @@ -149,6 +168,10 @@ Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); } +Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { + return BufferAtomicU64(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicIAdd); +} + Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { return BufferAtomicU32(ctx, inst, handle, address, value, &Sirit::Module::OpAtomicSMin); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h new file mode 100644 index 000000000..41e70c8c3 --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_bounds.h @@ -0,0 +1,48 @@ +// 
SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { + +template +auto AccessBoundsCheck(EmitContext& ctx, Id index, Id buffer_size, auto emit_func) { + Id zero_value{}; + Id result_type{}; + if constexpr (bit_size == 64) { + zero_value = ctx.u64_zero_value; + result_type = ctx.U64; + } else if constexpr (bit_size == 32) { + zero_value = ctx.u32_zero_value; + result_type = ctx.U32[1]; + } else if constexpr (bit_size == 16) { + zero_value = ctx.u16_zero_value; + result_type = ctx.U16; + } else { + static_assert(false, "type not supported"); + } + if (Sirit::ValidId(buffer_size)) { + // Bounds checking enabled, wrap in a conditional branch to make sure that + // the atomic is not mistakenly executed when the index is out of bounds. + const Id in_bounds = ctx.OpULessThan(ctx.U1[1], index, buffer_size); + const Id ib_label = ctx.OpLabel(); + const Id end_label = ctx.OpLabel(); + ctx.OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); + ctx.OpBranchConditional(in_bounds, ib_label, end_label); + const auto last_label = ctx.last_label; + ctx.AddLabel(ib_label); + const auto ib_result = emit_func(); + ctx.OpBranch(end_label); + ctx.AddLabel(end_label); + if (Sirit::ValidId(ib_result)) { + return ctx.OpPhi(result_type, ib_result, ib_label, zero_value, last_label); + } else { + return Id{0}; + } + } + // Bounds checking not enabled, just perform the atomic operation. + return emit_func(); +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index b9707224c..3441c5a23 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -86,6 +86,7 @@ void EmitStoreBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addre void EmitStoreBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicIAdd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); +Id EmitBufferAtomicIAdd64(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicUMin32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSMax32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); @@ -120,11 +121,14 @@ Id EmitUndefU8(EmitContext& ctx); Id EmitUndefU16(EmitContext& ctx); Id EmitUndefU32(EmitContext& ctx); Id EmitUndefU64(EmitContext& ctx); +Id EmitLoadSharedU16(EmitContext& ctx, Id offset); Id EmitLoadSharedU32(EmitContext& ctx, Id offset); Id EmitLoadSharedU64(EmitContext& ctx, Id offset); +void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value); void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value); +Id EmitSharedAtomicIAdd64(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicUMax32(EmitContext& ctx, Id offset, Id value); Id EmitSharedAtomicSMax32(EmitContext& ctx, Id offset, Id value); Id 
EmitSharedAtomicUMin32(EmitContext& ctx, Id offset, Id value); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp index 8b1610d61..c59406499 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_shared_memory.cpp @@ -1,43 +1,86 @@ // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/div_ceil.h" +#include "shader_recompiler/backend/spirv/emit_spirv_bounds.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" namespace Shader::Backend::SPIRV { +Id EmitLoadSharedU16(EmitContext& ctx, Id offset) { + const Id shift_id{ctx.ConstU32(1U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)}; + + return AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index); + return ctx.OpLoad(ctx.U16, pointer); + }); +} + Id EmitLoadSharedU32(EmitContext& ctx, Id offset) { const Id shift_id{ctx.ConstU32(2U)}; - const Id index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, index); - return ctx.OpLoad(ctx.U32[1], pointer); + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + + return AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index); + return ctx.OpLoad(ctx.U32[1], pointer); + }); } Id EmitLoadSharedU64(EmitContext& ctx, Id offset) { - const Id shift_id{ctx.ConstU32(2U)}; - const Id base_index{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift_id)}; - const Id next_index{ctx.OpIAdd(ctx.U32[1], base_index, ctx.ConstU32(1U))}; - const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, base_index)}; - const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_index)}; - return ctx.OpCompositeConstruct(ctx.U32[2], ctx.OpLoad(ctx.U32[1], lhs_pointer), - ctx.OpLoad(ctx.U32[1], rhs_pointer)); + const Id shift_id{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift_id)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + + return AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)}; + return ctx.OpLoad(ctx.U64, pointer); + }); +} + +void EmitWriteSharedU16(EmitContext& ctx, Id offset, Id value) { + const Id shift{ctx.ConstU32(1U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 2u)}; + + AccessBoundsCheck<16>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u16, ctx.shared_memory_u16, ctx.u32_zero_value, index); + ctx.OpStore(pointer, value); + return Id{0}; + }); } void EmitWriteSharedU32(EmitContext& ctx, Id offset, Id value) { const Id 
shift{ctx.ConstU32(2U)}; - const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; - const Id pointer = ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset); - ctx.OpStore(pointer, value); + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 4u)}; + + AccessBoundsCheck<32>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer = + ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, ctx.u32_zero_value, index); + ctx.OpStore(pointer, value); + return Id{0}; + }); } void EmitWriteSharedU64(EmitContext& ctx, Id offset, Id value) { - const Id shift{ctx.ConstU32(2U)}; - const Id word_offset{ctx.OpShiftRightArithmetic(ctx.U32[1], offset, shift)}; - const Id next_offset{ctx.OpIAdd(ctx.U32[1], word_offset, ctx.ConstU32(1U))}; - const Id lhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, word_offset)}; - const Id rhs_pointer{ctx.OpAccessChain(ctx.shared_u32, ctx.shared_memory_u32, next_offset)}; - ctx.OpStore(lhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 0U)); - ctx.OpStore(rhs_pointer, ctx.OpCompositeExtract(ctx.U32[1], value, 1U)); + const Id shift{ctx.ConstU32(3U)}; + const Id index{ctx.OpShiftRightLogical(ctx.U32[1], offset, shift)}; + const u32 num_elements{Common::DivCeil(ctx.runtime_info.cs_info.shared_memory_size, 8u)}; + + AccessBoundsCheck<64>(ctx, index, ctx.ConstU32(num_elements), [&] { + const Id pointer{ + ctx.OpAccessChain(ctx.shared_u64, ctx.shared_memory_u64, ctx.u32_zero_value, index)}; + ctx.OpStore(pointer, value); + return Id{0}; + }); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 9e51f8e60..672856397 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -146,6 +146,7 @@ void EmitContext::DefineArithmeticTypes() { false_value = ConstantFalse(U1[1]); u8_one_value = Constant(U8, 1U); u8_zero_value = Constant(U8, 0U); + u16_zero_value = Constant(U16, 0U); u32_one_value = ConstU32(1U); u32_zero_value = ConstU32(0U); f32_zero_value = ConstF32(0.0f); @@ -285,6 +286,8 @@ void EmitContext::DefineBufferProperties() { Name(buffer.size_shorts, fmt::format("buf{}_short_size", binding)); buffer.size_dwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(2U)); Name(buffer.size_dwords, fmt::format("buf{}_dword_size", binding)); + buffer.size_qwords = OpShiftRightLogical(U32[1], buffer.size, ConstU32(3U)); + Name(buffer.size_qwords, fmt::format("buf{}_qword_size", binding)); } } } @@ -979,13 +982,27 @@ void EmitContext::DefineSharedMemory() { } ASSERT(info.stage == Stage::Compute); const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size; - const u32 num_elements{Common::DivCeil(shared_memory_size, 4U)}; - const Id type{TypeArray(U32[1], ConstU32(num_elements))}; - shared_memory_u32_type = TypePointer(spv::StorageClass::Workgroup, type); - shared_u32 = TypePointer(spv::StorageClass::Workgroup, U32[1]); - shared_memory_u32 = AddGlobalVariable(shared_memory_u32_type, spv::StorageClass::Workgroup); - Name(shared_memory_u32, "shared_mem"); - interfaces.push_back(shared_memory_u32); + + const auto make_type = [&](Id element_type, u32 element_size) { + const u32 num_elements{Common::DivCeil(shared_memory_size, element_size)}; + const Id array_type{TypeArray(element_type, 
ConstU32(num_elements))}; + Decorate(array_type, spv::Decoration::ArrayStride, element_size); + + const Id struct_type{TypeStruct(array_type)}; + MemberDecorate(struct_type, 0u, spv::Decoration::Offset, 0u); + Decorate(struct_type, spv::Decoration::Block); + + const Id pointer = TypePointer(spv::StorageClass::Workgroup, struct_type); + const Id element_pointer = TypePointer(spv::StorageClass::Workgroup, element_type); + const Id variable = AddGlobalVariable(pointer, spv::StorageClass::Workgroup); + Decorate(variable, spv::Decoration::Aliased); + interfaces.push_back(variable); + + return std::make_tuple(variable, element_pointer, pointer); + }; + std::tie(shared_memory_u16, shared_u16, shared_memory_u16_type) = make_type(U16, 2u); + std::tie(shared_memory_u32, shared_u32, shared_memory_u32_type) = make_type(U32[1], 4u); + std::tie(shared_memory_u64, shared_u64, shared_memory_u64_type) = make_type(U64, 8u); } Id EmitContext::DefineFloat32ToUfloatM5(u32 mantissa_bits, const std::string_view name) { diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 20d936cf0..93c4ed265 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -235,17 +235,16 @@ public: Id false_value{}; Id u8_one_value{}; Id u8_zero_value{}; + Id u16_zero_value{}; Id u32_one_value{}; Id u32_zero_value{}; Id f32_zero_value{}; Id u64_one_value{}; Id u64_zero_value{}; - Id shared_u8{}; Id shared_u16{}; Id shared_u32{}; - Id shared_u32x2{}; - Id shared_u32x4{}; + Id shared_u64{}; Id input_u32{}; Id input_f32{}; @@ -285,13 +284,13 @@ public: Id image_u32{}; Id image_f32{}; - Id shared_memory_u8{}; Id shared_memory_u16{}; Id shared_memory_u32{}; - Id shared_memory_u32x2{}; - Id shared_memory_u32x4{}; + Id shared_memory_u64{}; + Id shared_memory_u16_type{}; Id shared_memory_u32_type{}; + Id shared_memory_u64_type{}; Id bary_coord_persp_id{}; Id bary_coord_linear_id{}; @@ -320,6 +319,7 @@ public: Id size; Id size_shorts; Id size_dwords; + Id size_qwords; std::array aliases; const BufferSpv& operator[](PointerType alias) const { diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index c29497ada..4b6a58fd0 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -13,6 +13,8 @@ void Translator::EmitDataShare(const GcnInst& inst) { // DS case Opcode::DS_ADD_U32: return DS_ADD_U32(inst, false); + case Opcode::DS_ADD_U64: + return DS_ADD_U64(inst, false); case Opcode::DS_SUB_U32: return DS_SUB_U32(inst, false); case Opcode::DS_INC_U32: @@ -61,10 +63,14 @@ void Translator::EmitDataShare(const GcnInst& inst) { return DS_READ(32, false, true, false, inst); case Opcode::DS_READ2ST64_B32: return DS_READ(32, false, true, true, inst); + case Opcode::DS_READ_U16: + return DS_READ(16, false, false, false, inst); case Opcode::DS_CONSUME: return DS_CONSUME(inst); case Opcode::DS_APPEND: return DS_APPEND(inst); + case Opcode::DS_WRITE_B16: + return DS_WRITE(16, false, false, false, inst); case Opcode::DS_WRITE_B64: return DS_WRITE(64, false, false, false, inst); case Opcode::DS_WRITE2_B64: @@ -123,6 +129,18 @@ void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) { } } +void Translator::DS_ADD_U64(const GcnInst& inst, bool rtn) { + const IR::U32 addr{GetSrc(inst.src[0])}; + const IR::U64 data{GetSrc64(inst.src[1])}; + const IR::U32 
offset = + ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)); + const IR::U32 addr_offset = ir.IAdd(addr, offset); + const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data); + if (rtn) { + SetDst64(inst.dst[0], IR::U64{original_val}); + } +} + void Translator::DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn) { const IR::U32 addr{GetSrc(inst.src[0])}; const IR::U32 data{GetSrc(inst.src[1])}; @@ -201,23 +219,28 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data0), addr0); } else { - ir.WriteShared( - 64, ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)), - addr0); + ir.WriteShared(64, + ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data0), + ir.GetVectorReg(data0 + 1))), + addr0); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); if (bit_size == 32) { ir.WriteShared(32, ir.GetVectorReg(data1), addr1); } else { - ir.WriteShared( - 64, ir.CompositeConstruct(ir.GetVectorReg(data1), ir.GetVectorReg(data1 + 1)), - addr1); + ir.WriteShared(64, + ir.PackUint2x32(ir.CompositeConstruct(ir.GetVectorReg(data1), + ir.GetVectorReg(data1 + 1))), + addr1); } } else if (bit_size == 64) { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); const IR::Value data = ir.CompositeConstruct(ir.GetVectorReg(data0), ir.GetVectorReg(data0 + 1)); - ir.WriteShared(bit_size, data, addr0); + ir.WriteShared(bit_size, ir.PackUint2x32(data), addr0); + } else if (bit_size == 16) { + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); + ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0); @@ -289,22 +312,29 @@ void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data0}); } else { - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data0, 1)}); + const auto vector = ir.UnpackUint2x32(IR::U64{data0}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); } const IR::U32 addr1 = ir.IAdd(addr, ir.Imm32(u32(inst.control.ds.offset1 * adj))); const IR::Value data1 = ir.LoadShared(bit_size, is_signed, addr1); if (bit_size == 32) { ir.SetVectorReg(dst_reg++, IR::U32{data1}); } else { - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 0)}); - ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(data1, 1)}); + const auto vector = ir.UnpackUint2x32(IR::U64{data1}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg++, IR::U32{ir.CompositeExtract(vector, 1)}); } } else if (bit_size == 64) { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); const IR::Value data = ir.LoadShared(bit_size, is_signed, addr0); - ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(data, 0)}); - ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(data, 1)}); + const auto vector = ir.UnpackUint2x32(IR::U64{data}); + ir.SetVectorReg(dst_reg, IR::U32{ir.CompositeExtract(vector, 0)}); + ir.SetVectorReg(dst_reg + 1, IR::U32{ir.CompositeExtract(vector, 1)}); + } else if (bit_size == 16) { + const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); + const IR::U16 data = IR::U16{ir.LoadShared(bit_size, is_signed, addr0)}; 
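+        // The 16-bit value read from LDS is zero-extended to 32 bits before being written to the destination VGPR.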
+ ir.SetVectorReg(dst_reg, ir.UConvert(32, data)); } else { const IR::U32 addr0 = ir.IAdd(addr, ir.Imm32(offset)); const IR::U32 data = IR::U32{ir.LoadShared(bit_size, is_signed, addr0)}; diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 96ca924a3..086b325aa 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -271,6 +271,7 @@ public: // Data share // DS void DS_ADD_U32(const GcnInst& inst, bool rtn); + void DS_ADD_U64(const GcnInst& inst, bool rtn); void DS_MIN_U32(const GcnInst& inst, bool is_signed, bool rtn); void DS_MAX_U32(const GcnInst& inst, bool is_signed, bool rtn); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool stride64, const GcnInst& inst); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index e6cc32829..2c37c8099 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -293,10 +293,12 @@ void IREmitter::SetPatch(Patch patch, const F32& value) { Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { switch (bit_size) { + case 16: + return Inst(Opcode::LoadSharedU16, offset); case 32: return Inst(Opcode::LoadSharedU32, offset); case 64: - return Inst(Opcode::LoadSharedU64, offset); + return Inst(Opcode::LoadSharedU64, offset); default: UNREACHABLE_MSG("Invalid bit size {}", bit_size); } @@ -304,6 +306,9 @@ Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) { switch (bit_size) { + case 16: + Inst(Opcode::WriteSharedU16, offset, value); + break; case 32: Inst(Opcode::WriteSharedU32, offset, value); break; @@ -315,10 +320,12 @@ void IREmitter::WriteShared(int bit_size, const Value& value, const U32& offset) } } -U32F32 IREmitter::SharedAtomicIAdd(const U32& address, const U32F32& data) { +U32U64 IREmitter::SharedAtomicIAdd(const U32& address, const U32U64& data) { switch (data.Type()) { case Type::U32: return Inst(Opcode::SharedAtomicIAdd32, address, data); + case Type::U64: + return Inst(Opcode::SharedAtomicIAdd64, address, data); default: ThrowInvalidType(data.Type()); } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 0e41f4b2d..eae44ed04 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -99,7 +99,7 @@ public: [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset); - [[nodiscard]] U32F32 SharedAtomicIAdd(const U32& address, const U32F32& data); + [[nodiscard]] U32U64 SharedAtomicIAdd(const U32& address, const U32U64& data); [[nodiscard]] U32 SharedAtomicIMin(const U32& address, const U32& data, bool is_signed); [[nodiscard]] U32 SharedAtomicIMax(const U32& address, const U32& data, bool is_signed); [[nodiscard]] U32 SharedAtomicAnd(const U32& address, const U32& data); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 5b3216be6..e96e32297 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -30,13 +30,16 @@ OPCODE(EmitVertex, Void, OPCODE(EmitPrimitive, Void, ) // Shared memory operations +OPCODE(LoadSharedU16, U16, U32, ) OPCODE(LoadSharedU32, U32, U32, ) -OPCODE(LoadSharedU64, U32x2, U32, ) +OPCODE(LoadSharedU64, 
U64, U32, ) +OPCODE(WriteSharedU16, Void, U32, U16, ) OPCODE(WriteSharedU32, Void, U32, U32, ) -OPCODE(WriteSharedU64, Void, U32, U32x2, ) +OPCODE(WriteSharedU64, Void, U32, U64, ) // Shared atomic operations OPCODE(SharedAtomicIAdd32, U32, U32, U32, ) +OPCODE(SharedAtomicIAdd64, U64, U32, U64, ) OPCODE(SharedAtomicSMin32, U32, U32, U32, ) OPCODE(SharedAtomicUMin32, U32, U32, U32, ) OPCODE(SharedAtomicSMax32, U32, U32, U32, ) @@ -116,6 +119,7 @@ OPCODE(StoreBufferFormatF32, Void, Opaq // Buffer atomic operations OPCODE(BufferAtomicIAdd32, U32, Opaque, Opaque, U32 ) +OPCODE(BufferAtomicIAdd64, U64, Opaque, Opaque, U64 ) OPCODE(BufferAtomicSMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicUMin32, U32, Opaque, Opaque, U32 ) OPCODE(BufferAtomicSMax32, U32, Opaque, Opaque, U32 ) diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index 02745bf9a..b292b41b9 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -39,11 +39,13 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim ASSERT(addr->Arg(1).IsImmediate()); offset = addr->Arg(1).U32(); } - IR::Value data = inst.Arg(1).Resolve(); + IR::Value data = is_composite ? ir.UnpackUint2x32(IR::U64{inst.Arg(1).Resolve()}) + : inst.Arg(1).Resolve(); for (s32 i = 0; i < num_components; i++) { const auto attrib = IR::Attribute::Param0 + (offset / 16); const auto comp = (offset / 4) % 4; - const IR::U32 value = IR::U32{is_composite ? data.Inst()->Arg(i) : data}; + const IR::U32 value = + IR::U32{is_composite ? ir.CompositeExtract(data, i) : data}; ir.SetAttribute(attrib, ir.BitCast(value), comp); offset += 4; } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index d4759b32e..ba8d1cca6 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -34,8 +34,10 @@ void Visit(Info& info, const IR::Inst& inst) { info.uses_patches |= 1U << IR::GenericPatchIndex(patch); break; } + case IR::Opcode::LoadSharedU16: case IR::Opcode::LoadSharedU32: case IR::Opcode::LoadSharedU64: + case IR::Opcode::WriteSharedU16: case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: info.uses_shared = true; diff --git a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp index 25aaf257c..409c05940 100644 --- a/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp +++ b/src/shader_recompiler/ir/passes/shared_memory_to_storage_pass.cpp @@ -16,6 +16,7 @@ static bool IsSharedAccess(const IR::Inst& inst) { case IR::Opcode::WriteSharedU64: case IR::Opcode::SharedAtomicAnd32: case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: case IR::Opcode::SharedAtomicOr32: case IR::Opcode::SharedAtomicSMax32: case IR::Opcode::SharedAtomicUMax32: @@ -33,9 +34,11 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ if (program.info.stage != Stage::Compute) { return; } - // Only perform the transform if the host shared memory is insufficient. 
+ // Only perform the transform if the host shared memory is insufficient + // or the device does not support VK_KHR_workgroup_memory_explicit_layout const u32 shared_memory_size = runtime_info.cs_info.shared_memory_size; - if (shared_memory_size <= profile.max_shared_memory_size) { + if (shared_memory_size <= profile.max_shared_memory_size && + profile.supports_workgroup_explicit_memory_layout) { return; } // Add buffer binding for shared memory storage buffer. @@ -60,6 +63,7 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.BufferAtomicAnd(handle, inst.Arg(0), inst.Arg(1), {})); continue; case IR::Opcode::SharedAtomicIAdd32: + case IR::Opcode::SharedAtomicIAdd64: inst.ReplaceUsesWithAndRemove( ir.BufferAtomicIAdd(handle, inst.Arg(0), inst.Arg(1), {})); continue; @@ -93,12 +97,19 @@ void SharedMemoryToStoragePass(IR::Program& program, const RuntimeInfo& runtime_ ir.Imm32(shared_memory_size)); const IR::U32 address = ir.IAdd(IR::U32{inst.Arg(0)}, offset); switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU16: + inst.ReplaceUsesWithAndRemove(ir.LoadBufferU16(handle, address, {})); + break; case IR::Opcode::LoadSharedU32: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(1, handle, address, {})); break; case IR::Opcode::LoadSharedU64: inst.ReplaceUsesWithAndRemove(ir.LoadBufferU32(2, handle, address, {})); break; + case IR::Opcode::WriteSharedU16: + ir.StoreBufferU16(handle, address, IR::U32{inst.Arg(1)}, {}); + inst.Invalidate(); + break; case IR::Opcode::WriteSharedU32: ir.StoreBufferU32(1, handle, address, inst.Arg(1), {}); inst.Invalidate(); diff --git a/src/shader_recompiler/profile.h b/src/shader_recompiler/profile.h index 853e4854d..7d313180f 100644 --- a/src/shader_recompiler/profile.h +++ b/src/shader_recompiler/profile.h @@ -23,13 +23,13 @@ struct Profile { bool support_fp32_denorm_preserve{}; bool support_fp32_denorm_flush{}; bool support_fp32_round_to_zero{}; - bool support_explicit_workgroup_layout{}; bool support_legacy_vertex_attributes{}; bool supports_image_load_store_lod{}; bool supports_native_cube_calc{}; bool supports_trinary_minmax{}; bool supports_robust_buffer_access{}; bool supports_image_fp32_atomic_min_max{}; + bool supports_workgroup_explicit_memory_layout{}; bool has_broken_spirv_clamp{}; bool lower_left_origin_mode{}; bool needs_manual_interpolation{}; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 9584329f0..0591e06ce 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -212,7 +212,8 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT, vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT, vk::PhysicalDevicePortabilitySubsetFeaturesKHR, - vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT>(); + vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT, + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR>(); features = feature_chain.get().features; const vk::StructureChain properties_chain = physical_device.getProperties2< @@ -283,6 +284,20 @@ bool Instance::CreateDevice() { LOG_INFO(Render_Vulkan, "- shaderImageFloat32AtomicMinMax: {}", shader_atomic_float2_features.shaderImageFloat32AtomicMinMax); } + workgroup_memory_explicit_layout = + add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); + if (workgroup_memory_explicit_layout) { + workgroup_memory_explicit_layout_features = + feature_chain.get(); + LOG_INFO(Render_Vulkan, "- 
workgroupMemoryExplicitLayout: {}", + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout); + LOG_INFO(Render_Vulkan, "- workgroupMemoryExplicitLayoutScalarBlockLayout: {}", + workgroup_memory_explicit_layout_features + .workgroupMemoryExplicitLayoutScalarBlockLayout); + LOG_INFO( + Render_Vulkan, "- workgroupMemoryExplicitLayout16BitAccess: {}", + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess); + } const bool calibrated_timestamps = TRACY_GPU_ENABLED ? add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME) : false; @@ -420,6 +435,15 @@ bool Instance::CreateDevice() { .shaderImageFloat32AtomicMinMax = shader_atomic_float2_features.shaderImageFloat32AtomicMinMax, }, + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR{ + .workgroupMemoryExplicitLayout = + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout, + .workgroupMemoryExplicitLayoutScalarBlockLayout = + workgroup_memory_explicit_layout_features + .workgroupMemoryExplicitLayoutScalarBlockLayout, + .workgroupMemoryExplicitLayout16BitAccess = + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess, + }, #ifdef __APPLE__ portability_features, #endif @@ -452,6 +476,9 @@ bool Instance::CreateDevice() { if (!shader_atomic_float2) { device_chain.unlink(); } + if (!workgroup_memory_explicit_layout) { + device_chain.unlink(); + } auto [device_result, dev] = physical_device.createDeviceUnique(device_chain.get()); if (device_result != vk::Result::eSuccess) { diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 30848e8b7..c687e6f67 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -171,6 +171,12 @@ public: return shader_atomic_float2 && shader_atomic_float2_features.shaderImageFloat32AtomicMinMax; } + /// Returns true when VK_KHR_workgroup_memory_explicit_layout is supported. 
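+    /// Note: this also requires the workgroupMemoryExplicitLayout16BitAccess feature, since shared memory is additionally aliased with a 16-bit element view.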
+ bool IsWorkgroupMemoryExplicitLayoutSupported() const { + return workgroup_memory_explicit_layout && + workgroup_memory_explicit_layout_features.workgroupMemoryExplicitLayout16BitAccess; + } + /// Returns true when geometry shaders are supported by the device bool IsGeometryStageSupported() const { return features.geometryShader; @@ -349,6 +355,8 @@ private: vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT dynamic_state_3_features; vk::PhysicalDeviceRobustness2FeaturesEXT robustness2_features; vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT shader_atomic_float2_features; + vk::PhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR + workgroup_memory_explicit_layout_features; vk::DriverIdKHR driver_id; vk::UniqueDebugUtilsMessengerEXT debug_callback{}; std::string vendor_name; @@ -374,6 +382,7 @@ private: bool amd_gcn_shader{}; bool amd_shader_trinary_minmax{}; bool shader_atomic_float2{}; + bool workgroup_memory_explicit_layout{}; bool portability_subset{}; }; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index cd8552515..2c3f4ba2f 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -210,7 +210,6 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, .support_fp32_denorm_preserve = bool(vk12_props.shaderDenormPreserveFloat32), .support_fp32_denorm_flush = bool(vk12_props.shaderDenormFlushToZeroFloat32), .support_fp32_round_to_zero = bool(vk12_props.shaderRoundingModeRTZFloat32), - .support_explicit_workgroup_layout = true, .support_legacy_vertex_attributes = instance_.IsLegacyVertexAttributesSupported(), .supports_image_load_store_lod = instance_.IsImageLoadStoreLodSupported(), .supports_native_cube_calc = instance_.IsAmdGcnShaderSupported(), @@ -218,6 +217,8 @@ PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_, // TODO: Emitted bounds checks cause problems with phi control flow; needs to be fixed. .supports_robust_buffer_access = true, // instance_.IsRobustBufferAccess2Supported(), .supports_image_fp32_atomic_min_max = instance_.IsShaderAtomicFloatImage32MinMaxSupported(), + .supports_workgroup_explicit_memory_layout = + instance_.IsWorkgroupMemoryExplicitLayoutSupported(), .needs_manual_interpolation = instance.IsFragmentShaderBarycentricSupported() && instance.GetDriverID() == vk::DriverId::eNvidiaProprietary, .needs_lds_barriers = instance.GetDriverID() == vk::DriverId::eNvidiaProprietary ||