From d9108cd39a66f9b6275e303558f0f0f92a807c7b Mon Sep 17 00:00:00 2001 From: TheTurtle Date: Fri, 8 Aug 2025 15:27:11 +0300 Subject: [PATCH] video_core: Rework tile manager (#3374) * video_core: Rework detiling * video_core: Support tiling and macrotile detiling * clang format * image_info: Cleanups * resource: Revert some changes * texture_cache: Fix small error * image_info: Set depth flag on depth promote * buffer_cache: Remove level check * tile_manager: Handle case of staging buffer causing flush * image_info: Add 2D thick array mode * image_info: Add slices to mip size * tile_manager: Set bank swizzle * buffer_cache: Support image copies from DmaData * vk_rasterizer: Accelerate trivial render target copies with compute Before the tiling PR, compute image copies were done with the following sequence: vkCmdCopyImageToBuffer (in SynchronizeBufferFromImage) -> vkCmdDispatch (copy) -> vkCmdCopyBufferToImage (in RefreshImage) The tiling PR added extra tiling/detiling steps: vkCmdCopyImageToBuffer -> vkCmdDispatch (tiling) -> vkCmdDispatch (copy) -> vkCmdDispatch (detiling) -> vkCmdCopyBufferToImage This is quite a bit of overhead for a simple image copy. This commit tries to detect trivial image copies, i.e. compute shaders that copy the full source image to the entire destination. In that case the whole sequence is replaced by a single vkCmdCopyImage. 
How much it triggers depends on the guest * texture_cache: Fix build * image: Copy all subresources with buffer too --- CMakeLists.txt | 2 + src/core/devtools/widget/reg_popup.cpp | 2 +- src/shader_recompiler/info.h | 4 +- src/shader_recompiler/specialization.h | 2 +- src/video_core/amdgpu/liverpool.h | 20 +- src/video_core/amdgpu/pixel_format.cpp | 71 ++- src/video_core/amdgpu/pixel_format.h | 9 +- src/video_core/amdgpu/resource.h | 103 ++-- src/video_core/amdgpu/tiling.cpp | 554 ++++++++++++++++++ src/video_core/amdgpu/tiling.h | 149 +++++ src/video_core/buffer_cache/buffer_cache.cpp | 236 ++++---- src/video_core/buffer_cache/buffer_cache.h | 13 +- src/video_core/host_shaders/CMakeLists.txt | 1 + src/video_core/host_shaders/tiling.comp | 444 ++++++++++++++ .../renderer_vulkan/vk_instance.cpp | 7 + src/video_core/renderer_vulkan/vk_instance.h | 6 + src/video_core/renderer_vulkan/vk_presenter.h | 1 + .../renderer_vulkan/vk_rasterizer.cpp | 94 ++- .../renderer_vulkan/vk_rasterizer.h | 1 + src/video_core/texture_cache/image.cpp | 123 ++-- src/video_core/texture_cache/image.h | 2 +- src/video_core/texture_cache/image_info.cpp | 178 ++---- src/video_core/texture_cache/image_info.h | 68 ++- src/video_core/texture_cache/image_view.cpp | 35 +- src/video_core/texture_cache/image_view.h | 7 +- .../texture_cache/texture_cache.cpp | 150 ++--- src/video_core/texture_cache/texture_cache.h | 21 +- src/video_core/texture_cache/tile.h | 34 +- src/video_core/texture_cache/tile_manager.cpp | 413 +++++++------ src/video_core/texture_cache/tile_manager.h | 46 +- 30 files changed, 1999 insertions(+), 797 deletions(-) create mode 100644 src/video_core/amdgpu/tiling.cpp create mode 100644 src/video_core/amdgpu/tiling.h create mode 100644 src/video_core/host_shaders/tiling.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index f09e3a1ed..94f5d4dce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -925,6 +925,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp 
src/video_core/amdgpu/pm4_cmds.h src/video_core/amdgpu/pm4_opcodes.h src/video_core/amdgpu/resource.h + src/video_core/amdgpu/tiling.cpp + src/video_core/amdgpu/tiling.h src/video_core/amdgpu/types.h src/video_core/amdgpu/default_context.cpp src/video_core/buffer_cache/buffer.cpp diff --git a/src/core/devtools/widget/reg_popup.cpp b/src/core/devtools/widget/reg_popup.cpp index 7bb38df24..90d8c9681 100644 --- a/src/core/devtools/widget/reg_popup.cpp +++ b/src/core/devtools/widget/reg_popup.cpp @@ -64,7 +64,7 @@ void RegPopup::DrawColorBuffer(const AmdGpu::Liverpool::ColorBuffer& buffer) { "NumSamples()", buffer.NumSamples(), "NumSlices()", buffer.NumSlices(), "GetColorSliceSize()", buffer.GetColorSliceSize(), - "GetTilingMode()", buffer.GetTilingMode(), + "GetTileMode()", buffer.GetTileMode(), "IsTiled()", buffer.IsTiled(), "NumFormat()", buffer.GetNumberFmt() ); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index d80f2956b..16c841581 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -326,12 +326,12 @@ constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept } if (!image.Valid()) { LOG_DEBUG(Render_Vulkan, "Encountered invalid image sharp"); - image = is_depth ? 
AmdGpu::Image::NullDepth() : AmdGpu::Image::Null(); + image = AmdGpu::Image::Null(is_depth); } else if (is_depth) { const auto data_fmt = image.GetDataFmt(); if (data_fmt != AmdGpu::DataFormat::Format16 && data_fmt != AmdGpu::DataFormat::Format32) { LOG_DEBUG(Render_Vulkan, "Encountered non-depth image used with depth instruction!"); - image = AmdGpu::Image::NullDepth(); + image = AmdGpu::Image::Null(true); } } return image; diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index d3e671c58..7901b8db6 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -6,8 +6,8 @@ #include #include "common/types.h" -#include "frontend/fetch_shader.h" #include "shader_recompiler/backend/bindings.h" +#include "shader_recompiler/frontend/fetch_shader.h" #include "shader_recompiler/info.h" namespace Shader { diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index d693a0a38..941a79c2d 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -22,7 +22,7 @@ #include "common/unique_function.h" #include "shader_recompiler/params.h" #include "video_core/amdgpu/pixel_format.h" -#include "video_core/amdgpu/resource.h" +#include "video_core/amdgpu/tiling.h" #include "video_core/amdgpu/types.h" namespace Vulkan { @@ -426,7 +426,7 @@ struct Liverpool { BitField<0, 2, ZFormat> format; BitField<2, 2, u32> num_samples; BitField<13, 3, u32> tile_split; - BitField<20, 3, u32> tile_mode_index; + BitField<20, 3, TileMode> tile_mode_index; BitField<23, 4, u32> decompress_on_n_zplanes; BitField<27, 1, u32> allow_expclear; BitField<28, 1, u32> read_size; @@ -502,6 +502,14 @@ struct Liverpool { const auto bpe = NumBits() >> 3; // in bytes return (depth_slice.tile_max + 1) * 64 * bpe * NumSamples(); } + + TileMode GetTileMode() const { + return z_info.tile_mode_index.Value(); + } + + bool IsTiled() const { + return GetTileMode() != 
TileMode::DisplayLinearAligned; + } }; enum class ClipSpace : u32 { @@ -888,7 +896,7 @@ struct Liverpool { u32 u32all; } info; union Color0Attrib { - BitField<0, 5, TilingMode> tile_mode_index; + BitField<0, 5, TileMode> tile_mode_index; BitField<5, 5, u32> fmask_tile_mode_index; BitField<10, 2, u32> fmask_bank_height; BitField<12, 3, u32> num_samples_log2; @@ -949,13 +957,13 @@ struct Liverpool { return slice_size; } - TilingMode GetTilingMode() const { - return info.linear_general ? TilingMode::Display_Linear + TileMode GetTileMode() const { + return info.linear_general ? TileMode::DisplayLinearAligned : attrib.tile_mode_index.Value(); } bool IsTiled() const { - return GetTilingMode() != TilingMode::Display_Linear; + return GetTileMode() != TileMode::DisplayLinearAligned; } [[nodiscard]] DataFormat GetDataFmt() const { diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp index 682cdf357..d88b05f41 100644 --- a/src/video_core/amdgpu/pixel_format.cpp +++ b/src/video_core/amdgpu/pixel_format.cpp @@ -178,7 +178,7 @@ static constexpr std::array BITS_PER_BLOCK = { 64, // 12 Format16_16_16_16 96, // 13 Format32_32_32 128, // 14 Format32_32_32_32 - 0, // 15 + -1, // 15 16, // 16 Format5_6_5 16, // 17 Format1_5_5_5 16, // 18 Format5_5_5_1 @@ -186,15 +186,15 @@ static constexpr std::array BITS_PER_BLOCK = { 32, // 20 Format8_24 32, // 21 Format24_8 64, // 22 FormatX24_8_32 - 0, // 23 - 0, // 24 - 0, // 25 - 0, // 26 - 0, // 27 - 0, // 28 - 0, // 29 - 0, // 30 - 0, // 31 + -1, // 23 + -1, // 24 + -1, // 25 + -1, // 26 + -1, // 27 + -1, // 28 + -1, // 29 + -1, // 30 + -1, // 31 16, // 32 FormatGB_GR 16, // 33 FormatBG_RG 32, // 34 Format5_9_9_9 @@ -213,4 +213,55 @@ u32 NumBitsPerBlock(DataFormat format) { return BITS_PER_BLOCK[index]; } +static constexpr std::array BITS_PER_ELEMENT = { + 0, // 0 FormatInvalid + 8, // 1 Format8 + 16, // 2 Format16 + 16, // 3 Format8_8 + 32, // 4 Format32 + 32, // 5 Format16_16 + 32, // 6 Format10_11_11 
+ 32, // 7 Format11_11_10 + 32, // 8 Format10_10_10_2 + 32, // 9 Format2_10_10_10 + 32, // 10 Format8_8_8_8 + 64, // 11 Format32_32 + 64, // 12 Format16_16_16_16 + 96, // 13 Format32_32_32 + 128, // 14 Format32_32_32_32 + -1, // 15 + 16, // 16 Format5_6_5 + 16, // 17 Format1_5_5_5 + 16, // 18 Format5_5_5_1 + 16, // 19 Format4_4_4_4 + 32, // 20 Format8_24 + 32, // 21 Format24_8 + 64, // 22 FormatX24_8_32 + -1, // 23 + -1, // 24 + -1, // 25 + -1, // 26 + -1, // 27 + -1, // 28 + -1, // 29 + -1, // 30 + -1, // 31 + 16, // 32 FormatGB_GR + 16, // 33 FormatBG_RG + 32, // 34 Format5_9_9_9 + 4, // 35 FormatBc1 + 8, // 36 FormatBc2 + 8, // 37 FormatBc3 + 4, // 38 FormatBc4 + 8, // 39 FormatBc5 + 8, // 40 FormatBc6 + 8, // 41 FormatBc7 +}; + +u32 NumBitsPerElement(DataFormat format) { + const u32 index = static_cast(format); + ASSERT_MSG(index < BITS_PER_ELEMENT.size(), "Invalid data format = {}", format); + return BITS_PER_ELEMENT[index]; +} + } // namespace AmdGpu diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 45c688e57..e7ad27dd3 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -85,7 +85,7 @@ enum class NumberClass { Uint, }; -enum class CompSwizzle : u8 { +enum class CompSwizzle : u32 { Zero = 0, One = 1, Red = 4, @@ -313,7 +313,11 @@ constexpr NumberClass GetNumberClass(const NumberFormat nfmt) { } constexpr bool IsInteger(const NumberFormat nfmt) { - return nfmt == AmdGpu::NumberFormat::Sint || nfmt == AmdGpu::NumberFormat::Uint; + return nfmt == NumberFormat::Sint || nfmt == NumberFormat::Uint; +} + +constexpr bool IsBlockCoded(DataFormat format) { + return format >= DataFormat::FormatBc1 && format <= DataFormat::FormatBc7; } std::string_view NameOf(DataFormat fmt); @@ -321,6 +325,7 @@ std::string_view NameOf(NumberFormat fmt); u32 NumComponents(DataFormat format); u32 NumBitsPerBlock(DataFormat format); +u32 NumBitsPerElement(DataFormat format); } // namespace AmdGpu diff 
--git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index ff9cfe2cc..742cdee86 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -7,6 +7,7 @@ #include "common/assert.h" #include "common/bit_field.h" #include "video_core/amdgpu/pixel_format.h" +#include "video_core/amdgpu/tiling.h" namespace AmdGpu { @@ -138,37 +139,6 @@ constexpr std::string_view NameOf(ImageType type) { } } -enum class TilingMode : u32 { - Depth_MacroTiled = 0u, - Display_Linear = 0x8u, - Display_MicroTiled = 0x9u, - Display_MacroTiled = 0xAu, - Texture_MicroTiled = 0xDu, - Texture_MacroTiled = 0xEu, - Texture_Volume = 0x13u, -}; - -constexpr std::string_view NameOf(TilingMode type) { - switch (type) { - case TilingMode::Depth_MacroTiled: - return "Depth_MacroTiled"; - case TilingMode::Display_Linear: - return "Display_Linear"; - case TilingMode::Display_MicroTiled: - return "Display_MicroTiled"; - case TilingMode::Display_MacroTiled: - return "Display_MacroTiled"; - case TilingMode::Texture_MicroTiled: - return "Texture_MicroTiled"; - case TilingMode::Texture_MacroTiled: - return "Texture_MacroTiled"; - case TilingMode::Texture_Volume: - return "Texture_Volume"; - default: - return "Unknown"; - } -} - struct Image { u64 base_address : 38; u64 mtype_l2 : 2; @@ -212,28 +182,15 @@ struct Image { u64 alt_tile_mode : 1; u64 : 39; - static constexpr Image Null() { + static constexpr Image Null(bool is_depth) { Image image{}; - image.data_format = u64(DataFormat::Format8_8_8_8); - image.num_format = u64(NumberFormat::Unorm); + image.data_format = u64(is_depth ? DataFormat::Format32 : DataFormat::Format8_8_8_8); + image.num_format = u64(is_depth ? 
NumberFormat::Float : NumberFormat::Unorm); image.dst_sel_x = u64(CompSwizzle::Red); image.dst_sel_y = u64(CompSwizzle::Green); image.dst_sel_z = u64(CompSwizzle::Blue); image.dst_sel_w = u64(CompSwizzle::Alpha); - image.tiling_index = u64(TilingMode::Texture_MicroTiled); - image.type = u64(ImageType::Color2D); - return image; - } - - static constexpr Image NullDepth() { - Image image{}; - image.data_format = u64(DataFormat::Format32); - image.num_format = u64(NumberFormat::Float); - image.dst_sel_x = u64(CompSwizzle::Red); - image.dst_sel_y = u64(CompSwizzle::Green); - image.dst_sel_z = u64(CompSwizzle::Blue); - image.dst_sel_w = u64(CompSwizzle::Alpha); - image.tiling_index = u64(TilingMode::Texture_MicroTiled); + image.tiling_index = u64(TileMode::Thin1DThin); image.type = u64(ImageType::Color2D); return image; } @@ -314,16 +271,26 @@ struct Image { return MapNumberConversion(NumberFormat(num_format), DataFormat(data_format)); } - TilingMode GetTilingMode() const { - if (tiling_index >= 0 && tiling_index <= 7) { - return tiling_index == 5 ? 
TilingMode::Texture_MicroTiled - : TilingMode::Depth_MacroTiled; - } - return static_cast(tiling_index); + TileMode GetTileMode() const { + return static_cast(tiling_index); } bool IsTiled() const { - return GetTilingMode() != TilingMode::Display_Linear; + return GetTileMode() != TileMode::DisplayLinearAligned && + GetTileMode() != TileMode::DisplayLinearGeneral; + } + + u8 GetBankSwizzle() const { + const auto tile_mode = GetTileMode(); + const auto array_mode = GetArrayMode(tile_mode); + const auto dfmt = GetDataFmt(); + if (!alt_tile_mode || dfmt == DataFormat::FormatInvalid || !IsMacroTiled(array_mode)) { + return 0; + } + const u32 bpp = NumBitsPerElement(dfmt); + const auto macro_tile_mode = CalculateMacrotileMode(tile_mode, bpp, NumSamples()); + const u32 banks = GetAltNumBanks(macro_tile_mode); + return (((banks - 1) << 4) & base_address) >> 4; } bool IsFmask() const noexcept { @@ -331,7 +298,21 @@ struct Image { GetDataFmt() <= DataFormat::FormatFmask64_8; } - [[nodiscard]] ImageType GetViewType(const bool is_array) const noexcept { + ImageType GetBaseType() const noexcept { + const auto base_type = GetType(); + if (base_type == ImageType::Color1DArray) { + return ImageType::Color1D; + } + if (base_type == ImageType::Color2DArray) { + return ImageType::Color2D; + } + if (base_type == ImageType::Color2DMsaa || base_type == ImageType::Color2DMsaaArray) { + return ImageType::Color2D; + } + return base_type; + } + + ImageType GetViewType(const bool is_array) const noexcept { const auto base_type = GetType(); if (IsCube()) { // Cube needs to remain array type regardless of instruction array specifier. 
@@ -422,13 +403,7 @@ enum class Filter : u64 { }; constexpr bool IsAnisoFilter(const Filter filter) { - switch (filter) { - case Filter::AnisoPoint: - case Filter::AnisoLinear: - return true; - default: - return false; - } + return filter == Filter::AnisoPoint || filter == Filter::AnisoLinear; } enum class MipFilter : u64 { @@ -495,7 +470,7 @@ struct Sampler { } float LodBias() const noexcept { - return static_cast(static_cast((lod_bias.Value() ^ 0x2000u) - 0x2000u)) / + return static_cast(static_cast((lod_bias.Value() ^ 0x2000u) - 0x2000u)) / 256.0f; } diff --git a/src/video_core/amdgpu/tiling.cpp b/src/video_core/amdgpu/tiling.cpp new file mode 100644 index 000000000..e16d695b1 --- /dev/null +++ b/src/video_core/amdgpu/tiling.cpp @@ -0,0 +1,554 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "video_core/amdgpu/tiling.h" + +#include + +namespace AmdGpu { + +static constexpr u32 MICROTILE_SIZE = 8; +static constexpr u32 DRAM_ROW_SIZE = 1024; + +std::string_view NameOf(TileMode tile_mode) { + return magic_enum::enum_name(tile_mode); +} + +ArrayMode GetArrayMode(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth1DThin: + case TileMode::Display1DThin: + case TileMode::Thin1DThin: + return ArrayMode::Array1DTiledThin1; + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Display2DThin: + case TileMode::Thin2DThin: + return ArrayMode::Array2DTiledThin1; + case TileMode::DisplayThinPrt: + case TileMode::ThinThinPrt: + return ArrayMode::ArrayPrtTiledThin1; + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::Display2DThinPrt: + case TileMode::Thin2DThinPrt: + return ArrayMode::ArrayPrt2DTiledThin1; + case TileMode::Thin3DThin: + case TileMode::Thin3DThinPrt: + return 
ArrayMode::Array3DTiledThin1; + case TileMode::Thick1DThick: + return ArrayMode::Array1DTiledThick; + case TileMode::Thick2DThick: + return ArrayMode::Array2DTiledThick; + case TileMode::Thick3DThick: + return ArrayMode::Array3DTiledThick; + case TileMode::ThickThickPrt: + return ArrayMode::ArrayPrtTiledThick; + case TileMode::Thick2DThickPrt: + return ArrayMode::ArrayPrt2DTiledThick; + case TileMode::Thick3DThickPrt: + return ArrayMode::ArrayPrt3DTiledThick; + case TileMode::Thick2DXThick: + return ArrayMode::Array2DTiledXThick; + case TileMode::Thick3DXThick: + return ArrayMode::Array3DTiledXThick; + case TileMode::DisplayLinearAligned: + return ArrayMode::ArrayLinearAligned; + case TileMode::DisplayLinearGeneral: + return ArrayMode::ArrayLinearGeneral; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +MicroTileMode GetMicroTileMode(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + return MicroTileMode::Depth; + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::DisplayLinearGeneral: + return MicroTileMode::Display; + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + return MicroTileMode::Thin; + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + return MicroTileMode::Thick; + default: + UNREACHABLE_MSG("Unknown 
tile mode = {}", u32(tile_mode)); + } +} + +PipeConfig GetPipeConfig(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::Display2DThinPrt: + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick2DThickPrt: + case TileMode::Thick2DXThick: + return PipeConfig::P8_32x32_16x16; + case TileMode::DisplayThinPrt: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thick3DThick: + case TileMode::ThickThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick3DXThick: + return PipeConfig::P8_32x32_8x16; + case TileMode::DisplayLinearGeneral: + return PipeConfig::P2; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +PipeConfig GetAltPipeConfig(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case 
TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + return PipeConfig::P16_32x32_8x16; + case TileMode::DisplayLinearGeneral: + return PipeConfig::P2; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +u32 GetSampleSplit(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Thin1DThin: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + case TileMode::DisplayLinearGeneral: + return 1; + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + return 2; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +u32 GetTileSplit(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth1DThin: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case 
TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + case TileMode::DisplayLinearGeneral: + return 64; + case TileMode::Depth2DThin128: + return 128; + case TileMode::Depth2DThin256: + case TileMode::Depth2DThinPrt256: + return 256; + case TileMode::Depth2DThin512: + return 512; + case TileMode::Depth2DThin1K: + case TileMode::Depth2DThinPrt1K: + return 1024; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +u32 GetBankWidth(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetBankHeight(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x2_16_Dup: + return 2; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 4; + case MacroTileMode::Mode_1x8_16: + return 8; + 
default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetNumBanks(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 2; + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_4_Dup: + return 4; + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_8_Dup: + return 8; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + return 16; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetMacrotileAspect(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + return 2; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 4; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetAltBankHeight(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case 
MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + return 2; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 4; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetAltNumBanks(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 2; + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + return 4; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x4_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + return 8; + case MacroTileMode::Mode_1x8_16: + return 16; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetAltMacrotileAspect(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 2; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +bool IsMacroTiled(ArrayMode array_mode) { + 
switch (array_mode) { + case ArrayMode::ArrayLinearGeneral: + case ArrayMode::ArrayLinearAligned: + case ArrayMode::Array1DTiledThin1: + case ArrayMode::Array1DTiledThick: + return false; + case ArrayMode::Array2DTiledThin1: + case ArrayMode::ArrayPrtTiledThin1: + case ArrayMode::ArrayPrt2DTiledThin1: + case ArrayMode::Array2DTiledThick: + case ArrayMode::Array2DTiledXThick: + case ArrayMode::ArrayPrtTiledThick: + case ArrayMode::ArrayPrt2DTiledThick: + case ArrayMode::ArrayPrt3DTiledThin1: + case ArrayMode::Array3DTiledThin1: + case ArrayMode::Array3DTiledThick: + case ArrayMode::Array3DTiledXThick: + case ArrayMode::ArrayPrt3DTiledThick: + return true; + default: + UNREACHABLE_MSG("Unknown array mode = {}", u32(array_mode)); + } +} + +bool IsPrt(ArrayMode array_mode) { + switch (array_mode) { + case ArrayMode::ArrayPrtTiledThin1: + case ArrayMode::ArrayPrtTiledThick: + case ArrayMode::ArrayPrt2DTiledThin1: + case ArrayMode::ArrayPrt2DTiledThick: + case ArrayMode::ArrayPrt3DTiledThin1: + case ArrayMode::ArrayPrt3DTiledThick: + return true; + case ArrayMode::ArrayLinearGeneral: + case ArrayMode::ArrayLinearAligned: + case ArrayMode::Array1DTiledThin1: + case ArrayMode::Array1DTiledThick: + case ArrayMode::Array2DTiledThin1: + case ArrayMode::Array2DTiledThick: + case ArrayMode::Array2DTiledXThick: + case ArrayMode::Array3DTiledThin1: + case ArrayMode::Array3DTiledThick: + case ArrayMode::Array3DTiledXThick: + return false; + default: + UNREACHABLE_MSG("Unknown array mode = {}", u32(array_mode)); + } +} + +u32 GetMicroTileThickness(ArrayMode array_mode) { + switch (array_mode) { + case ArrayMode::ArrayLinearGeneral: + case ArrayMode::ArrayLinearAligned: + case ArrayMode::Array1DTiledThin1: + case ArrayMode::Array2DTiledThin1: + case ArrayMode::ArrayPrtTiledThin1: + case ArrayMode::ArrayPrt2DTiledThin1: + case ArrayMode::ArrayPrt3DTiledThin1: + case ArrayMode::Array3DTiledThin1: + return 1; + case ArrayMode::Array1DTiledThick: + case ArrayMode::Array2DTiledThick: + 
case ArrayMode::Array3DTiledThick: + case ArrayMode::ArrayPrtTiledThick: + case ArrayMode::ArrayPrt2DTiledThick: + case ArrayMode::ArrayPrt3DTiledThick: + return 4; + case ArrayMode::Array2DTiledXThick: + case ArrayMode::Array3DTiledXThick: + return 8; + default: + UNREACHABLE_MSG("Unknown array mode = {}", u32(array_mode)); + } +} + +u32 GetPipeCount(PipeConfig pipe_cfg) { + switch (pipe_cfg) { + case PipeConfig::P2: + return 2; + case PipeConfig::P8_32x32_8x16: + case PipeConfig::P8_32x32_16x16: + return 8; + case PipeConfig::P16_32x32_8x16: + return 16; + default: + UNREACHABLE_MSG("Unknown pipe config = {}", u32(pipe_cfg)); + } +} + +MacroTileMode CalculateMacrotileMode(TileMode tile_mode, u32 bpp, u32 num_samples) { + ASSERT_MSG(std::has_single_bit(num_samples) && num_samples <= 16, "Invalid sample count {}", + num_samples); + ASSERT_MSG(bpp >= 1 && bpp <= 128, "Invalid bpp {}", bpp); + + const ArrayMode array_mode = GetArrayMode(tile_mode); + ASSERT_MSG(IsMacroTiled(array_mode), "Tile mode not macro tiled"); + + const MicroTileMode micro_tile_mode = GetMicroTileMode(tile_mode); + const u32 sample_split = GetSampleSplit(tile_mode); + const u32 tile_split_hw = GetTileSplit(tile_mode); + + const u32 tile_thickness = GetMicroTileThickness(array_mode); + const u32 tile_bytes_1x = bpp * MICROTILE_SIZE * MICROTILE_SIZE * tile_thickness / 8; + const u32 color_tile_split = std::max(256U, sample_split * tile_bytes_1x); + const u32 tile_split = + micro_tile_mode == MicroTileMode::Depth ? tile_split_hw : color_tile_split; + const u32 tilesplic = std::min(DRAM_ROW_SIZE, tile_split); + const u32 tile_bytes = std::min(tilesplic, num_samples * tile_bytes_1x); + const u32 mtm_idx = std::bit_width(tile_bytes / 64) - 1; + return IsPrt(array_mode) ? 
MacroTileMode(mtm_idx + 8) : MacroTileMode(mtm_idx); +} + +} // namespace AmdGpu diff --git a/src/video_core/amdgpu/tiling.h b/src/video_core/amdgpu/tiling.h new file mode 100644 index 000000000..3cf0d444d --- /dev/null +++ b/src/video_core/amdgpu/tiling.h @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/types.h" + +namespace AmdGpu { + +struct Image; + +static constexpr size_t NUM_TILE_MODES = 32; + +enum class PipeConfig : u32 { + P2 = 0, + P4_8x16 = 4, + P4_16x16 = 5, + P4_16x32 = 6, + P4_32x32 = 7, + P8_16x16_8x16 = 8, + P8_16x32_8x16 = 9, + P8_32x32_8x16 = 10, + P8_16x32_16x16 = 11, + P8_32x32_16x16 = 12, + P8_32x32_16x32 = 13, + P8_32x64_32x32 = 14, + P16_32x32_8x16 = 16, + P16_32x32_16x16 = 17, + P16 = 18, +}; + +enum class MicroTileMode : u32 { + Display = 0, + Thin = 1, + Depth = 2, + Rotated = 3, + Thick = 4, +}; + +enum class MacroTileMode : u32 { + Mode_1x4_16 = 0, + Mode_1x2_16 = 1, + Mode_1x1_16 = 2, + Mode_1x1_16_Dup = 3, + Mode_1x1_8 = 4, + Mode_1x1_4 = 5, + Mode_1x1_2 = 6, + Mode_1x1_2_Dup = 7, + Mode_1x8_16 = 8, + Mode_1x4_16_Dup = 9, + Mode_1x2_16_Dup = 10, + Mode_1x1_16_Dup2 = 11, + Mode_1x1_8_Dup = 12, + Mode_1x1_4_Dup = 13, + Mode_1x1_2_Dup2 = 14, + Mode_1x1_2_Dup3 = 15, +}; + +enum class ArrayMode : u32 { + ArrayLinearGeneral = 0, + ArrayLinearAligned = 1, + Array1DTiledThin1 = 2, + Array1DTiledThick = 3, + Array2DTiledThin1 = 4, + ArrayPrtTiledThin1 = 5, + ArrayPrt2DTiledThin1 = 6, + Array2DTiledThick = 7, + Array2DTiledXThick = 8, + ArrayPrtTiledThick = 9, + ArrayPrt2DTiledThick = 10, + ArrayPrt3DTiledThin1 = 11, + Array3DTiledThin1 = 12, + Array3DTiledThick = 13, + Array3DTiledXThick = 14, + ArrayPrt3DTiledThick = 15, +}; + +enum class TileMode : u32 { + Depth2DThin64 = 0, + Depth2DThin128 = 1, + Depth2DThin256 = 2, + Depth2DThin512 = 3, + Depth2DThin1K = 4, + Depth1DThin = 5, + Depth2DThinPrt256 = 6, + 
Depth2DThinPrt1K = 7, + DisplayLinearAligned = 8, + Display1DThin = 9, + Display2DThin = 10, + DisplayThinPrt = 11, + Display2DThinPrt = 12, + Thin1DThin = 13, + Thin2DThin = 14, + Thin3DThin = 15, + ThinThinPrt = 16, + Thin2DThinPrt = 17, + Thin3DThinPrt = 18, + Thick1DThick = 19, + Thick2DThick = 20, + Thick3DThick = 21, + ThickThickPrt = 22, + Thick2DThickPrt = 23, + Thick3DThickPrt = 24, + Thick2DXThick = 25, + Thick3DXThick = 26, + DisplayLinearGeneral = 31, +}; + +std::string_view NameOf(TileMode tile_mode); + +ArrayMode GetArrayMode(TileMode tile_mode); + +MicroTileMode GetMicroTileMode(TileMode tile_mode); + +PipeConfig GetPipeConfig(TileMode tile_mode); + +PipeConfig GetAltPipeConfig(TileMode tile_mode); + +u32 GetSampleSplit(TileMode tile_mode); + +u32 GetTileSplit(TileMode tile_mode); + +u32 GetBankWidth(MacroTileMode mode); + +u32 GetBankHeight(MacroTileMode mode); + +u32 GetNumBanks(MacroTileMode mode); + +u32 GetMacrotileAspect(MacroTileMode mode); + +u32 GetAltBankHeight(MacroTileMode mode); + +u32 GetAltNumBanks(MacroTileMode mode); + +u32 GetAltMacrotileAspect(MacroTileMode mode); + +bool IsMacroTiled(ArrayMode array_mode); + +bool IsPrt(ArrayMode array_mode); + +u32 GetMicroTileThickness(ArrayMode array_mode); + +u32 GetPipeCount(PipeConfig pipe_cfg); + +MacroTileMode CalculateMacrotileMode(TileMode tile_mode, u32 bpp, u32 num_samples); + +} // namespace AmdGpu diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 8cbeae87a..c1e203b30 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -396,7 +396,9 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, // Avoid using ObtainBuffer here as that might give us the stream buffer. 
const BufferId buffer_id = FindBuffer(src, num_bytes); auto& buffer = slot_buffers[buffer_id]; - SynchronizeBuffer(buffer, src, num_bytes, false, false); + if (SynchronizeBuffer(buffer, src, num_bytes, false, true)) { + texture_cache.InvalidateMemoryFromGPU(dst, num_bytes); + } return buffer; }(); auto& dst_buffer = [&] -> const Buffer& { @@ -854,7 +856,7 @@ void BufferCache::ChangeRegister(BufferId buffer_id) { } } -void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, +bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer) { boost::container::small_vector copies; size_t total_size_bytes = 0; @@ -867,47 +869,47 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, total_size_bytes += range_size; }, [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); }); - SCOPE_EXIT { - if (is_texel_buffer) { - SynchronizeBufferFromImage(buffer, device_addr, size); - } - }; - if (!src_buffer) { - return; + + if (src_buffer) { + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + const vk::BufferMemoryBarrier2 pre_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite | + vk::AccessFlagBits2::eTransferRead | + vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer.Handle(), + .offset = 0, + .size = buffer.SizeBytes(), + }; + const vk::BufferMemoryBarrier2 post_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite, + .buffer = buffer.Handle(), + .offset = 0, + .size = 
buffer.SizeBytes(), + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_barrier, + }); + cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_barrier, + }); + TouchBuffer(buffer); } - scheduler.EndRendering(); - const auto cmdbuf = scheduler.CommandBuffer(); - const vk::BufferMemoryBarrier2 pre_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .srcAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite | - vk::AccessFlagBits2::eTransferRead | vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, - .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, - .buffer = buffer.Handle(), - .offset = 0, - .size = buffer.SizeBytes(), - }; - const vk::BufferMemoryBarrier2 post_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite, - .buffer = buffer.Handle(), - .offset = 0, - .size = buffer.SizeBytes(), - }; - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &pre_barrier, - }); - cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &post_barrier, - }); - TouchBuffer(buffer); + if (is_texel_buffer) { + return SynchronizeBufferFromImage(buffer, device_addr, size); + } + return false; } vk::Buffer 
BufferCache::UploadCopies(Buffer& buffer, std::span copies, @@ -944,115 +946,81 @@ vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span c } bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) { - boost::container::small_vector image_ids; - texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) { - if (image.info.guest_address != device_addr) { - return; - } - // Only perform sync if image is: - // - GPU modified; otherwise there are no changes to synchronize. - // - Not CPU dirty; otherwise we could overwrite CPU changes with stale GPU changes. - // - Not GPU dirty; otherwise we could overwrite GPU changes with stale image data. - if (False(image.flags & ImageFlagBits::GpuModified) || - True(image.flags & ImageFlagBits::Dirty)) { - return; - } - image_ids.push_back(image_id); - }); - if (image_ids.empty()) { + const ImageId image_id = texture_cache.FindImageFromRange(device_addr, size); + if (!image_id) { return false; } - ImageId image_id{}; - if (image_ids.size() == 1) { - // Sometimes image size might not exactly match with requested buffer size - // If we only found 1 candidate image use it without too many questions. 
- image_id = image_ids[0]; - } else { - for (s32 i = 0; i < image_ids.size(); ++i) { - Image& image = texture_cache.GetImage(image_ids[i]); - if (image.info.guest_size == size) { - image_id = image_ids[i]; - break; - } - } - if (!image_id) { - LOG_WARNING(Render_Vulkan, - "Failed to find exact image match for copy addr={:#x}, size={:#x}", - device_addr, size); - return false; - } - } Image& image = texture_cache.GetImage(image_id); ASSERT_MSG(device_addr == image.info.guest_address, "Texel buffer aliases image subresources {:x} : {:x}", device_addr, image.info.guest_address); - boost::container::small_vector copies; - u32 offset = buffer.Offset(image.info.guest_address); - const u32 num_layers = image.info.resources.layers; - const u32 max_offset = offset + size; - for (u32 m = 0; m < image.info.resources.levels; m++) { - const u32 width = std::max(image.info.size.width >> m, 1u); - const u32 height = std::max(image.info.size.height >> m, 1u); - const u32 depth = - image.info.props.is_volume ? 
std::max(image.info.size.depth >> m, 1u) : 1u; - const auto [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; - offset += mip_ofs; - if (offset + mip_size > max_offset) { + const u32 buf_offset = buffer.Offset(image.info.guest_address); + boost::container::small_vector buffer_copies; + u32 copy_size = 0; + for (u32 mip = 0; mip < image.info.resources.levels; mip++) { + const auto& mip_info = image.info.mips_layout[mip]; + const u32 width = std::max(image.info.size.width >> mip, 1u); + const u32 height = std::max(image.info.size.height >> mip, 1u); + const u32 depth = std::max(image.info.size.depth >> mip, 1u); + if (buf_offset + mip_info.offset + mip_info.size > buffer.SizeBytes()) { break; } - copies.push_back({ - .bufferOffset = offset, - .bufferRowLength = mip_pitch, - .bufferImageHeight = mip_height, + buffer_copies.push_back(vk::BufferImageCopy{ + .bufferOffset = mip_info.offset, + .bufferRowLength = mip_info.pitch, + .bufferImageHeight = mip_info.height, .imageSubresource{ .aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil, - .mipLevel = m, + .mipLevel = mip, .baseArrayLayer = 0, - .layerCount = num_layers, + .layerCount = image.info.resources.layers, }, .imageOffset = {0, 0, 0}, .imageExtent = {width, height, depth}, }); + copy_size += mip_info.size; } - if (!copies.empty()) { - scheduler.EndRendering(); - const vk::BufferMemoryBarrier2 pre_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, - .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, - .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, - .buffer = buffer.Handle(), - .offset = max_offset - size, - .size = size, - }; - const vk::BufferMemoryBarrier2 post_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .dstAccessMask = 
vk::AccessFlagBits2::eMemoryRead, - .buffer = buffer.Handle(), - .offset = max_offset - size, - .size = size, - }; - auto barriers = image.GetBarriers(vk::ImageLayout::eTransferSrcOptimal, - vk::AccessFlagBits2::eTransferRead, - vk::PipelineStageFlagBits2::eTransfer, {}); - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &pre_barrier, - .imageMemoryBarrierCount = static_cast(barriers.size()), - .pImageMemoryBarriers = barriers.data(), - }); - cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.Handle(), - copies); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &post_barrier, - }); + if (copy_size == 0) { + return false; } + scheduler.EndRendering(); + const vk::BufferMemoryBarrier2 pre_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer.Handle(), + .offset = buf_offset, + .size = copy_size, + }; + const vk::BufferMemoryBarrier2 post_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, + .buffer = buffer.Handle(), + .offset = buf_offset, + .size = copy_size, + }; + auto barriers = + image.GetBarriers(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, + vk::PipelineStageFlagBits2::eTransfer, {}); + auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + 
.bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_barrier, + .imageMemoryBarrierCount = static_cast(barriers.size()), + .pImageMemoryBarriers = barriers.data(), + }); + auto& tile_manager = texture_cache.GetTileManager(); + tile_manager.TileImage(image.image, buffer_copies, buffer.Handle(), buf_offset, image.info); + cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_barrier, + }); return true; } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 2aa67ee42..aecc97db0 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -102,15 +102,14 @@ public: /// Retrieves a utility buffer optimized for specified memory usage. StreamBuffer& GetUtilityBuffer(MemoryUsage usage) noexcept { - switch (usage) { - case MemoryUsage::Stream: + if (usage == MemoryUsage::Stream) { return stream_buffer; - case MemoryUsage::Download: + } else if (usage == MemoryUsage::Download) { return download_buffer; - case MemoryUsage::Upload: - return staging_buffer; - case MemoryUsage::DeviceLocal: + } else if (usage == MemoryUsage::DeviceLocal) { return device_buffer; + } else { + return staging_buffer; } } @@ -200,7 +199,7 @@ private: template void ChangeRegister(BufferId buffer_id); - void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, + bool SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer); vk::Buffer UploadCopies(Buffer& buffer, std::span copies, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index e88147eb5..486bc51dc 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -16,6 +16,7 @@ set(SHADER_FILES fs_tri.vert fsr.comp 
post_process.frag + tiling.comp ) set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) diff --git a/src/video_core/host_shaders/tiling.comp b/src/video_core/host_shaders/tiling.comp new file mode 100644 index 000000000..14bb21547 --- /dev/null +++ b/src/video_core/host_shaders/tiling.comp @@ -0,0 +1,444 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 core + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +// #define BITS_PER_PIXEL +// #define NUM_SAMPLES +// #define MICRO_TILE_MODE +// #define ARRAY_MODE +// #define MICRO_TILE_THICKNESS +// #define PIPE_CONFIG +// #define BANK_WIDTH +// #define BANK_HEIGHT +// #define NUM_BANKS +// #define NUM_BANK_BITS +// #define TILE_SPLIT_BYTES +// #define MACRO_TILE_ASPECT + +#define BYTES_PER_PIXEL (BITS_PER_PIXEL / 8) + +#if BITS_PER_PIXEL == 8 +#define BLOCK_TYPE uint8_t +#elif BITS_PER_PIXEL == 16 +#define BLOCK_TYPE uint16_t +#elif BITS_PER_PIXEL == 32 +#define BLOCK_TYPE uint32_t +#elif BITS_PER_PIXEL == 64 +#define BLOCK_TYPE u32vec2 +#elif BITS_PER_PIXEL == 96 +#define BLOCK_TYPE u32vec3 +#else +#define BLOCK_TYPE u32vec4 +#endif + +#if PIPE_CONFIG == ADDR_SURF_P2 +#define NUM_PIPES 2 +#define NUM_PIPE_BITS 1 +#else +#define NUM_PIPES 8 +#define NUM_PIPE_BITS 3 +#endif + +#define MICRO_TILE_WIDTH 8 +#define MICRO_TILE_HEIGHT 8 +#define MICRO_TILE_PIXELS (MICRO_TILE_WIDTH * MICRO_TILE_HEIGHT) +#define MICRO_TILE_BITS (MICRO_TILE_PIXELS * MICRO_TILE_THICKNESS * BITS_PER_PIXEL * NUM_SAMPLES) +#define MICRO_TILE_BYTES (MICRO_TILE_BITS / 8) + +#define NUM_PIPE_INTERLEAVE_BITS 8 + +#define ADDR_SURF_DISPLAY_MICRO_TILING 0 +#define ADDR_SURF_THIN_MICRO_TILING 1 +#define ADDR_SURF_DEPTH_MICRO_TILING 2 +#define ADDR_SURF_ROTATED_MICRO_TILING 3 + +#define 
ARRAY_LINEAR_GENERAL 0 +#define ARRAY_LINEAR_ALIGNED 1 +#define ARRAY_1D_TILED_THIN1 2 +#define ARRAY_1D_TILED_THICK 3 +#define ARRAY_2D_TILED_THIN1 4 +#define ARRAY_PRT_TILED_THIN1 5 +#define ARRAY_PRT_2D_TILED_THIN1 6 +#define ARRAY_2D_TILED_THICK 7 +#define ARRAY_2D_TILED_XTHICK 8 +#define ARRAY_PRT_TILED_THICK 9 +#define ARRAY_PRT_2D_TILED_THICK 10 +#define ARRAY_PRT_3D_TILED_THIN1 11 +#define ARRAY_3D_TILED_THIN1 12 +#define ARRAY_3D_TILED_THICK 13 +#define ARRAY_3D_TILED_XTHICK 14 +#define ARRAY_PRT_3D_TILED_THICK 15 + +#define ADDR_SURF_P2 0 +#define ADDR_SURF_P8_32x32_8x16 10 +#define ADDR_SURF_P8_32x32_16x16 12 + +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(x) (((x) + (BITS_PER_BYTE-1)) / BITS_PER_BYTE) + +#define _BIT(v, b) bitfieldExtract((v), (b), 1) + +struct MipInfo { + uint size; + uint pitch; + uint height; + uint offset; +}; + +layout (set = 0, binding = 0, scalar) buffer InputBuf { + BLOCK_TYPE tiled_data[]; +}; + +layout (set = 0, binding = 1, scalar) buffer OutputBuf { + BLOCK_TYPE linear_data[]; +}; + +layout (set = 0, binding = 2, scalar) uniform TilingInfo { + uint bank_swizzle; + uint num_slices; + uint num_mips; + MipInfo mips[16]; +} info; + +uint32_t ComputePixelIndexWithinMicroTile(uint32_t x, uint32_t y, uint32_t z) { + uint32_t p0 = 0; + uint32_t p1 = 0; + uint32_t p2 = 0; + uint32_t p3 = 0; + uint32_t p4 = 0; + uint32_t p5 = 0; + uint32_t p6 = 0; + uint32_t p7 = 0; + uint32_t p8 = 0; + + uint32_t x0 = _BIT(x, 0); + uint32_t x1 = _BIT(x, 1); + uint32_t x2 = _BIT(x, 2); + uint32_t y0 = _BIT(y, 0); + uint32_t y1 = _BIT(y, 1); + uint32_t y2 = _BIT(y, 2); + uint32_t z0 = _BIT(z, 0); + uint32_t z1 = _BIT(z, 1); + uint32_t z2 = _BIT(z, 2); + +#if MICRO_TILE_MODE == ADDR_SURF_DISPLAY_MICRO_TILING + #if BITS_PER_PIXEL == 8 + p0 = x0; + p1 = x1; + p2 = x2; + p3 = y1; + p4 = y0; + p5 = y2; + #elif BITS_PER_PIXEL == 16 + p0 = x0; + p1 = x1; + p2 = x2; + p3 = y0; + p4 = y1; + p5 = y2; + #elif BITS_PER_PIXEL == 32 + p0 = x0; + p1 = x1; + p2 = 
y0; + p3 = x2; + p4 = y1; + p5 = y2; + #elif BITS_PER_PIXEL == 64 + p0 = x0; + p1 = y0; + p2 = x1; + p3 = x2; + p4 = y1; + p5 = y2; + #elif BITS_PER_PIXEL == 128 + p0 = y0; + p1 = x0; + p2 = x1; + p3 = x2; + p4 = y1; + p5 = y2; + #endif +#elif MICRO_TILE_MODE == ADDR_SURF_THIN_MICRO_TILING || MICRO_TILE_MODE == ADDR_SURF_DEPTH_MICRO_TILING + p0 = x0; + p1 = y0; + p2 = x1; + p3 = y1; + p4 = x2; + p5 = y2; +#else + #if BITS_PER_PIXEL == 8 || BITS_PER_PIXEL == 16 + p0 = x0; + p1 = y0; + p2 = x1; + p3 = y1; + p4 = z0; + p5 = z1; + #elif BITS_PER_PIXEL == 32 + p0 = x0; + p1 = y0; + p2 = x1; + p3 = z0; + p4 = y1; + p5 = z1; + #elif BITS_PER_PIXEL == 64 || BITS_PER_PIXEL == 128 + p0 = x0; + p1 = y0; + p2 = z0; + p3 = x1; + p4 = y1; + p5 = z1; + #endif + p6 = x2; + p7 = y2; + + #if MICRO_TILE_THICKNESS == 8 + p8 = z2; + #endif +#endif + + uint32_t pixel_number = + ((p0) | (p1 << 1) | (p2 << 2) | (p3 << 3) | (p4 << 4) | + (p5 << 5) | (p6 << 6) | (p7 << 7) | (p8 << 8)); + + return pixel_number; +} + +#if ARRAY_MODE == ARRAY_1D_TILED_THIN1 || ARRAY_MODE == ARRAY_1D_TILED_THICK +uint32_t ComputeSurfaceAddrFromCoordMicroTiled(uint32_t x, uint32_t y, uint32_t slice, uint32_t pitch, uint32_t height, uint32_t sample_index) { + uint32_t slice_bytes = BITS_TO_BYTES(pitch * height * MICRO_TILE_THICKNESS * BITS_PER_PIXEL * NUM_SAMPLES); + + uint32_t micro_tiles_per_row = pitch / MICRO_TILE_WIDTH; + uint32_t micro_tile_index_x = x / MICRO_TILE_WIDTH; + uint32_t micro_tile_index_y = y / MICRO_TILE_HEIGHT; + uint32_t micro_tile_index_z = slice / MICRO_TILE_THICKNESS; + + uint32_t slice_offset = micro_tile_index_z * slice_bytes; + uint32_t micro_tile_offset = (micro_tile_index_y * micro_tiles_per_row + micro_tile_index_x) * MICRO_TILE_BYTES; + + uint32_t pixel_index = ComputePixelIndexWithinMicroTile(x, y, slice); + + uint32_t sample_offset; + uint32_t pixel_offset; +#if MICRO_TILE_MODE == ADDR_SURF_DEPTH_MICRO_TILING + sample_offset = sample_index * BITS_PER_PIXEL; + pixel_offset = 
pixel_index * BITS_PER_PIXEL * NUM_SAMPLES; +#else + sample_offset = sample_index * (MICRO_TILE_BYTES * 8 / NUM_SAMPLES); + pixel_offset = pixel_index * BITS_PER_PIXEL; +#endif + + uint32_t elem_offset = (sample_offset + pixel_offset) / 8; + return slice_offset + micro_tile_offset + elem_offset; +} +#else +uint32_t ComputePipeFromCoord(uint32_t x, uint32_t y, uint32_t slice) { + uint32_t p0 = 0; + uint32_t p1 = 0; + uint32_t p2 = 0; + + uint32_t tx = x / MICRO_TILE_WIDTH; + uint32_t ty = y / MICRO_TILE_HEIGHT; + uint32_t x3 = _BIT(tx, 0); + uint32_t x4 = _BIT(tx, 1); + uint32_t x5 = _BIT(tx, 2); + uint32_t y3 = _BIT(ty, 0); + uint32_t y4 = _BIT(ty, 1); + uint32_t y5 = _BIT(ty, 2); + +#if PIPE_CONFIG == ADDR_SURF_P2 + p0 = x3 ^ y3; +#elif PIPE_CONFIG == ADDR_SURF_P8_32x32_8x16 + p0 = x4 ^ y3 ^ x5; + p1 = x3 ^ y4; + p2 = x5 ^ y5; +#elif PIPE_CONFIG == ADDR_SURF_P8_32x32_16x16 + p0 = x3 ^ y3 ^ x4; + p1 = x4 ^ y4; + p2 = x5 ^ y5; +#endif + + uint32_t pipe = p0 | (p1 << 1) | (p2 << 2); + + uint32_t pipe_swizzle = 0; +#if ARRAY_MODE == ARRAY_3D_TILED_THIN1 || ARRAY_MODE == ARRAY_3D_TILED_THICK || ARRAY_MODE == ARRAY_3D_TILED_XTHICK + pipe_swizzle += max(1, NUM_PIPES / 2 - 1) * (slice / MICRO_TILE_THICKNESS); +#endif + pipe_swizzle &= (NUM_PIPES - 1); + pipe = pipe ^ pipe_swizzle; + return pipe; +} + +uint32_t ComputeBankFromCoord(uint32_t x, uint32_t y, uint32_t slice, uint32_t tile_split_slice) { + uint32_t b0 = 0; + uint32_t b1 = 0; + uint32_t b2 = 0; + uint32_t b3 = 0; + uint32_t slice_rotation = 0; + uint32_t tile_split_rotation = 0; + + uint32_t tx = x / MICRO_TILE_WIDTH / (BANK_WIDTH * NUM_PIPES); + uint32_t ty = y / MICRO_TILE_HEIGHT / BANK_HEIGHT; + + uint32_t x3 = _BIT(tx, 0); + uint32_t x4 = _BIT(tx, 1); + uint32_t x5 = _BIT(tx, 2); + uint32_t x6 = _BIT(tx, 3); + uint32_t y3 = _BIT(ty, 0); + uint32_t y4 = _BIT(ty, 1); + uint32_t y5 = _BIT(ty, 2); + uint32_t y6 = _BIT(ty, 3); + +#if NUM_BANKS == 16 + b0 = x3 ^ y6; + b1 = x4 ^ y5 ^ y6; + b2 = x5 ^ y4; + b3 = x6 ^ 
y3; +#elif NUM_BANKS == 8 + b0 = x3 ^ y5; + b1 = x4 ^ y4 ^ y5; + b2 = x5 ^ y3; +#elif NUM_BANKS == 4 + b0 = x3 ^ y4; + b1 = x4 ^ y3; +#elif NUM_BANKS == 2 + b0 = x3 ^ y3; +#endif + + uint32_t bank = b0 | (b1 << 1) | (b2 << 2) | (b3 << 3); + +#if ARRAY_MODE == ARRAY_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_2D_TILED_THICK || ARRAY_MODE == ARRAY_2D_TILED_XTHICK + slice_rotation = ((NUM_BANKS / 2) - 1) * (slice / MICRO_TILE_THICKNESS); +#elif ARRAY_MODE == ARRAY_3D_TILED_THIN1 || ARRAY_MODE == ARRAY_3D_TILED_THICK || ARRAY_MODE == ARRAY_3D_TILED_XTHICK + slice_rotation = max(1u, (NUM_PIPES / 2) - 1) * (slice / MICRO_TILE_THICKNESS) / NUM_PIPES; +#endif + +#if ARRAY_MODE == ARRAY_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_3D_TILED_THIN1 || \ + ARRAY_MODE == ARRAY_PRT_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_3D_TILED_THIN1 + tile_split_rotation = ((NUM_BANKS / 2) + 1) * tile_split_slice; +#endif + + bank ^= info.bank_swizzle + slice_rotation; + bank ^= tile_split_rotation; + bank &= (NUM_BANKS - 1); + + return bank; +} + +uint32_t ComputeSurfaceAddrFromCoordMacroTiled(uint32_t x, uint32_t y, uint32_t slice, uint32_t pitch, uint32_t height, uint32_t sample_index) { + uint32_t pixel_index = ComputePixelIndexWithinMicroTile(x, y, slice); + + uint32_t sample_offset; + uint32_t pixel_offset; +#if MICRO_TILE_MODE == ADDR_SURF_DEPTH_MICRO_TILING + sample_offset = sample_index * BITS_PER_PIXEL; + pixel_offset = pixel_index * BITS_PER_PIXEL * NUM_SAMPLES; +#else + sample_offset = sample_index * (MICRO_TILE_BITS / NUM_SAMPLES); + pixel_offset = pixel_index * BITS_PER_PIXEL; +#endif + + uint32_t element_offset = (pixel_offset + sample_offset) / 8; + + uint32_t slices_per_tile = 1; + uint32_t tile_split_slice = 0; +#if MICRO_TILE_BYTES > TILE_SPLIT_BYTES && MICRO_TILE_THICKNESS == 1 + slices_per_tile = MICRO_TILE_BYTES / TILE_SPLIT_BYTES; + tile_split_slice = element_offset / TILE_SPLIT_BYTES; + element_offset %= TILE_SPLIT_BYTES; + #undef MICRO_TILE_BYTES + #define MICRO_TILE_BYTES 
TILE_SPLIT_BYTES +#endif + + uint32_t macro_tile_pitch = (MICRO_TILE_WIDTH * BANK_WIDTH * NUM_PIPES) * MACRO_TILE_ASPECT; + uint32_t macro_tile_height = (MICRO_TILE_HEIGHT * BANK_HEIGHT * NUM_BANKS) / MACRO_TILE_ASPECT; + + uint32_t macro_tile_bytes = MICRO_TILE_BYTES * + (macro_tile_pitch / MICRO_TILE_WIDTH) * + (macro_tile_height / MICRO_TILE_HEIGHT) / (NUM_PIPES * NUM_BANKS); + + uint32_t macro_tiles_per_row = pitch / macro_tile_pitch; + + uint32_t macro_tile_index_x = x / macro_tile_pitch; + uint32_t macro_tile_index_y = y / macro_tile_height; + uint32_t macro_tile_offset = + ((macro_tile_index_y * macro_tiles_per_row) + macro_tile_index_x) * macro_tile_bytes; + uint32_t macro_tiles_per_slice = macro_tiles_per_row * (height / macro_tile_height); + + uint32_t slice_bytes = macro_tiles_per_slice * macro_tile_bytes; + uint32_t slice_offset = + slice_bytes * (tile_split_slice + slices_per_tile * (slice / MICRO_TILE_THICKNESS)); + + uint32_t tile_row_index = (y / MICRO_TILE_HEIGHT) % BANK_HEIGHT; + uint32_t tile_column_index = ((x / MICRO_TILE_WIDTH) / NUM_PIPES) % BANK_WIDTH; + uint32_t tile_index = (tile_row_index * BANK_WIDTH) + tile_column_index; + uint32_t tile_offset = tile_index * MICRO_TILE_BYTES; + + uint32_t total_offset = slice_offset + macro_tile_offset + element_offset + tile_offset; + +#if ARRAY_MODE == ARRAY_PRT_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_TILED_THICK || \ + ARRAY_MODE == ARRAY_PRT_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_2D_TILED_THICK || \ + ARRAY_MODE == ARRAY_PRT_3D_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_3D_TILED_THICK + x %= macro_tile_pitch; + y %= macro_tile_height; +#endif + + uint32_t pipe = ComputePipeFromCoord(x, y, slice); + uint32_t bank = ComputeBankFromCoord(x, y, slice, tile_split_slice); + + uint32_t pipe_interleave_mask = (1 << NUM_PIPE_INTERLEAVE_BITS) - 1; + uint32_t pipe_interleave_offset = total_offset & pipe_interleave_mask; + uint32_t offset = total_offset >> NUM_PIPE_INTERLEAVE_BITS; + + uint32_t addr = 
pipe_interleave_offset; + uint32_t pipe_bits = pipe << NUM_PIPE_INTERLEAVE_BITS; + uint32_t bank_bits = bank << (NUM_PIPE_INTERLEAVE_BITS + NUM_PIPE_BITS); + uint32_t offset_bits = offset << (NUM_PIPE_INTERLEAVE_BITS + NUM_PIPE_BITS + NUM_BANK_BITS); + + addr |= pipe_bits; + addr |= bank_bits; + addr |= offset_bits; + + return addr; +} +#endif + +uint GetMipLevel(inout uint texel) { + uint mip = 0; + uint mip_size = info.mips[mip].size / BYTES_PER_PIXEL; + while (texel >= mip_size && mip < info.num_mips) { + texel -= mip_size; + ++mip; + mip_size = info.mips[mip].size / BYTES_PER_PIXEL; + } + return mip; +} + +void main() { + uint texel = gl_GlobalInvocationID.x; + uint mip = GetMipLevel(texel); + uint pitch = info.mips[mip].pitch; + uint height = info.mips[mip].height; + uint tiled_offset = info.mips[mip].offset; + uint x = texel % pitch; + uint y = (texel / pitch) % height; + uint slice = texel / (pitch * height); + +#if ARRAY_MODE == ARRAY_1D_TILED_THIN1 || ARRAY_MODE == ARRAY_1D_TILED_THICK + tiled_offset += ComputeSurfaceAddrFromCoordMicroTiled(x, y, slice, pitch, height, 0); +#else + tiled_offset += ComputeSurfaceAddrFromCoordMacroTiled(x, y, slice, pitch, height, 0); +#endif + +#ifdef IS_TILER + tiled_data[tiled_offset / BYTES_PER_PIXEL] = linear_data[gl_GlobalInvocationID.x]; +#else + linear_data[gl_GlobalInvocationID.x] = tiled_data[tiled_offset / BYTES_PER_PIXEL]; +#endif +} diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 03c13a4cb..e4e628c69 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -253,6 +253,7 @@ bool Instance::CreateDevice() { ASSERT(add_extension(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME)); // Optional + maintenance_8 = add_extension(VK_KHR_MAINTENANCE_8_EXTENSION_NAME); depth_range_unrestricted = add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); dynamic_state_3 = 
add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); if (dynamic_state_3) { @@ -459,6 +460,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceVertexAttributeDivisorFeatures{ .vertexAttributeInstanceRateDivisor = true, }, + vk::PhysicalDeviceMaintenance8FeaturesKHR{ + .maintenance8 = true, + }, vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{ .shaderBufferFloat32AtomicMinMax = shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax, @@ -527,6 +531,9 @@ bool Instance::CreateDevice() { if (!provoking_vertex) { device_chain.unlink(); } + if (!maintenance_8) { + device_chain.unlink(); + } if (!shader_atomic_float2) { device_chain.unlink(); } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index c34c12589..be316f6e8 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -109,6 +109,11 @@ public: return vk12_features.shaderInt8; } + /// Returns true if VK_KHR_maintenance8 is supported + bool IsMaintenance8Supported() const { + return maintenance_8; + } + /// Returns true when VK_EXT_custom_border_color is supported bool IsCustomBorderColorSupported() const { return custom_border_color; @@ -469,6 +474,7 @@ private: bool shader_atomic_float2{}; bool workgroup_memory_explicit_layout{}; bool portability_subset{}; + bool maintenance_8{}; bool supports_memory_budget{}; u64 total_memory_budget{}; std::vector valid_heaps; diff --git a/src/video_core/renderer_vulkan/vk_presenter.h b/src/video_core/renderer_vulkan/vk_presenter.h index 8ed2052ee..ea933b21c 100644 --- a/src/video_core/renderer_vulkan/vk_presenter.h +++ b/src/video_core/renderer_vulkan/vk_presenter.h @@ -5,6 +5,7 @@ #include +#include "core/libraries/videoout/buffer.h" #include "imgui/imgui_config.h" #include "imgui/imgui_texture.h" #include "video_core/amdgpu/liverpool.h" diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp 
index fa84a6b42..ec0c38bda 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -457,7 +457,7 @@ void Rasterizer::OnSubmit() { } bool Rasterizer::BindResources(const Pipeline* pipeline) { - if (IsComputeMetaClear(pipeline)) { + if (IsComputeImageCopy(pipeline) || IsComputeMetaClear(pipeline)) { return false; } @@ -523,22 +523,82 @@ bool Rasterizer::IsComputeMetaClear(const Pipeline* pipeline) { // If a shader wants to encode HTILE, for example, from a depth image it will have to compute // proper tile address from dispatch invocation id. This address calculation contains an xor // operation so use it as a heuristic for metadata writes that are probably not clears. - if (info.has_bitwise_xor) { - return false; - } - - // Assume if a shader writes metadata without address calculation, it is a clear shader. - for (const auto& desc : info.buffers) { - const VAddr address = desc.GetSharp(info).base_address; - if (!desc.IsSpecial() && desc.is_written && texture_cache.ClearMeta(address)) { - // Assume all slices were updates - LOG_TRACE(Render_Vulkan, "Metadata update skipped"); - return true; + if (!info.has_bitwise_xor) { + // Assume if a shader writes metadata without address calculation, it is a clear shader. 
+ for (const auto& desc : info.buffers) { + const VAddr address = desc.GetSharp(info).base_address; + if (!desc.IsSpecial() && desc.is_written && texture_cache.ClearMeta(address)) { + // Assume all slices were updates + LOG_TRACE(Render_Vulkan, "Metadata update skipped"); + return true; + } } } return false; } +bool Rasterizer::IsComputeImageCopy(const Pipeline* pipeline) { + if (!pipeline->IsCompute()) { + return false; + } + + // Ensure shader only has 2 bound buffers + const auto& cs_pgm = liverpool->GetCsRegs(); + const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); + if (cs_pgm.num_thread_x.full != 64 || info.buffers.size() != 2 || !info.images.empty()) { + return false; + } + + // Those 2 buffers must both be formatted. One must be source and another destination. + const auto& desc0 = info.buffers[0]; + const auto& desc1 = info.buffers[1]; + if (!desc0.is_formatted || !desc1.is_formatted || desc0.is_written == desc1.is_written) { + return false; + } + + // Buffers must have the same size and each thread of the dispatch must copy 1 dword of data + const AmdGpu::Buffer buf0 = desc0.GetSharp(info); + const AmdGpu::Buffer buf1 = desc1.GetSharp(info); + if (buf0.GetSize() != buf1.GetSize() || cs_pgm.dim_x != (buf0.GetSize() / 256)) { + return false; + } + + // Find images the buffer alias + const auto image0_id = texture_cache.FindImageFromRange(buf0.base_address, buf0.GetSize()); + if (!image0_id) { + return false; + } + const auto image1_id = + texture_cache.FindImageFromRange(buf1.base_address, buf1.GetSize(), false); + if (!image1_id) { + return false; + } + + // Image copy must be valid + VideoCore::Image& image0 = texture_cache.GetImage(image0_id); + VideoCore::Image& image1 = texture_cache.GetImage(image1_id); + if (image0.info.guest_size != image1.info.guest_size || + image0.info.pitch != image1.info.pitch || image0.info.guest_size != buf0.GetSize() || + image0.info.num_bits != image1.info.num_bits) { + return false; + } + + // Perform 
image copy + VideoCore::Image& src_image = desc0.is_written ? image1 : image0; + VideoCore::Image& dst_image = desc0.is_written ? image0 : image1; + if (instance.IsMaintenance8Supported() || + src_image.info.props.is_depth == dst_image.info.props.is_depth) { + dst_image.CopyImage(src_image); + } else { + const auto& copy_buffer = + buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::DeviceLocal); + dst_image.CopyImageWithBuffer(src_image, copy_buffer.Handle(), 0); + } + dst_image.flags |= VideoCore::ImageFlagBits::GpuModified; + dst_image.flags &= ~VideoCore::ImageFlagBits::Dirty; + return true; +} + void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding, Shader::PushData& push_data) { buffer_bindings.clear(); @@ -687,7 +747,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin if (image.binding.force_general || image.binding.is_target) { image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | - (image.info.IsDepthStencil() + (image.info.props.is_depth ? vk::AccessFlagBits2::eDepthStencilAttachmentWrite : vk::AccessFlagBits2::eColorAttachmentWrite), {}); @@ -698,7 +758,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin vk::AccessFlagBits2::eShaderWrite, desc.view_info.range); } else { - const auto new_layout = image.info.IsDepthStencil() + const auto new_layout = image.info.props.is_depth ? 
vk::ImageLayout::eDepthStencilReadOnlyOptimal : vk::ImageLayout::eShaderReadOnlyOptimal; image.Transit(new_layout, vk::AccessFlagBits2::eShaderRead, @@ -823,10 +883,8 @@ void Rasterizer::Resolve() { mrt0_hint}; VideoCore::TextureCache::RenderTargetDesc mrt1_desc{liverpool->regs.color_buffers[1], mrt1_hint}; - auto& mrt0_image = - texture_cache.GetImage(texture_cache.FindImage(mrt0_desc, VideoCore::FindFlags::ExactFmt)); - auto& mrt1_image = - texture_cache.GetImage(texture_cache.FindImage(mrt1_desc, VideoCore::FindFlags::ExactFmt)); + auto& mrt0_image = texture_cache.GetImage(texture_cache.FindImage(mrt0_desc, true)); + auto& mrt1_image = texture_cache.GetImage(texture_cache.FindImage(mrt1_desc, true)); VideoCore::SubresourceRange mrt0_range; mrt0_range.base.layer = liverpool->regs.color_buffers[0].view.slice_start; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a1d59021b..65de62bb4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -112,6 +112,7 @@ private: } bool IsComputeMetaClear(const Pipeline* pipeline); + bool IsComputeImageCopy(const Pipeline* pipeline); private: friend class VideoCore::BufferCache; diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 4ab2e991c..c2a8478ca 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -18,10 +18,10 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; - if (info.IsDepthStencil()) { + if (info.props.is_depth) { usage |= vk::ImageUsageFlagBits::eDepthStencilAttachment; } else { - if (!info.IsBlockCoded()) { + if (!info.props.is_block) { usage |= vk::ImageUsageFlagBits::eColorAttachment; } // In cases where an image is created as a render/depth 
target and cleared with compute, @@ -35,6 +35,22 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { return usage; } +static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { + switch (type) { + case AmdGpu::ImageType::Color1D: + case AmdGpu::ImageType::Color1DArray: + return vk::ImageType::e1D; + case AmdGpu::ImageType::Color2D: + case AmdGpu::ImageType::Color2DMsaa: + case AmdGpu::ImageType::Color2DArray: + return vk::ImageType::e2D; + case AmdGpu::ImageType::Color3D: + return vk::ImageType::e3D; + default: + UNREACHABLE(); + } +} + static vk::FormatFeatureFlags2 FormatFeatureFlags(const vk::ImageUsageFlags usage_flags) { vk::FormatFeatureFlags2 feature_flags{}; if (usage_flags & vk::ImageUsageFlagBits::eTransferSrc) { @@ -132,7 +148,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, const auto supported_format = instance->GetSupportedFormat(info.pixel_format, format_features); const vk::PhysicalDeviceImageFormatInfo2 format_info{ .format = supported_format, - .type = info.type, + .type = ConvertImageType(info.type), .tiling = tiling, .usage = usage_flags, .flags = flags, @@ -141,7 +157,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, instance->GetPhysicalDevice().getImageFormatProperties2(format_info); if (image_format_properties.result == vk::Result::eErrorFormatNotSupported) { LOG_ERROR(Render_Vulkan, "image format {} type {} is not supported (flags {}, usage {})", - vk::to_string(supported_format), vk::to_string(info.type), + vk::to_string(supported_format), vk::to_string(format_info.type), vk::to_string(format_info.flags), vk::to_string(format_info.usage)); } const auto supported_samples = @@ -151,7 +167,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, const vk::ImageCreateInfo image_ci = { .flags = flags, - .imageType = info.type, + .imageType = ConvertImageType(info.type), .format = supported_format, .extent{ .width = 
info.size.width, @@ -168,9 +184,9 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, image.Create(image_ci); - Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {}x{}x{} {:#x}:{:#x}", - info.size.width, info.size.height, info.size.depth, info.guest_address, - info.guest_size); + Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {}x{}x{} {} {:#x}:{:#x}", + info.size.width, info.size.height, info.size.depth, + AmdGpu::NameOf(info.tile_mode), info.guest_address, info.guest_size); } boost::container::small_vector Image::GetBarriers( @@ -325,38 +341,41 @@ void Image::Upload(vk::Buffer buffer, u64 offset) { vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); } -void Image::CopyImage(const Image& src_image) { - scheduler->EndRendering(); - Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); - - auto cmdbuf = scheduler->CommandBuffer(); +void Image::CopyImage(Image& src_image) { const auto& src_info = src_image.info; - - boost::container::small_vector image_copy{}; const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels); - for (u32 m = 0; m < num_mips; ++m) { - const auto mip_w = std::max(src_info.size.width >> m, 1u); - const auto mip_h = std::max(src_info.size.height >> m, 1u); - const auto mip_d = std::max(src_info.size.depth >> m, 1u); + ASSERT(src_info.resources.layers == info.resources.layers || num_mips == 1); - image_copy.emplace_back(vk::ImageCopy{ + boost::container::small_vector image_copies; + for (u32 mip = 0; mip < num_mips; ++mip) { + const auto mip_w = std::max(src_info.size.width >> mip, 1u); + const auto mip_h = std::max(src_info.size.height >> mip, 1u); + const auto mip_d = std::max(src_info.size.depth >> mip, 1u); + + image_copies.emplace_back(vk::ImageCopy{ .srcSubresource{ - .aspectMask = src_image.aspect_mask, - .mipLevel = m, + .aspectMask = src_image.aspect_mask & 
~vk::ImageAspectFlagBits::eStencil, + .mipLevel = mip, .baseArrayLayer = 0, .layerCount = src_info.resources.layers, }, .dstSubresource{ - .aspectMask = src_image.aspect_mask, - .mipLevel = m, + .aspectMask = aspect_mask & ~vk::ImageAspectFlagBits::eStencil, + .mipLevel = mip, .baseArrayLayer = 0, .layerCount = src_info.resources.layers, }, .extent = {mip_w, mip_h, mip_d}, }); } + + scheduler->EndRendering(); + src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); + + auto cmdbuf = scheduler->CommandBuffer(); cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, - image_copy); + image_copies); Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); @@ -364,32 +383,29 @@ void Image::CopyImage(const Image& src_image) { void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) { const auto& src_info = src_image.info; + const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels); + ASSERT(src_info.resources.layers == info.resources.layers || num_mips == 1); - vk::BufferImageCopy buffer_image_copy = { - .bufferOffset = offset, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource = - { - .aspectMask = src_info.IsDepthStencil() ? 
vk::ImageAspectFlagBits::eDepth - : vk::ImageAspectFlagBits::eColor, - .mipLevel = 0, + boost::container::small_vector buffer_copies; + for (u32 mip = 0; mip < num_mips; ++mip) { + const auto mip_w = std::max(src_info.size.width >> mip, 1u); + const auto mip_h = std::max(src_info.size.height >> mip, 1u); + const auto mip_d = std::max(src_info.size.depth >> mip, 1u); + + buffer_copies.emplace_back(vk::BufferImageCopy{ + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource{ + .aspectMask = src_image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil, + .mipLevel = mip, .baseArrayLayer = 0, - .layerCount = 1, + .layerCount = src_info.resources.layers, }, - .imageOffset = - { - .x = 0, - .y = 0, - .z = 0, - }, - .imageExtent = - { - .width = src_info.size.width, - .height = src_info.size.height, - .depth = src_info.size.depth, - }, - }; + .imageOffset = {0, 0, 0}, + .imageExtent = {mip_w, mip_h, mip_d}, + }); + } const vk::BufferMemoryBarrier2 pre_copy_barrier = { .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, @@ -416,7 +432,6 @@ void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); auto cmdbuf = scheduler->CommandBuffer(); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -424,7 +439,7 @@ void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) }); cmdbuf.copyImageToBuffer(src_image.image, vk::ImageLayout::eTransferSrcOptimal, buffer, - buffer_image_copy); + buffer_copies); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, @@ -432,11 +447,11 @@ void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) .pBufferMemoryBarriers = &post_copy_barrier, }); - buffer_image_copy.imageSubresource.aspectMask = - info.IsDepthStencil() ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor; + for (auto& copy : buffer_copies) { + copy.imageSubresource.aspectMask = aspect_mask & ~vk::ImageAspectFlagBits::eStencil; + } - cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, - buffer_image_copy); + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, buffer_copies); } void Image::CopyMip(const Image& src_image, u32 mip, u32 slice) { diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index b791b548b..c30edad79 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -103,7 +103,7 @@ struct Image { std::optional range, vk::CommandBuffer cmdbuf = {}); void Upload(vk::Buffer buffer, u64 offset); - void CopyImage(const Image& src_image); + void CopyImage(Image& src_image); void CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset); void CopyMip(const Image& src_image, u32 mip, u32 slice); diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index ed10a20bf..0e1f10bfe 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -2,12 +2,15 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" -#include "common/config.h" #include "core/libraries/kernel/process.h" +#include "core/libraries/videoout/buffer.h" +#include "shader_recompiler/info.h" #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/texture_cache/image_info.h" #include "video_core/texture_cache/tile.h" +#include + namespace VideoCore { using namespace Vulkan; @@ -32,30 +35,15 @@ static vk::Format ConvertPixelFormat(const VideoOutFormat format) { return {}; } -static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { - switch (type) { - case AmdGpu::ImageType::Color1D: - case AmdGpu::ImageType::Color1DArray: - return vk::ImageType::e1D; - 
case AmdGpu::ImageType::Color2D: - case AmdGpu::ImageType::Color2DMsaa: - case AmdGpu::ImageType::Color2DArray: - return vk::ImageType::e2D; - case AmdGpu::ImageType::Color3D: - return vk::ImageType::e3D; - default: - UNREACHABLE(); - } -} - ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, VAddr cpu_address) noexcept { const auto& attrib = group.attrib; props.is_tiled = attrib.tiling_mode == TilingMode::Tile; - tiling_mode = props.is_tiled ? AmdGpu::TilingMode::Display_MacroTiled - : AmdGpu::TilingMode::Display_Linear; + tile_mode = + props.is_tiled ? AmdGpu::TileMode::Display2DThin : AmdGpu::TileMode::DisplayLinearAligned; + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = ConvertPixelFormat(attrib.pixel_format); - type = vk::ImageType::e2D; + type = AmdGpu::ImageType::Color2D; size.width = attrib.width; size.height = attrib.height; pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) & (~127); @@ -63,26 +51,18 @@ ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, ASSERT(num_bits == 32); guest_address = cpu_address; - if (!props.is_tiled) { - guest_size = pitch * size.height * 4; - } else { - if (Libraries::Kernel::sceKernelIsNeoMode()) { - guest_size = pitch * ((size.height + 127) & (~127)) * 4; - } else { - guest_size = pitch * ((size.height + 63) & (~63)) * 4; - } - } - mips_layout.emplace_back(guest_size, pitch, 0); + UpdateSize(); } ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, const AmdGpu::Liverpool::CbDbExtent& hint /*= {}*/) noexcept { props.is_tiled = buffer.IsTiled(); - tiling_mode = buffer.GetTilingMode(); + tile_mode = buffer.GetTileMode(); + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()); num_samples = buffer.NumSamples(); num_bits = NumBitsPerBlock(buffer.GetDataFmt()); - type = vk::ImageType::e2D; + type = AmdGpu::ImageType::Color2D; size.width = 
hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? hint.height : buffer.Height(); size.depth = 1; @@ -94,17 +74,21 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, guest_address = buffer.Address(); const auto color_slice_sz = buffer.GetColorSliceSize(); guest_size = color_slice_sz * buffer.NumSlices(); - mips_layout.emplace_back(color_slice_sz, pitch, 0); - tiling_idx = static_cast(buffer.attrib.tile_mode_index.Value()); + mips_layout.emplace_back(guest_size, pitch, buffer.Height(), 0); alt_tile = Libraries::Kernel::sceKernelIsNeoMode() && buffer.info.alt_tile_mode; } ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slices, VAddr htile_address, const AmdGpu::Liverpool::CbDbExtent& hint, bool write_buffer) noexcept { - props.is_tiled = false; + tile_mode = buffer.GetTileMode(); + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = LiverpoolToVK::DepthFormat(buffer.z_info.format, buffer.stencil_info.format); - type = vk::ImageType::e2D; + type = AmdGpu::ImageType::Color2D; + props.is_tiled = buffer.IsTiled(); + props.is_depth = true; + props.has_stencil = + buffer.stencil_info.format != AmdGpu::Liverpool::DepthBuffer::StencilFormat::Invalid; num_samples = buffer.NumSamples(); num_bits = buffer.NumBits(); size.width = hint.Valid() ? hint.width : buffer.Pitch(); @@ -120,21 +104,22 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slice guest_address = write_buffer ? 
buffer.DepthWriteAddress() : buffer.DepthAddress(); const auto depth_slice_sz = buffer.GetDepthSliceSize(); guest_size = depth_slice_sz * num_slices; - mips_layout.emplace_back(depth_slice_sz, pitch, 0); + mips_layout.emplace_back(guest_size, pitch, buffer.Height(), 0); } ImageInfo::ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& desc) noexcept { - tiling_mode = image.GetTilingMode(); + tile_mode = image.GetTileMode(); + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); - // Override format if image is forced to be a depth target if (desc.is_depth) { pixel_format = LiverpoolToVK::PromoteFormatToDepth(pixel_format); + props.is_depth = true; } - type = ConvertImageType(image.GetType()); + type = image.GetBaseType(); props.is_tiled = image.IsTiled(); - props.is_volume = image.GetType() == AmdGpu::ImageType::Color3D; + props.is_volume = type == AmdGpu::ImageType::Color3D; props.is_pow2 = image.pow2pad; - props.is_block = IsBlockCoded(); + props.is_block = AmdGpu::IsBlockCoded(image.GetDataFmt()); size.width = image.width + 1; size.height = image.height + 1; size.depth = props.is_volume ? 
image.depth + 1 : 1; @@ -143,94 +128,34 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& de resources.layers = image.NumLayers(); num_samples = image.NumSamples(); num_bits = NumBitsPerBlock(image.GetDataFmt()); + bank_swizzle = image.GetBankSwizzle(); guest_address = image.Address(); - mips_layout.reserve(resources.levels); - tiling_idx = image.tiling_index; alt_tile = Libraries::Kernel::sceKernelIsNeoMode() && image.alt_tile_mode; UpdateSize(); } -bool ImageInfo::IsBlockCoded() const { - switch (pixel_format) { - case vk::Format::eBc1RgbaSrgbBlock: - case vk::Format::eBc1RgbaUnormBlock: - case vk::Format::eBc1RgbSrgbBlock: - case vk::Format::eBc1RgbUnormBlock: - case vk::Format::eBc2SrgbBlock: - case vk::Format::eBc2UnormBlock: - case vk::Format::eBc3SrgbBlock: - case vk::Format::eBc3UnormBlock: - case vk::Format::eBc4SnormBlock: - case vk::Format::eBc4UnormBlock: - case vk::Format::eBc5SnormBlock: - case vk::Format::eBc5UnormBlock: - case vk::Format::eBc6HSfloatBlock: - case vk::Format::eBc6HUfloatBlock: - case vk::Format::eBc7SrgbBlock: - case vk::Format::eBc7UnormBlock: - return true; - default: - return false; - } -} - -bool ImageInfo::IsDepthStencil() const { - switch (pixel_format) { - case vk::Format::eD16Unorm: - case vk::Format::eD16UnormS8Uint: - case vk::Format::eD32Sfloat: - case vk::Format::eD32SfloatS8Uint: - return true; - default: - return false; - } -} - -bool ImageInfo::HasStencil() const { - if (pixel_format == vk::Format::eD32SfloatS8Uint || - pixel_format == vk::Format::eD24UnormS8Uint || - pixel_format == vk::Format::eD16UnormS8Uint) { - return true; - } - return false; -} - bool ImageInfo::IsCompatible(const ImageInfo& info) const { return (pixel_format == info.pixel_format && num_samples == info.num_samples && num_bits == info.num_bits); } -bool ImageInfo::IsTilingCompatible(u32 lhs, u32 rhs) const { - if (lhs == rhs) { - return true; - } - if (lhs == 0x0e && rhs == 0x0d) { - return true; - } - if (lhs == 
0x0d && rhs == 0x0e) { - return true; - } - return false; -} - void ImageInfo::UpdateSize() { mips_layout.clear(); MipInfo mip_info{}; guest_size = 0; - for (auto mip = 0u; mip < resources.levels; ++mip) { - auto bpp = num_bits; - auto mip_w = pitch >> mip; - auto mip_h = size.height >> mip; + for (s32 mip = 0; mip < resources.levels; ++mip) { + u32 mip_w = pitch >> mip; + u32 mip_h = size.height >> mip; if (props.is_block) { mip_w = (mip_w + 3) / 4; mip_h = (mip_h + 3) / 4; } mip_w = std::max(mip_w, 1u); mip_h = std::max(mip_h, 1u); - auto mip_d = std::max(size.depth >> mip, 1u); - auto thickness = 1; + u32 mip_d = std::max(size.depth >> mip, 1u); + u32 thickness = 1; if (props.is_pow2) { mip_w = std::bit_ceil(mip_w); @@ -238,35 +163,36 @@ void ImageInfo::UpdateSize() { mip_d = std::bit_ceil(mip_d); } - switch (tiling_mode) { - case AmdGpu::TilingMode::Display_Linear: { - std::tie(mip_info.pitch, mip_info.size) = - ImageSizeLinearAligned(mip_w, mip_h, bpp, num_samples); + switch (array_mode) { + case AmdGpu::ArrayMode::ArrayLinearGeneral: + case AmdGpu::ArrayMode::ArrayLinearAligned: { + std::tie(mip_info.pitch, mip_info.height, mip_info.size) = + ImageSizeLinearAligned(mip_w, mip_h, num_bits, num_samples); break; } - case AmdGpu::TilingMode::Texture_Volume: + case AmdGpu::ArrayMode::Array1DTiledThick: thickness = 4; mip_d += (-mip_d) & (thickness - 1); [[fallthrough]]; - case AmdGpu::TilingMode::Display_MicroTiled: - case AmdGpu::TilingMode::Texture_MicroTiled: { - std::tie(mip_info.pitch, mip_info.size) = - ImageSizeMicroTiled(mip_w, mip_h, thickness, bpp, num_samples); + case AmdGpu::ArrayMode::Array1DTiledThin1: { + std::tie(mip_info.pitch, mip_info.height, mip_info.size) = + ImageSizeMicroTiled(mip_w, mip_h, thickness, num_bits, num_samples); break; } - case AmdGpu::TilingMode::Display_MacroTiled: - case AmdGpu::TilingMode::Texture_MacroTiled: - case AmdGpu::TilingMode::Depth_MacroTiled: { + case AmdGpu::ArrayMode::Array2DTiledThick: + thickness = 4; + mip_d 
+= (-mip_d) & (thickness - 1); + [[fallthrough]]; + case AmdGpu::ArrayMode::Array2DTiledThin1: { ASSERT(!props.is_block); - std::tie(mip_info.pitch, mip_info.size) = ImageSizeMacroTiled( - mip_w, mip_h, thickness, bpp, num_samples, tiling_idx, mip, alt_tile); + std::tie(mip_info.pitch, mip_info.height, mip_info.size) = ImageSizeMacroTiled( + mip_w, mip_h, thickness, num_bits, num_samples, tile_mode, mip, alt_tile); break; } default: { - UNREACHABLE(); + UNREACHABLE_MSG("Unknown array mode {}", magic_enum::enum_name(array_mode)); } } - mip_info.height = mip_h; if (props.is_block) { mip_info.pitch = std::max(mip_info.pitch * 4, 32u); mip_info.height = std::max(mip_info.height * 4, 32u); @@ -283,7 +209,7 @@ s32 ImageInfo::MipOf(const ImageInfo& info) const { return -1; } - if (!IsTilingCompatible(info.tiling_idx, tiling_idx)) { + if (info.array_mode != array_mode) { return -1; } @@ -321,7 +247,7 @@ s32 ImageInfo::MipOf(const ImageInfo& info) const { } const auto mip_d = std::max(info.size.depth >> mip, 1u); - if (info.type == vk::ImageType::e3D && type == vk::ImageType::e2D) { + if (info.type == AmdGpu::ImageType::Color3D && type == AmdGpu::ImageType::Color2D) { // In case of 2D array to 3D copy, make sure we have proper number of layers. 
if (resources.layers != mip_d) { return -1; diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index 9fa3b6c3d..00f56b1c7 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -3,16 +3,36 @@ #pragma once +#include + #include "common/types.h" -#include "core/libraries/videoout/buffer.h" -#include "shader_recompiler/info.h" #include "video_core/amdgpu/liverpool.h" +#include "video_core/renderer_vulkan/vk_common.h" #include "video_core/texture_cache/types.h" -#include +namespace AmdGpu { +enum class ImageType : u64; +} + +namespace Libraries::VideoOut { +struct BufferAttributeGroup; +} + +namespace Shader { +struct ImageResource; +} namespace VideoCore { +struct ImageProperties { + u32 is_volume : 1; + u32 is_tiled : 1; + u32 is_pow2 : 1; + u32 is_block : 1; + u32 is_depth : 1; + u32 has_stencil : 1; +}; + struct ImageInfo { ImageInfo() = default; ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, VAddr cpu_address) noexcept; @@ -23,61 +43,49 @@ struct ImageInfo { ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& desc) noexcept; bool IsTiled() const { - return tiling_mode != AmdGpu::TilingMode::Display_Linear; + return tile_mode != AmdGpu::TileMode::DisplayLinearAligned; } Extent3D BlockDim() const { - const u32 shift = props.is_block ? 2 : 0; - return Extent3D{size.width >> shift, size.height >> shift, size.depth}; + return props.is_block ? 
Extent3D{size.width >> 2, size.height >> 2, size.depth} : size; } - bool IsBlockCoded() const; - bool IsDepthStencil() const; - bool HasStencil() const; - s32 MipOf(const ImageInfo& info) const; s32 SliceOf(const ImageInfo& info, s32 mip) const; bool IsCompatible(const ImageInfo& info) const; - bool IsTilingCompatible(u32 lhs, u32 rhs) const; - void UpdateSize(); struct { VAddr cmask_addr; VAddr fmask_addr; VAddr htile_addr; - u32 htile_clear_mask{u32(-1)}; + u32 htile_clear_mask = u32(-1); } meta_info{}; - struct { - u32 is_volume : 1; - u32 is_tiled : 1; - u32 is_pow2 : 1; - u32 is_block : 1; - } props{}; // Surface properties with impact on various calculation factors - + ImageProperties props{}; vk::Format pixel_format = vk::Format::eUndefined; - vk::ImageType type = vk::ImageType::e2D; + AmdGpu::ImageType type; SubresourceExtent resources; Extent3D size{1, 1, 1}; u32 num_bits{}; u32 num_samples = 1; - u32 pitch = 0; - AmdGpu::TilingMode tiling_mode{AmdGpu::TilingMode::Display_Linear}; + u32 pitch{}; + AmdGpu::TileMode tile_mode = AmdGpu::TileMode::DisplayLinearAligned; + AmdGpu::ArrayMode array_mode = AmdGpu::ArrayMode::ArrayLinearAligned; struct MipInfo { u32 size; u32 pitch; u32 height; u32 offset; }; - boost::container::small_vector mips_layout; - VAddr guest_address{0}; - u32 guest_size{0}; - u32 tiling_idx{0}; // TODO: merge with existing! 
- bool alt_tile{false}; + boost::container::static_vector mips_layout; + VAddr guest_address{}; + u32 guest_size{}; + u8 bank_swizzle{}; + bool alt_tile{}; - VAddr stencil_addr{0}; - u32 stencil_size{0}; + VAddr stencil_addr{}; + u32 stencil_size{}; }; } // namespace VideoCore diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 2e162ce83..1b2bc3ae7 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -29,19 +29,18 @@ vk::ImageViewType ConvertImageViewType(AmdGpu::ImageType type) { } } -bool IsViewTypeCompatible(vk::ImageViewType view_type, vk::ImageType image_type) { +bool IsViewTypeCompatible(AmdGpu::ImageType view_type, AmdGpu::ImageType image_type) { switch (view_type) { - case vk::ImageViewType::e1D: - case vk::ImageViewType::e1DArray: - return image_type == vk::ImageType::e1D; - case vk::ImageViewType::e2D: - case vk::ImageViewType::e2DArray: - return image_type == vk::ImageType::e2D || image_type == vk::ImageType::e3D; - case vk::ImageViewType::eCube: - case vk::ImageViewType::eCubeArray: - return image_type == vk::ImageType::e2D; - case vk::ImageViewType::e3D: - return image_type == vk::ImageType::e3D; + case AmdGpu::ImageType::Color1D: + case AmdGpu::ImageType::Color1DArray: + return image_type == AmdGpu::ImageType::Color1D; + case AmdGpu::ImageType::Color2D: + case AmdGpu::ImageType::Color2DArray: + case AmdGpu::ImageType::Color2DMsaa: + case AmdGpu::ImageType::Color2DMsaaArray: + return image_type == AmdGpu::ImageType::Color2D || image_type == AmdGpu::ImageType::Color3D; + case AmdGpu::ImageType::Color3D: + return image_type == AmdGpu::ImageType::Color3D; default: UNREACHABLE(); } @@ -63,7 +62,7 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, const Shader::ImageReso range.base.layer = image.base_array; range.extent.levels = image.NumViewLevels(desc.is_array); range.extent.layers = image.NumViewLayers(desc.is_array); - type = 
ConvertImageViewType(image.GetViewType(desc.is_array)); + type = image.GetViewType(desc.is_array); if (!is_storage) { mapping = Vulkan::LiverpoolToVK::ComponentMapping(image.DstSelect()); @@ -73,7 +72,7 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, const Shader::ImageReso ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer) noexcept { range.base.layer = col_buffer.view.slice_start; range.extent.layers = col_buffer.NumSlices() - range.base.layer; - type = range.extent.layers > 1 ? vk::ImageViewType::e2DArray : vk::ImageViewType::e2D; + type = range.extent.layers > 1 ? AmdGpu::ImageType::Color2DArray : AmdGpu::ImageType::Color2D; format = Vulkan::LiverpoolToVK::SurfaceFormat(col_buffer.GetDataFmt(), col_buffer.GetNumberFmt()); } @@ -86,7 +85,7 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer, is_storage = ctl.depth_write_enable; range.base.layer = view.slice_start; range.extent.layers = view.NumSlices() - range.base.layer; - type = range.extent.layers > 1 ? vk::ImageViewType::e2DArray : vk::ImageViewType::e2D; + type = range.extent.layers > 1 ? 
AmdGpu::ImageType::Color2DArray : AmdGpu::ImageType::Color2D; } ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info_, Image& image, @@ -113,7 +112,7 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info const vk::ImageViewCreateInfo image_view_ci = { .pNext = &usage_ci, .image = image.image, - .viewType = info.type, + .viewType = ConvertImageViewType(info.type), .format = instance.GetSupportedFormat(format, image.format_features), .components = info.mapping, .subresourceRange{ @@ -124,9 +123,9 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info .layerCount = info.range.extent.layers, }, }; - if (!IsViewTypeCompatible(image_view_ci.viewType, image.info.type)) { + if (!IsViewTypeCompatible(info.type, image.info.type)) { LOG_ERROR(Render_Vulkan, "image view type {} is incompatible with image type {}", - vk::to_string(image_view_ci.viewType), vk::to_string(image.info.type)); + vk::to_string(image_view_ci.viewType), vk::to_string(image_view_ci.viewType)); } auto [view_result, view] = instance.GetDevice().createImageViewUnique(image_view_ci); diff --git a/src/video_core/texture_cache/image_view.h b/src/video_core/texture_cache/image_view.h index 6a17490bf..a0bcd157a 100644 --- a/src/video_core/texture_cache/image_view.h +++ b/src/video_core/texture_cache/image_view.h @@ -23,7 +23,7 @@ struct ImageViewInfo { ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer, AmdGpu::Liverpool::DepthView view, AmdGpu::Liverpool::DepthControl ctl); - vk::ImageViewType type = vk::ImageViewType::e2D; + AmdGpu::ImageType type = AmdGpu::ImageType::Color2D; vk::Format format = vk::Format::eR8G8B8A8Unorm; SubresourceRange range; vk::ComponentMapping mapping{}; @@ -45,9 +45,8 @@ struct ImageView { ImageView(ImageView&&) = default; ImageView& operator=(ImageView&&) = default; - ImageId image_id{}; - Extent3D size{0, 0, 0}; - ImageViewInfo info{}; + ImageId image_id; + ImageViewInfo info; 
vk::UniqueImageView image_view; }; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index fa2029b8f..9f7894f1e 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -1,7 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include #include #include "common/assert.h" @@ -25,7 +24,8 @@ static constexpr u64 NumFramesBeforeRemoval = 32; TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, BufferCache& buffer_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, - blit_helper{instance, scheduler}, tile_manager{instance, scheduler} { + blit_helper{instance, scheduler}, + tile_manager{instance, scheduler, buffer_cache.GetUtilityBuffer(MemoryUsage::Stream)} { // Create basic null image at fixed image ID. const auto null_id = GetNullImage(vk::Format::eR8G8B8A8Unorm); ASSERT(null_id.index == NULL_IMAGE_ID.index); @@ -63,8 +63,8 @@ ImageId TextureCache::GetNullImage(const vk::Format format) { ImageInfo info{}; info.pixel_format = format; - info.type = vk::ImageType::e2D; - info.tiling_idx = static_cast(AmdGpu::TilingMode::Texture_MicroTiled); + info.type = AmdGpu::ImageType::Color2D; + info.tile_mode = AmdGpu::TileMode::Thin1DThin; info.num_bits = 32; info.UpdateSize(); @@ -107,8 +107,8 @@ void TextureCache::DownloadImageMemory(ImageId image_id) { .bufferImageHeight = image.info.size.height, .imageSubresource = { - .aspectMask = image.info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth - : vk::ImageAspectFlagBits::eColor, + .aspectMask = image.info.props.is_depth ? 
vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = image.info.resources.layers, @@ -196,11 +196,12 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi ImageId cache_image_id) { auto& cache_image = slot_images[cache_image_id]; - if (!cache_image.info.IsDepthStencil() && !requested_info.IsDepthStencil()) { + if (!cache_image.info.props.is_depth && !requested_info.props.is_depth) { return {}; } - const bool stencil_match = requested_info.HasStencil() == cache_image.info.HasStencil(); + const bool stencil_match = + requested_info.props.has_stencil == cache_image.info.props.has_stencil; const bool bpp_match = requested_info.num_bits == cache_image.info.num_bits; // If an image in the cache has less slices we need to expand it @@ -210,27 +211,27 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi case BindingType::Texture: // The guest requires a depth sampled texture, but cache can offer only Rxf. Need to // recreate the image. - recreate |= requested_info.IsDepthStencil() && !cache_image.info.IsDepthStencil(); + recreate |= requested_info.props.is_depth && !cache_image.info.props.is_depth; break; case BindingType::Storage: // If the guest is going to use previously created depth as storage, the image needs to be // recreated. (TODO: Probably a case with linear rgba8 aliasing is legit) - recreate |= cache_image.info.IsDepthStencil(); + recreate |= cache_image.info.props.is_depth; break; case BindingType::RenderTarget: // Render target can have only Rxf format. If the cache contains only Dx[S8] we need to // re-create the image. - ASSERT(!requested_info.IsDepthStencil()); - recreate |= cache_image.info.IsDepthStencil(); + ASSERT(!requested_info.props.is_depth); + recreate |= cache_image.info.props.is_depth; break; case BindingType::DepthTarget: // The guest has requested previously allocated texture to be bound as a depth target. 
// In this case we need to convert Rx float to a Dx[S8] as requested - recreate |= !cache_image.info.IsDepthStencil(); + recreate |= !cache_image.info.props.is_depth; // The guest is trying to bind a depth target and cache has it. Need to be sure that aspects // and bpp match - recreate |= cache_image.info.IsDepthStencil() && !(stencil_match && bpp_match); + recreate |= cache_image.info.props.is_depth && !(stencil_match && bpp_match); break; default: break; @@ -251,9 +252,13 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi if (cache_image.info.num_samples == 1 && new_info.num_samples == 1) { // Perform depth<->color copy using the intermediate copy buffer. - const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal); - new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0); - } else if (cache_image.info.num_samples == 1 && new_info.IsDepthStencil() && + if (instance.IsMaintenance8Supported()) { + new_image.CopyImage(cache_image); + } else { + const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal); + new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0); + } + } else if (cache_image.info.num_samples == 1 && new_info.props.is_depth && new_info.num_samples > 1) { // Perform a rendering pass to transfer the channels of source as samples in dest. blit_helper.BlitColorToMsDepth(cache_image, new_image); @@ -294,12 +299,12 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag return {depth_image_id, -1, -1}; } - if (image_info.IsBlockCoded() && !tex_cache_image.info.IsBlockCoded()) { - // Compressed view of uncompressed image with same block size. - // We need to recreate the image with compressed format and copy. + // Compressed view of uncompressed image with same block size. 
+ if (image_info.props.is_block && !tex_cache_image.info.props.is_block) { return {ExpandImage(image_info, cache_image_id), -1, -1}; } + // Size and resources are less than or equal, use image view. if (image_info.pixel_format != tex_cache_image.info.pixel_format || image_info.guest_size <= tex_cache_image.info.guest_size) { auto result_id = merged_image_id ? merged_image_id : cache_image_id; @@ -309,16 +314,15 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag return {is_compatible ? result_id : ImageId{}, -1, -1}; } + // Size and resources are greater, expand the image. if (image_info.type == tex_cache_image.info.type && image_info.resources > tex_cache_image.info.resources) { - // Size and resources are greater, expand the image. return {ExpandImage(image_info, cache_image_id), -1, -1}; } - if (image_info.tiling_mode != tex_cache_image.info.tiling_mode) { - // Size is greater but resources are not, because the tiling mode is different. - // Likely this memory address is being reused for a different image with a different - // tiling mode. + // Size is greater but resources are not, because the tiling mode is different. + // Likely the address is reused for a image with a different tiling mode. + if (image_info.tile_mode != tex_cache_image.info.tile_mode) { if (safe_to_delete) { FreeImage(cache_image_id); } @@ -346,9 +350,9 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag // Left overlap, the image from cache is a possible subresource of the image requested if (auto mip = tex_cache_image.info.MipOf(image_info); mip >= 0) { if (auto slice = tex_cache_image.info.SliceOf(image_info, mip); slice >= 0) { + // We have a larger image created and a separate one, representing a subres of it + // bound as render target. In this case we need to rebind render target. if (tex_cache_image.binding.is_target) { - // We have a larger image created and a separate one, representing a subres of - // it, bound as render target. 
In this case we need to rebind render target. tex_cache_image.binding.needs_rebind = 1u; if (merged_image_id) { GetImage(merged_image_id).binding.is_target = 1u; @@ -385,7 +389,6 @@ ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) { auto& src_image = slot_images[image_id]; auto& new_image = slot_images[new_image_id]; - src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); RefreshImage(new_image); new_image.CopyImage(src_image); @@ -400,7 +403,7 @@ ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) { return new_image_id; } -ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { +ImageId TextureCache::FindImage(BaseDesc& desc, bool exact_fmt) { const auto& info = desc.info; if (info.guest_address == 0) [[unlikely]] { @@ -420,28 +423,22 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { if (cache_image.info.guest_address != info.guest_address) { continue; } - if (False(flags & FindFlags::RelaxSize) && cache_image.info.guest_size != info.guest_size) { + if (cache_image.info.guest_size != info.guest_size) { continue; } - if (False(flags & FindFlags::RelaxDim) && cache_image.info.size != info.size) { + if (cache_image.info.size != info.size) { continue; } - if (False(flags & FindFlags::RelaxFmt) && - (!IsVulkanFormatCompatible(cache_image.info.pixel_format, info.pixel_format) || - (cache_image.info.type != info.type && info.size != Extent3D{1, 1, 1}))) { + if (!IsVulkanFormatCompatible(cache_image.info.pixel_format, info.pixel_format) || + (cache_image.info.type != info.type && info.size != Extent3D{1, 1, 1})) { continue; } - if (True(flags & FindFlags::ExactFmt) && - info.pixel_format != cache_image.info.pixel_format) { + if (exact_fmt && info.pixel_format != cache_image.info.pixel_format) { continue; } image_id = cache_id; } - if (True(flags & FindFlags::NoCreate) && !image_id) { - return {}; - } - // Try to resolve overlaps (if any) int 
view_mip{-1}; int view_slice{-1}; @@ -463,8 +460,7 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { if (image_id) { Image& image_resolved = slot_images[image_id]; - if (True(flags & FindFlags::ExactFmt) && - info.pixel_format != image_resolved.info.pixel_format) { + if (exact_fmt && info.pixel_format != image_resolved.info.pixel_format) { // Cannot reuse this image as we need the exact requested format. image_id = {}; } else if (image_resolved.info.resources < info.resources) { @@ -495,6 +491,37 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { return image_id; } +ImageId TextureCache::FindImageFromRange(VAddr address, size_t size, bool ensure_valid) { + boost::container::small_vector image_ids; + ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) { + if (image.info.guest_address != address) { + return; + } + if (ensure_valid && (False(image.flags & ImageFlagBits::GpuModified) || + True(image.flags & ImageFlagBits::Dirty))) { + return; + } + image_ids.push_back(image_id); + }); + if (image_ids.size() == 1) { + // Sometimes image size might not exactly match with requested buffer size + // If we only found 1 candidate image use it without too many questions. 
+ return image_ids.back(); + } + if (!image_ids.empty()) { + for (s32 i = 0; i < image_ids.size(); ++i) { + Image& image = slot_images[image_ids[i]]; + if (image.info.guest_size == size) { + return image_ids[i]; + } + } + LOG_WARNING(Render_Vulkan, + "Failed to find exact image match for copy addr={:#x}, size={:#x}", address, + size); + } + return {}; +} + ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo& view_info) { Image& image = slot_images[image_id]; if (const ImageViewId view_id = image.FindView(view_info); view_id) { @@ -511,8 +538,7 @@ ImageView& TextureCache::FindTexture(ImageId image_id, const BaseDesc& desc) { Image& image = slot_images[image_id]; if (desc.type == BindingType::Storage) { image.flags |= ImageFlagBits::GpuModified; - if (Config::readbackLinearImages() && - image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + if (Config::readbackLinearImages() && !image.info.props.is_tiled) { download_images.emplace(image_id); } } @@ -524,10 +550,6 @@ ImageView& TextureCache::FindRenderTarget(BaseDesc& desc) { const ImageId image_id = FindImage(desc); Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; - if (Config::readbackLinearImages() && - image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { - download_images.emplace(image_id); - } image.usage.render_target = 1u; UpdateImage(image_id); @@ -552,7 +574,7 @@ ImageView& TextureCache::FindDepthTarget(BaseDesc& desc) { Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; image.usage.depth_target = 1u; - image.usage.stencil = image.info.HasStencil(); + image.usage.stencil = image.info.props.has_stencil; UpdateImage(image_id); // Register meta data for this depth buffer @@ -589,11 +611,7 @@ ImageView& TextureCache::FindDepthTarget(BaseDesc& desc) { } void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_scheduler /*= nullptr*/) { - if (False(image.flags & ImageFlagBits::Dirty)) { - 
return; - } - - if (image.info.num_samples > 1) { + if (False(image.flags & ImageFlagBits::Dirty) || image.info.num_samples > 1) { return; } @@ -644,15 +662,10 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const u32 extent_width = mip_pitch ? std::min(mip_pitch, width) : width; const u32 extent_height = mip_height ? std::min(mip_height, height) : height; - const bool is_volume = image.info.tiling_mode == AmdGpu::TilingMode::Texture_Volume; - const u32 height_aligned = mip_height && image.info.IsTiled() && !is_volume - ? std::max(mip_height, 8U) - : mip_height; - image_copy.push_back({ .bufferOffset = mip_offset, .bufferRowLength = mip_pitch, - .bufferImageHeight = height_aligned, + .bufferImageHeight = mip_height, .imageSubresource{ .aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil, .mipLevel = m, @@ -674,13 +687,10 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const VAddr image_addr = image.info.guest_address; const size_t image_size = image.info.guest_size; - const auto [vk_buffer, buf_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size); - - const auto cmdbuf = sched_ptr->CommandBuffer(); - - // The obtained buffer may be GPU modified so we need to emit a barrier to prevent RAW hazard - if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead, + const auto [in_buffer, in_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size); + if (auto barrier = in_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead, vk::PipelineStageFlagBits2::eTransfer)) { + const auto cmdbuf = sched_ptr->CommandBuffer(); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -689,7 +699,8 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule } const auto [buffer, offset] = - tile_manager.TryDetile(vk_buffer->Handle(), buf_offset, image.info); + 
!custom_scheduler ? tile_manager.DetileImage(in_buffer->Handle(), in_offset, image.info) + : std::make_pair(in_buffer->Handle(), in_offset); for (auto& copy : image_copy) { copy.bufferOffset += offset; } @@ -715,6 +726,7 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const auto image_barriers = image.GetBarriers(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, vk::PipelineStageFlagBits2::eTransfer, {}); + const auto cmdbuf = sched_ptr->CommandBuffer(); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -911,8 +923,8 @@ void TextureCache::RunGarbageCollector() { --num_deletions; auto& image = slot_images[image_id]; const bool download = image.SafeToDownload(); - const bool linear = image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear; - if (!linear && download) { + const bool tiled = image.info.IsTiled(); + if (tiled && download) { // This is a workaround for now. We can't handle non-linear image downloads. return false; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c4f09f6a0..b63a7abf2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -26,17 +26,6 @@ namespace VideoCore { class BufferCache; class PageManager; -enum class FindFlags { - NoCreate = 1 << 0, ///< Do not create an image if searching for one fails. - RelaxDim = 1 << 1, ///< Do not check the dimentions of image, only address. - RelaxSize = 1 << 2, ///< Do not check that the size matches exactly. - RelaxFmt = 1 << 3, ///< Do not check that format is compatible. - ExactFmt = 1 << 4, ///< Require the format to be exactly the same. 
-}; -DECLARE_ENUM_FLAG_OPERATORS(FindFlags) - -static constexpr u32 MaxInvalidateDist = 12_MB; - class TextureCache { // Default values for garbage collection static constexpr s64 DEFAULT_PRESSURE_GC_MEMORY = 1_GB + 512_MB; @@ -103,6 +92,10 @@ public: BufferCache& buffer_cache, PageManager& tracker); ~TextureCache(); + TileManager& GetTileManager() noexcept { + return tile_manager; + } + /// Invalidates any image in the logical page range. void InvalidateMemory(VAddr addr, size_t size); @@ -116,7 +109,10 @@ public: void ProcessDownloadImages(); /// Retrieves the image handle of the image with the provided attributes. - [[nodiscard]] ImageId FindImage(BaseDesc& desc, FindFlags flags = {}); + [[nodiscard]] ImageId FindImage(BaseDesc& desc, bool exact_fmt = false); + + /// Retrieves image whose address matches provided + [[nodiscard]] ImageId FindImageFromRange(VAddr address, size_t size, bool ensure_valid = true); /// Retrieves an image view with the properties of the specified image id. [[nodiscard]] ImageView& FindTexture(ImageId image_id, const BaseDesc& desc); @@ -145,6 +141,7 @@ public: [[nodiscard]] ImageId ResolveDepthOverlap(const ImageInfo& requested_info, BindingType binding, ImageId cache_img_id); + /// Creates a new image with provided image info and copies subresources from image_id [[nodiscard]] ImageId ExpandImage(const ImageInfo& info, ImageId image_id); /// Reuploads image contents. 
diff --git a/src/video_core/texture_cache/tile.h b/src/video_core/texture_cache/tile.h index 54938b801..68c9428fe 100644 --- a/src/video_core/texture_cache/tile.h +++ b/src/video_core/texture_cache/tile.h @@ -6,6 +6,10 @@ #include "common/assert.h" #include "common/types.h" +namespace AmdGpu { +enum class TileMode : u32; +} + namespace VideoCore { // clang-format off @@ -285,17 +289,17 @@ constexpr std::array macro_tile_extents_alt{ constexpr std::pair micro_tile_extent{8u, 8u}; constexpr auto hw_pipe_interleave = 256u; -constexpr std::pair GetMacroTileExtents(u32 tiling_idx, u32 bpp, u32 num_samples, - bool alt) { +constexpr std::pair GetMacroTileExtents(AmdGpu::TileMode tile_mode, u32 bpp, + u32 num_samples, bool alt) { ASSERT(num_samples <= 8); const auto samples_log = static_cast(std::log2(num_samples)); - const auto row = tiling_idx * 5; + const auto row = u32(tile_mode) * 5; const auto column = std::bit_width(bpp) - 4; // bpps are 8, 16, 32, 64, 128 return (alt ? macro_tile_extents_alt : macro_tile_extents)[samples_log][row + column]; } -constexpr std::pair ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp, - u32 num_samples) { +constexpr std::tuple ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp, + u32 num_samples) { const auto pitch_align = std::max(8u, 64u / ((bpp + 7) / 8)); auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = height; @@ -305,11 +309,11 @@ constexpr std::pair ImageSizeLinearAligned(u32 pitch, u32 height, u pitch_aligned += pitch_align; log_sz = pitch_aligned * height_aligned * num_samples; } - return {pitch_aligned, (log_sz * bpp + 7) / 8}; + return {pitch_aligned, height_aligned, (log_sz * bpp + 7) / 8}; } -constexpr std::pair ImageSizeMicroTiled(u32 pitch, u32 height, u32 thickness, u32 bpp, - u32 num_samples) { +constexpr std::tuple ImageSizeMicroTiled(u32 pitch, u32 height, u32 thickness, + u32 bpp, u32 num_samples) { const auto& [pitch_align, height_align] = micro_tile_extent; 
auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = (height + height_align - 1) & ~(height_align - 1); @@ -318,14 +322,14 @@ constexpr std::pair ImageSizeMicroTiled(u32 pitch, u32 height, u32 pitch_aligned += pitch_align; log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8; } - return {pitch_aligned, log_sz}; + return {pitch_aligned, height_aligned, log_sz}; } -constexpr std::pair ImageSizeMacroTiled(u32 pitch, u32 height, u32 thickness, u32 bpp, - u32 num_samples, u32 tiling_idx, u32 mip_n, - bool alt) { - const auto& [pitch_align, height_align] = - GetMacroTileExtents(tiling_idx, bpp, num_samples, alt); +constexpr std::tuple ImageSizeMacroTiled(u32 pitch, u32 height, u32 thickness, + u32 bpp, u32 num_samples, + AmdGpu::TileMode tile_mode, u32 mip_n, + bool alt) { + const auto [pitch_align, height_align] = GetMacroTileExtents(tile_mode, bpp, num_samples, alt); ASSERT(pitch_align != 0 && height_align != 0); bool downgrade_to_micro = false; if (mip_n > 0) { @@ -341,7 +345,7 @@ constexpr std::pair ImageSizeMacroTiled(u32 pitch, u32 height, u32 const auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = (height + height_align - 1) & ~(height_align - 1); const auto log_sz = pitch_aligned * height_aligned * num_samples; - return {pitch_aligned, (log_sz * bpp + 7) / 8}; + return {pitch_aligned, height_aligned, (log_sz * bpp + 7) / 8}; } } // namespace VideoCore diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index dd6fae457..d872f8b2e 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "video_core/buffer_cache/buffer.h" #include "video_core/renderer_vulkan/vk_instance.h" #include 
"video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" @@ -8,82 +9,25 @@ #include "video_core/texture_cache/image_view.h" #include "video_core/texture_cache/tile_manager.h" -#include "video_core/host_shaders/detilers/display_micro_64bpp_comp.h" -#include "video_core/host_shaders/detilers/macro_32bpp_comp.h" -#include "video_core/host_shaders/detilers/macro_64bpp_comp.h" -#include "video_core/host_shaders/detilers/macro_8bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_128bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_16bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_32bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_64bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_8bpp_comp.h" +#include "video_core/host_shaders/tiling_comp.h" -// #include #include #include namespace VideoCore { -const DetilerContext* TileManager::GetDetiler(const ImageInfo& info) const { - switch (info.tiling_mode) { - case AmdGpu::TilingMode::Texture_MicroTiled: - switch (info.num_bits) { - case 8: - return &detilers[DetilerType::Micro8]; - case 16: - return &detilers[DetilerType::Micro16]; - case 32: - return &detilers[DetilerType::Micro32]; - case 64: - return &detilers[DetilerType::Micro64]; - case 128: - return &detilers[DetilerType::Micro128]; - default: - return nullptr; - } - case AmdGpu::TilingMode::Texture_Volume: - switch (info.num_bits) { - case 8: - return &detilers[DetilerType::Macro8]; - case 32: - return &detilers[DetilerType::Macro32]; - case 64: - return &detilers[DetilerType::Macro64]; - default: - return nullptr; - } - break; - case AmdGpu::TilingMode::Display_MicroTiled: - switch (info.num_bits) { - case 64: - return &detilers[DetilerType::Display_Micro64]; - default: - return nullptr; - } - break; - default: - return nullptr; - } -} - -struct DetilerParams { - u32 num_levels; - u32 pitch0; - u32 height; - std::array sizes; +struct TilingInfo { + u32 bank_swizzle; + 
u32 num_slices; + u32 num_mips; + std::array mips; }; -TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler) - : instance{instance}, scheduler{scheduler} { - static const std::array detiler_shaders{ - HostShaders::MICRO_8BPP_COMP, HostShaders::MICRO_16BPP_COMP, - HostShaders::MICRO_32BPP_COMP, HostShaders::MICRO_64BPP_COMP, - HostShaders::MICRO_128BPP_COMP, HostShaders::MACRO_8BPP_COMP, - HostShaders::MACRO_32BPP_COMP, HostShaders::MACRO_64BPP_COMP, - HostShaders::DISPLAY_MICRO_64BPP_COMP, - }; - - boost::container::static_vector bindings{ +TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, + StreamBuffer& stream_buffer_) + : instance{instance}, scheduler{scheduler}, stream_buffer{stream_buffer_} { + const auto device = instance.GetDevice(); + const std::array bindings = {{ { .binding = 0, .descriptorType = vk::DescriptorType::eStorageBuffer, @@ -96,88 +40,52 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute, }, - }; + { + .binding = 2, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + }}; const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, .bindingCount = static_cast(bindings.size()), .pBindings = bindings.data(), }; - auto desc_layout_result = instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); + auto desc_layout_result = device.createDescriptorSetLayoutUnique(desc_layout_ci); ASSERT_MSG(desc_layout_result.result == vk::Result::eSuccess, "Failed to create descriptor set layout: {}", vk::to_string(desc_layout_result.result)); desc_layout = std::move(desc_layout_result.value); - const vk::PushConstantRange push_constants = { - .stageFlags = vk::ShaderStageFlagBits::eCompute, - .offset = 0, - .size = 
sizeof(DetilerParams), + const vk::DescriptorSetLayout set_layout = *desc_layout; + const vk::PipelineLayoutCreateInfo layout_info = { + .setLayoutCount = 1U, + .pSetLayouts = &set_layout, + .pushConstantRangeCount = 0U, + .pPushConstantRanges = nullptr, }; - - for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) { - auto& ctx = detilers[pl_id]; - - const auto& module = Vulkan::Compile( - detiler_shaders[pl_id], vk::ShaderStageFlagBits::eCompute, instance.GetDevice()); - - // Set module debug name - auto module_name = magic_enum::enum_name(static_cast(pl_id)); - Vulkan::SetObjectName(instance.GetDevice(), module, module_name); - - const vk::PipelineShaderStageCreateInfo shader_ci = { - .stage = vk::ShaderStageFlagBits::eCompute, - .module = module, - .pName = "main", - }; - - const vk::DescriptorSetLayout set_layout = *desc_layout; - const vk::PipelineLayoutCreateInfo layout_info = { - .setLayoutCount = 1U, - .pSetLayouts = &set_layout, - .pushConstantRangeCount = 1, - .pPushConstantRanges = &push_constants, - }; - auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info); - ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}", - vk::to_string(layout_result)); - ctx.pl_layout = std::move(layout); - - const vk::ComputePipelineCreateInfo compute_pipeline_ci = { - .stage = shader_ci, - .layout = *ctx.pl_layout, - }; - auto result = instance.GetDevice().createComputePipelineUnique( - /*pipeline_cache*/ {}, compute_pipeline_ci); - if (result.result == vk::Result::eSuccess) { - ctx.pl = std::move(result.value); - } else { - UNREACHABLE_MSG("Detiler pipeline creation failed!"); - } - - // Once pipeline is compiled, we don't need the shader module anymore - instance.GetDevice().destroyShaderModule(module); - } + auto [layout_result, layout] = device.createPipelineLayoutUnique(layout_info); + ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}", + 
vk::to_string(layout_result)); + pl_layout = std::move(layout); } TileManager::~TileManager() = default; -TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /*= false*/) { - const auto usage = vk::BufferUsageFlagBits::eStorageBuffer | - (is_storage ? vk::BufferUsageFlagBits::eTransferSrc - : vk::BufferUsageFlagBits::eTransferDst); - const vk::BufferCreateInfo buffer_ci{ +TileManager::ScratchBuffer TileManager::GetScratchBuffer(u32 size) { + constexpr auto usage = + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; + + const vk::BufferCreateInfo buffer_ci = { .size = size, .usage = usage, }; - VmaAllocationCreateInfo alloc_info{ - .flags = !is_storage ? VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | - VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT - : static_cast(0), + const VmaAllocationCreateInfo alloc_info{ .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, - .requiredFlags = !is_storage ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT - : static_cast(0), }; VkBuffer buffer; @@ -189,67 +97,120 @@ TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /* return {buffer, allocation}; } -void TileManager::Upload(ScratchBuffer buffer, const void* data, size_t size) { - VmaAllocationInfo alloc_info{}; - vmaGetAllocationInfo(instance.GetAllocator(), buffer.second, &alloc_info); - ASSERT(size <= alloc_info.size); - void* ptr{}; - const auto result = vmaMapMemory(instance.GetAllocator(), buffer.second, &ptr); - ASSERT(result == VK_SUCCESS); - std::memcpy(ptr, data, size); - vmaUnmapMemory(instance.GetAllocator(), buffer.second); +vk::Pipeline TileManager::GetTilingPipeline(const ImageInfo& info, bool is_tiler) { + const u32 pl_id = u32(info.tile_mode) * NUM_BPPS + std::bit_width(info.num_bits) - 4; + auto& tiling_pipelines = is_tiler ? 
tilers : detilers; + if (auto pipeline = *tiling_pipelines[pl_id]; pipeline != VK_NULL_HANDLE) { + return pipeline; + } + + const auto device = instance.GetDevice(); + std::vector defines = { + fmt::format("BITS_PER_PIXEL={}", info.num_bits), + fmt::format("NUM_SAMPLES={}", info.num_samples), + fmt::format("ARRAY_MODE={}", u32(info.array_mode)), + fmt::format("MICRO_TILE_MODE={}", u32(AmdGpu::GetMicroTileMode(info.tile_mode))), + fmt::format("MICRO_TILE_THICKNESS={}", AmdGpu::GetMicroTileThickness(info.array_mode)), + }; + if (AmdGpu::IsMacroTiled(info.array_mode)) { + const auto macro_tile_mode = + AmdGpu::CalculateMacrotileMode(info.tile_mode, info.num_bits, info.num_samples); + const u32 num_banks = AmdGpu::GetNumBanks(macro_tile_mode); + defines.emplace_back( + fmt::format("PIPE_CONFIG={}", u32(AmdGpu::GetPipeConfig(info.tile_mode)))); + defines.emplace_back(fmt::format("BANK_WIDTH={}", AmdGpu::GetBankWidth(macro_tile_mode))); + defines.emplace_back(fmt::format("BANK_HEIGHT={}", AmdGpu::GetBankHeight(macro_tile_mode))); + defines.emplace_back(fmt::format("NUM_BANKS={}", num_banks)); + defines.emplace_back(fmt::format("NUM_BANK_BITS={}", std::bit_width(num_banks) - 1)); + defines.emplace_back( + fmt::format("TILE_SPLIT_BYTES={}", AmdGpu::GetTileSplit(info.tile_mode))); + defines.emplace_back( + fmt::format("MACRO_TILE_ASPECT={}", AmdGpu::GetMacrotileAspect(macro_tile_mode))); + } + if (is_tiler) { + defines.emplace_back(fmt::format("IS_TILER=1")); + } + + const auto& module = Vulkan::Compile(HostShaders::TILING_COMP, + vk::ShaderStageFlagBits::eCompute, device, defines); + const auto module_name = fmt::format("{}_{} {}", magic_enum::enum_name(info.tile_mode), + info.num_bits, is_tiler ? 
"tiler" : "detiler"); + LOG_WARNING(Render_Vulkan, "Compiling shader {}", module_name); + for (const auto& def : defines) { + LOG_WARNING(Render_Vulkan, "#define {}", def); + } + Vulkan::SetObjectName(device, module, module_name); + const vk::PipelineShaderStageCreateInfo shader_ci = { + .stage = vk::ShaderStageFlagBits::eCompute, + .module = module, + .pName = "main", + }; + const vk::ComputePipelineCreateInfo compute_pipeline_ci = { + .stage = shader_ci, + .layout = *pl_layout, + }; + auto [result, pipeline] = + device.createComputePipelineUnique(VK_NULL_HANDLE, compute_pipeline_ci); + ASSERT_MSG(result == vk::Result::eSuccess, "Detiler pipeline creation failed {}", + vk::to_string(result)); + tiling_pipelines[pl_id] = std::move(pipeline); + device.destroyShaderModule(module); + return *tiling_pipelines[pl_id]; } -void TileManager::FreeBuffer(ScratchBuffer buffer) { - vmaDestroyBuffer(instance.GetAllocator(), buffer.first, buffer.second); -} - -std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_offset, - const ImageInfo& info) { +TileManager::Result TileManager::DetileImage(vk::Buffer in_buffer, u32 in_offset, + const ImageInfo& info) { if (!info.props.is_tiled) { return {in_buffer, in_offset}; } - const auto* detiler = GetDetiler(info); - if (!detiler) { - if (info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled && - info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled && - info.tiling_mode != AmdGpu::TilingMode::Depth_MacroTiled) { - LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})", - vk::to_string(info.pixel_format), NameOf(info.tiling_mode)); + TilingInfo params{}; + params.bank_swizzle = info.bank_swizzle; + params.num_slices = info.props.is_volume ? 
info.size.depth : info.resources.layers; + params.num_mips = info.resources.levels; + for (u32 mip = 0; mip < params.num_mips; ++mip) { + auto& mip_info = params.mips[mip]; + mip_info = info.mips_layout[mip]; + if (info.props.is_block) { + mip_info.pitch = std::max((mip_info.pitch + 3) / 4, 1U); + mip_info.height = std::max((mip_info.height + 3) / 4, 1U); } - return {in_buffer, in_offset}; } - const u32 image_size = info.guest_size; + const vk::DescriptorBufferInfo params_buffer_info{ + .buffer = stream_buffer.Handle(), + .offset = stream_buffer.Copy(&params, sizeof(params), instance.UniformMinAlignment()), + .range = sizeof(params), + }; - // Prepare output buffer - auto out_buffer = AllocBuffer(image_size, true); - scheduler.DeferOperation([=, this]() { FreeBuffer(out_buffer); }); + const auto [out_buffer, out_allocation] = GetScratchBuffer(info.guest_size); + scheduler.DeferOperation([this, out_buffer, out_allocation]() { + vmaDestroyBuffer(instance.GetAllocator(), out_buffer, out_allocation); + }); - auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, GetTilingPipeline(info, false)); - const vk::DescriptorBufferInfo input_buffer_info{ + const vk::DescriptorBufferInfo tiled_buffer_info{ .buffer = in_buffer, .offset = in_offset, - .range = image_size, + .range = info.guest_size, }; - const vk::DescriptorBufferInfo output_buffer_info{ - .buffer = out_buffer.first, + const vk::DescriptorBufferInfo linear_buffer_info{ + .buffer = out_buffer, .offset = 0, - .range = image_size, + .range = info.guest_size, }; - std::vector set_writes{ + const std::array set_writes = {{ { .dstSet = VK_NULL_HANDLE, .dstBinding = 0, .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, - .pBufferInfo = &input_buffer_info, + .pBufferInfo = &tiled_buffer_info, }, { .dstSet = VK_NULL_HANDLE, 
@@ -257,41 +218,107 @@ std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, - .pBufferInfo = &output_buffer_info, + .pBufferInfo = &linear_buffer_info, }, - }; - cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *detiler->pl_layout, 0, - set_writes); + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 2, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &params_buffer_info, + }, + }}; + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pl_layout, 0, set_writes); - DetilerParams params; - params.num_levels = info.resources.levels; - params.pitch0 = info.pitch >> (info.props.is_block ? 2u : 0u); - params.height = info.size.height; - if (info.tiling_mode == AmdGpu::TilingMode::Texture_Volume || - info.tiling_mode == AmdGpu::TilingMode::Display_MicroTiled) { - if (info.resources.levels != 1) { - LOG_ERROR(Render_Vulkan, "Unexpected mipmaps for volume and display tilings {}", - info.resources.levels); + const auto dim_x = (info.guest_size / (info.num_bits / 8)) / 64; + cmdbuf.dispatch(dim_x, 1, 1); + return {out_buffer, 0}; +} + +void TileManager::TileImage(vk::Image in_image, std::span buffer_copies, + vk::Buffer out_buffer, u32 out_offset, const ImageInfo& info) { + if (!info.props.is_tiled) { + for (auto& copy : buffer_copies) { + copy.bufferOffset += out_offset; } - const auto tiles_per_row = info.pitch / 8u; - const auto tiles_per_slice = tiles_per_row * ((info.size.height + 7u) / 8u); - params.sizes[0] = tiles_per_row; - params.sizes[1] = tiles_per_slice; - } else { - ASSERT(info.resources.levels <= params.sizes.size()); - std::memset(&params.sizes, 0, sizeof(params.sizes)); - for (int m = 0; m < info.resources.levels; ++m) { - params.sizes[m] = info.mips_layout[m].size + (m > 0 ? 
params.sizes[m - 1] : 0); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.copyImageToBuffer(in_image, vk::ImageLayout::eTransferSrcOptimal, out_buffer, + buffer_copies); + return; + } + + TilingInfo params{}; + params.bank_swizzle = info.bank_swizzle; + params.num_slices = info.props.is_volume ? info.size.depth : info.resources.layers; + params.num_mips = static_cast(buffer_copies.size()); + for (u32 mip = 0; mip < params.num_mips; ++mip) { + auto& mip_info = params.mips[mip]; + mip_info = info.mips_layout[mip]; + if (info.props.is_block) { + mip_info.pitch = std::max((mip_info.pitch + 3) / 4, 1U); + mip_info.height = std::max((mip_info.height + 3) / 4, 1U); } } - cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(params), - &params); + const vk::DescriptorBufferInfo params_buffer_info{ + .buffer = stream_buffer.Handle(), + .offset = stream_buffer.Copy(&params, sizeof(params), instance.UniformMinAlignment()), + .range = sizeof(params), + }; - ASSERT((image_size % 64) == 0); - const auto num_tiles = image_size / (64 * (info.num_bits / 8)); - cmdbuf.dispatch(num_tiles, 1, 1); - return {out_buffer.first, 0}; + const auto [temp_buffer, temp_allocation] = GetScratchBuffer(info.guest_size); + scheduler.DeferOperation([this, temp_buffer, temp_allocation]() { + vmaDestroyBuffer(instance.GetAllocator(), temp_buffer, temp_allocation); + }); + + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.copyImageToBuffer(in_image, vk::ImageLayout::eTransferSrcOptimal, temp_buffer, + buffer_copies); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, GetTilingPipeline(info, true)); + + const vk::DescriptorBufferInfo tiled_buffer_info{ + .buffer = out_buffer, + .offset = out_offset, + .range = info.guest_size, + }; + + const vk::DescriptorBufferInfo linear_buffer_info{ + .buffer = temp_buffer, + .offset = 0, + .range = info.guest_size, + }; + + const std::array set_writes = {{ + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 0, + 
.dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &tiled_buffer_info, + }, + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &linear_buffer_info, + }, + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 2, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &params_buffer_info, + }, + }}; + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pl_layout, 0, set_writes); + + const auto dim_x = (info.guest_size / (info.num_bits / 8)) / 64; + cmdbuf.dispatch(dim_x, 1, 1); } } // namespace VideoCore diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h index adda16b3d..dc897a31e 100644 --- a/src/video_core/texture_cache/tile_manager.h +++ b/src/video_core/texture_cache/tile_manager.h @@ -4,56 +4,42 @@ #pragma once #include "common/types.h" +#include "video_core/amdgpu/tiling.h" #include "video_core/buffer_cache/buffer.h" namespace VideoCore { -class TextureCache; struct ImageInfo; - -enum DetilerType : u32 { - Micro8, - Micro16, - Micro32, - Micro64, - Micro128, - - Macro8, - Macro32, - Macro64, - - Display_Micro64, - - Max -}; - -struct DetilerContext { - vk::UniquePipeline pl; - vk::UniquePipelineLayout pl_layout; -}; +class StreamBuffer; class TileManager { + static constexpr size_t NUM_BPPS = 5; + public: using ScratchBuffer = std::pair; + using Result = std::pair; - TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler); + explicit TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, + StreamBuffer& stream_buffer); ~TileManager(); - std::pair TryDetile(vk::Buffer in_buffer, u32 in_offset, - const ImageInfo& info); + void TileImage(vk::Image in_image, std::span buffer_copies, + vk::Buffer out_buffer, u32 out_offset, 
const ImageInfo& info); - ScratchBuffer AllocBuffer(u32 size, bool is_storage = false); - void Upload(ScratchBuffer buffer, const void* data, size_t size); - void FreeBuffer(ScratchBuffer buffer); + Result DetileImage(vk::Buffer in_buffer, u32 in_offset, const ImageInfo& info); private: - const DetilerContext* GetDetiler(const ImageInfo& info) const; + vk::Pipeline GetTilingPipeline(const ImageInfo& info, bool is_tiler); + ScratchBuffer GetScratchBuffer(u32 size); private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; + StreamBuffer& stream_buffer; vk::UniqueDescriptorSetLayout desc_layout; - std::array detilers; + vk::UniquePipelineLayout pl_layout; + std::array detilers{}; + std::array tilers{}; }; } // namespace VideoCore