diff --git a/CMakeLists.txt b/CMakeLists.txt index f09e3a1ed..94f5d4dce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -925,6 +925,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/amdgpu/pm4_cmds.h src/video_core/amdgpu/pm4_opcodes.h src/video_core/amdgpu/resource.h + src/video_core/amdgpu/tiling.cpp + src/video_core/amdgpu/tiling.h src/video_core/amdgpu/types.h src/video_core/amdgpu/default_context.cpp src/video_core/buffer_cache/buffer.cpp diff --git a/src/core/devtools/widget/reg_popup.cpp b/src/core/devtools/widget/reg_popup.cpp index 7bb38df24..90d8c9681 100644 --- a/src/core/devtools/widget/reg_popup.cpp +++ b/src/core/devtools/widget/reg_popup.cpp @@ -64,7 +64,7 @@ void RegPopup::DrawColorBuffer(const AmdGpu::Liverpool::ColorBuffer& buffer) { "NumSamples()", buffer.NumSamples(), "NumSlices()", buffer.NumSlices(), "GetColorSliceSize()", buffer.GetColorSliceSize(), - "GetTilingMode()", buffer.GetTilingMode(), + "GetTileMode()", buffer.GetTileMode(), "IsTiled()", buffer.IsTiled(), "NumFormat()", buffer.GetNumberFmt() ); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index d80f2956b..16c841581 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -326,12 +326,12 @@ constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept } if (!image.Valid()) { LOG_DEBUG(Render_Vulkan, "Encountered invalid image sharp"); - image = is_depth ? 
AmdGpu::Image::NullDepth() : AmdGpu::Image::Null(); + image = AmdGpu::Image::Null(is_depth); } else if (is_depth) { const auto data_fmt = image.GetDataFmt(); if (data_fmt != AmdGpu::DataFormat::Format16 && data_fmt != AmdGpu::DataFormat::Format32) { LOG_DEBUG(Render_Vulkan, "Encountered non-depth image used with depth instruction!"); - image = AmdGpu::Image::NullDepth(); + image = AmdGpu::Image::Null(true); } } return image; diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index d3e671c58..7901b8db6 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -6,8 +6,8 @@ #include #include "common/types.h" -#include "frontend/fetch_shader.h" #include "shader_recompiler/backend/bindings.h" +#include "shader_recompiler/frontend/fetch_shader.h" #include "shader_recompiler/info.h" namespace Shader { diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index d693a0a38..941a79c2d 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -22,7 +22,7 @@ #include "common/unique_function.h" #include "shader_recompiler/params.h" #include "video_core/amdgpu/pixel_format.h" -#include "video_core/amdgpu/resource.h" +#include "video_core/amdgpu/tiling.h" #include "video_core/amdgpu/types.h" namespace Vulkan { @@ -426,7 +426,7 @@ struct Liverpool { BitField<0, 2, ZFormat> format; BitField<2, 2, u32> num_samples; BitField<13, 3, u32> tile_split; - BitField<20, 3, u32> tile_mode_index; + BitField<20, 3, TileMode> tile_mode_index; BitField<23, 4, u32> decompress_on_n_zplanes; BitField<27, 1, u32> allow_expclear; BitField<28, 1, u32> read_size; @@ -502,6 +502,14 @@ struct Liverpool { const auto bpe = NumBits() >> 3; // in bytes return (depth_slice.tile_max + 1) * 64 * bpe * NumSamples(); } + + TileMode GetTileMode() const { + return z_info.tile_mode_index.Value(); + } + + bool IsTiled() const { + return GetTileMode() != 
TileMode::DisplayLinearAligned; + } }; enum class ClipSpace : u32 { @@ -888,7 +896,7 @@ struct Liverpool { u32 u32all; } info; union Color0Attrib { - BitField<0, 5, TilingMode> tile_mode_index; + BitField<0, 5, TileMode> tile_mode_index; BitField<5, 5, u32> fmask_tile_mode_index; BitField<10, 2, u32> fmask_bank_height; BitField<12, 3, u32> num_samples_log2; @@ -949,13 +957,13 @@ struct Liverpool { return slice_size; } - TilingMode GetTilingMode() const { - return info.linear_general ? TilingMode::Display_Linear + TileMode GetTileMode() const { + return info.linear_general ? TileMode::DisplayLinearAligned : attrib.tile_mode_index.Value(); } bool IsTiled() const { - return GetTilingMode() != TilingMode::Display_Linear; + return GetTileMode() != TileMode::DisplayLinearAligned; } [[nodiscard]] DataFormat GetDataFmt() const { diff --git a/src/video_core/amdgpu/pixel_format.cpp b/src/video_core/amdgpu/pixel_format.cpp index 682cdf357..d88b05f41 100644 --- a/src/video_core/amdgpu/pixel_format.cpp +++ b/src/video_core/amdgpu/pixel_format.cpp @@ -178,7 +178,7 @@ static constexpr std::array BITS_PER_BLOCK = { 64, // 12 Format16_16_16_16 96, // 13 Format32_32_32 128, // 14 Format32_32_32_32 - 0, // 15 + -1, // 15 16, // 16 Format5_6_5 16, // 17 Format1_5_5_5 16, // 18 Format5_5_5_1 @@ -186,15 +186,15 @@ static constexpr std::array BITS_PER_BLOCK = { 32, // 20 Format8_24 32, // 21 Format24_8 64, // 22 FormatX24_8_32 - 0, // 23 - 0, // 24 - 0, // 25 - 0, // 26 - 0, // 27 - 0, // 28 - 0, // 29 - 0, // 30 - 0, // 31 + -1, // 23 + -1, // 24 + -1, // 25 + -1, // 26 + -1, // 27 + -1, // 28 + -1, // 29 + -1, // 30 + -1, // 31 16, // 32 FormatGB_GR 16, // 33 FormatBG_RG 32, // 34 Format5_9_9_9 @@ -213,4 +213,55 @@ u32 NumBitsPerBlock(DataFormat format) { return BITS_PER_BLOCK[index]; } +static constexpr std::array BITS_PER_ELEMENT = { + 0, // 0 FormatInvalid + 8, // 1 Format8 + 16, // 2 Format16 + 16, // 3 Format8_8 + 32, // 4 Format32 + 32, // 5 Format16_16 + 32, // 6 Format10_11_11 
+ 32, // 7 Format11_11_10 + 32, // 8 Format10_10_10_2 + 32, // 9 Format2_10_10_10 + 32, // 10 Format8_8_8_8 + 64, // 11 Format32_32 + 64, // 12 Format16_16_16_16 + 96, // 13 Format32_32_32 + 128, // 14 Format32_32_32_32 + -1, // 15 + 16, // 16 Format5_6_5 + 16, // 17 Format1_5_5_5 + 16, // 18 Format5_5_5_1 + 16, // 19 Format4_4_4_4 + 32, // 20 Format8_24 + 32, // 21 Format24_8 + 64, // 22 FormatX24_8_32 + -1, // 23 + -1, // 24 + -1, // 25 + -1, // 26 + -1, // 27 + -1, // 28 + -1, // 29 + -1, // 30 + -1, // 31 + 16, // 32 FormatGB_GR + 16, // 33 FormatBG_RG + 32, // 34 Format5_9_9_9 + 4, // 35 FormatBc1 + 8, // 36 FormatBc2 + 8, // 37 FormatBc3 + 4, // 38 FormatBc4 + 8, // 39 FormatBc5 + 8, // 40 FormatBc6 + 8, // 41 FormatBc7 +}; + +u32 NumBitsPerElement(DataFormat format) { + const u32 index = static_cast(format); + ASSERT_MSG(index < BITS_PER_ELEMENT.size(), "Invalid data format = {}", format); + return BITS_PER_ELEMENT[index]; +} + } // namespace AmdGpu diff --git a/src/video_core/amdgpu/pixel_format.h b/src/video_core/amdgpu/pixel_format.h index 45c688e57..e7ad27dd3 100644 --- a/src/video_core/amdgpu/pixel_format.h +++ b/src/video_core/amdgpu/pixel_format.h @@ -85,7 +85,7 @@ enum class NumberClass { Uint, }; -enum class CompSwizzle : u8 { +enum class CompSwizzle : u32 { Zero = 0, One = 1, Red = 4, @@ -313,7 +313,11 @@ constexpr NumberClass GetNumberClass(const NumberFormat nfmt) { } constexpr bool IsInteger(const NumberFormat nfmt) { - return nfmt == AmdGpu::NumberFormat::Sint || nfmt == AmdGpu::NumberFormat::Uint; + return nfmt == NumberFormat::Sint || nfmt == NumberFormat::Uint; +} + +constexpr bool IsBlockCoded(DataFormat format) { + return format >= DataFormat::FormatBc1 && format <= DataFormat::FormatBc7; } std::string_view NameOf(DataFormat fmt); @@ -321,6 +325,7 @@ std::string_view NameOf(NumberFormat fmt); u32 NumComponents(DataFormat format); u32 NumBitsPerBlock(DataFormat format); +u32 NumBitsPerElement(DataFormat format); } // namespace AmdGpu diff 
--git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index ff9cfe2cc..742cdee86 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -7,6 +7,7 @@ #include "common/assert.h" #include "common/bit_field.h" #include "video_core/amdgpu/pixel_format.h" +#include "video_core/amdgpu/tiling.h" namespace AmdGpu { @@ -138,37 +139,6 @@ constexpr std::string_view NameOf(ImageType type) { } } -enum class TilingMode : u32 { - Depth_MacroTiled = 0u, - Display_Linear = 0x8u, - Display_MicroTiled = 0x9u, - Display_MacroTiled = 0xAu, - Texture_MicroTiled = 0xDu, - Texture_MacroTiled = 0xEu, - Texture_Volume = 0x13u, -}; - -constexpr std::string_view NameOf(TilingMode type) { - switch (type) { - case TilingMode::Depth_MacroTiled: - return "Depth_MacroTiled"; - case TilingMode::Display_Linear: - return "Display_Linear"; - case TilingMode::Display_MicroTiled: - return "Display_MicroTiled"; - case TilingMode::Display_MacroTiled: - return "Display_MacroTiled"; - case TilingMode::Texture_MicroTiled: - return "Texture_MicroTiled"; - case TilingMode::Texture_MacroTiled: - return "Texture_MacroTiled"; - case TilingMode::Texture_Volume: - return "Texture_Volume"; - default: - return "Unknown"; - } -} - struct Image { u64 base_address : 38; u64 mtype_l2 : 2; @@ -212,28 +182,15 @@ struct Image { u64 alt_tile_mode : 1; u64 : 39; - static constexpr Image Null() { + static constexpr Image Null(bool is_depth) { Image image{}; - image.data_format = u64(DataFormat::Format8_8_8_8); - image.num_format = u64(NumberFormat::Unorm); + image.data_format = u64(is_depth ? DataFormat::Format32 : DataFormat::Format8_8_8_8); + image.num_format = u64(is_depth ? 
NumberFormat::Float : NumberFormat::Unorm); image.dst_sel_x = u64(CompSwizzle::Red); image.dst_sel_y = u64(CompSwizzle::Green); image.dst_sel_z = u64(CompSwizzle::Blue); image.dst_sel_w = u64(CompSwizzle::Alpha); - image.tiling_index = u64(TilingMode::Texture_MicroTiled); - image.type = u64(ImageType::Color2D); - return image; - } - - static constexpr Image NullDepth() { - Image image{}; - image.data_format = u64(DataFormat::Format32); - image.num_format = u64(NumberFormat::Float); - image.dst_sel_x = u64(CompSwizzle::Red); - image.dst_sel_y = u64(CompSwizzle::Green); - image.dst_sel_z = u64(CompSwizzle::Blue); - image.dst_sel_w = u64(CompSwizzle::Alpha); - image.tiling_index = u64(TilingMode::Texture_MicroTiled); + image.tiling_index = u64(TileMode::Thin1DThin); image.type = u64(ImageType::Color2D); return image; } @@ -314,16 +271,26 @@ struct Image { return MapNumberConversion(NumberFormat(num_format), DataFormat(data_format)); } - TilingMode GetTilingMode() const { - if (tiling_index >= 0 && tiling_index <= 7) { - return tiling_index == 5 ? 
TilingMode::Texture_MicroTiled - : TilingMode::Depth_MacroTiled; - } - return static_cast(tiling_index); + TileMode GetTileMode() const { + return static_cast(tiling_index); } bool IsTiled() const { - return GetTilingMode() != TilingMode::Display_Linear; + return GetTileMode() != TileMode::DisplayLinearAligned && + GetTileMode() != TileMode::DisplayLinearGeneral; + } + + u8 GetBankSwizzle() const { + const auto tile_mode = GetTileMode(); + const auto array_mode = GetArrayMode(tile_mode); + const auto dfmt = GetDataFmt(); + if (!alt_tile_mode || dfmt == DataFormat::FormatInvalid || !IsMacroTiled(array_mode)) { + return 0; + } + const u32 bpp = NumBitsPerElement(dfmt); + const auto macro_tile_mode = CalculateMacrotileMode(tile_mode, bpp, NumSamples()); + const u32 banks = GetAltNumBanks(macro_tile_mode); + return (((banks - 1) << 4) & base_address) >> 4; } bool IsFmask() const noexcept { @@ -331,7 +298,21 @@ struct Image { GetDataFmt() <= DataFormat::FormatFmask64_8; } - [[nodiscard]] ImageType GetViewType(const bool is_array) const noexcept { + ImageType GetBaseType() const noexcept { + const auto base_type = GetType(); + if (base_type == ImageType::Color1DArray) { + return ImageType::Color1D; + } + if (base_type == ImageType::Color2DArray) { + return ImageType::Color2D; + } + if (base_type == ImageType::Color2DMsaa || base_type == ImageType::Color2DMsaaArray) { + return ImageType::Color2D; + } + return base_type; + } + + ImageType GetViewType(const bool is_array) const noexcept { const auto base_type = GetType(); if (IsCube()) { // Cube needs to remain array type regardless of instruction array specifier. 
@@ -422,13 +403,7 @@ enum class Filter : u64 { }; constexpr bool IsAnisoFilter(const Filter filter) { - switch (filter) { - case Filter::AnisoPoint: - case Filter::AnisoLinear: - return true; - default: - return false; - } + return filter == Filter::AnisoPoint || filter == Filter::AnisoLinear; } enum class MipFilter : u64 { @@ -495,7 +470,7 @@ struct Sampler { } float LodBias() const noexcept { - return static_cast(static_cast((lod_bias.Value() ^ 0x2000u) - 0x2000u)) / + return static_cast(static_cast((lod_bias.Value() ^ 0x2000u) - 0x2000u)) / 256.0f; } diff --git a/src/video_core/amdgpu/tiling.cpp b/src/video_core/amdgpu/tiling.cpp new file mode 100644 index 000000000..e16d695b1 --- /dev/null +++ b/src/video_core/amdgpu/tiling.cpp @@ -0,0 +1,554 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/assert.h" +#include "video_core/amdgpu/tiling.h" + +#include + +namespace AmdGpu { + +static constexpr u32 MICROTILE_SIZE = 8; +static constexpr u32 DRAM_ROW_SIZE = 1024; + +std::string_view NameOf(TileMode tile_mode) { + return magic_enum::enum_name(tile_mode); +} + +ArrayMode GetArrayMode(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth1DThin: + case TileMode::Display1DThin: + case TileMode::Thin1DThin: + return ArrayMode::Array1DTiledThin1; + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Display2DThin: + case TileMode::Thin2DThin: + return ArrayMode::Array2DTiledThin1; + case TileMode::DisplayThinPrt: + case TileMode::ThinThinPrt: + return ArrayMode::ArrayPrtTiledThin1; + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::Display2DThinPrt: + case TileMode::Thin2DThinPrt: + return ArrayMode::ArrayPrt2DTiledThin1; + case TileMode::Thin3DThin: + case TileMode::Thin3DThinPrt: + return 
ArrayMode::Array3DTiledThin1; + case TileMode::Thick1DThick: + return ArrayMode::Array1DTiledThick; + case TileMode::Thick2DThick: + return ArrayMode::Array2DTiledThick; + case TileMode::Thick3DThick: + return ArrayMode::Array3DTiledThick; + case TileMode::ThickThickPrt: + return ArrayMode::ArrayPrtTiledThick; + case TileMode::Thick2DThickPrt: + return ArrayMode::ArrayPrt2DTiledThick; + case TileMode::Thick3DThickPrt: + return ArrayMode::ArrayPrt3DTiledThick; + case TileMode::Thick2DXThick: + return ArrayMode::Array2DTiledXThick; + case TileMode::Thick3DXThick: + return ArrayMode::Array3DTiledXThick; + case TileMode::DisplayLinearAligned: + return ArrayMode::ArrayLinearAligned; + case TileMode::DisplayLinearGeneral: + return ArrayMode::ArrayLinearGeneral; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +MicroTileMode GetMicroTileMode(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + return MicroTileMode::Depth; + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::DisplayLinearGeneral: + return MicroTileMode::Display; + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + return MicroTileMode::Thin; + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + return MicroTileMode::Thick; + default: + UNREACHABLE_MSG("Unknown 
tile mode = {}", u32(tile_mode)); + } +} + +PipeConfig GetPipeConfig(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::Display2DThinPrt: + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick2DThickPrt: + case TileMode::Thick2DXThick: + return PipeConfig::P8_32x32_16x16; + case TileMode::DisplayThinPrt: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thick3DThick: + case TileMode::ThickThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick3DXThick: + return PipeConfig::P8_32x32_8x16; + case TileMode::DisplayLinearGeneral: + return PipeConfig::P2; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +PipeConfig GetAltPipeConfig(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case 
TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + return PipeConfig::P16_32x32_8x16; + case TileMode::DisplayLinearGeneral: + return PipeConfig::P2; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +u32 GetSampleSplit(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth2DThin128: + case TileMode::Depth2DThin256: + case TileMode::Depth2DThin512: + case TileMode::Depth2DThin1K: + case TileMode::Depth1DThin: + case TileMode::Depth2DThinPrt256: + case TileMode::Depth2DThinPrt1K: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Thin1DThin: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + case TileMode::DisplayLinearGeneral: + return 1; + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + return 2; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +u32 GetTileSplit(TileMode tile_mode) { + switch (tile_mode) { + case TileMode::Depth2DThin64: + case TileMode::Depth1DThin: + case TileMode::DisplayLinearAligned: + case TileMode::Display1DThin: + case TileMode::Display2DThin: + case TileMode::DisplayThinPrt: + case TileMode::Display2DThinPrt: + case TileMode::Thin1DThin: + case TileMode::Thin2DThin: + case TileMode::Thin3DThin: + case TileMode::ThinThinPrt: + case TileMode::Thin2DThinPrt: + case TileMode::Thin3DThinPrt: + case TileMode::Thick1DThick: + case TileMode::Thick2DThick: + case TileMode::Thick3DThick: + case 
TileMode::ThickThickPrt: + case TileMode::Thick2DThickPrt: + case TileMode::Thick3DThickPrt: + case TileMode::Thick2DXThick: + case TileMode::Thick3DXThick: + case TileMode::DisplayLinearGeneral: + return 64; + case TileMode::Depth2DThin128: + return 128; + case TileMode::Depth2DThin256: + case TileMode::Depth2DThinPrt256: + return 256; + case TileMode::Depth2DThin512: + return 512; + case TileMode::Depth2DThin1K: + case TileMode::Depth2DThinPrt1K: + return 1024; + default: + UNREACHABLE_MSG("Unknown tile mode = {}", u32(tile_mode)); + } +} + +u32 GetBankWidth(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetBankHeight(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x2_16_Dup: + return 2; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 4; + case MacroTileMode::Mode_1x8_16: + return 8; + 
default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetNumBanks(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 2; + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_4_Dup: + return 4; + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_8_Dup: + return 8; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + return 16; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetMacrotileAspect(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + return 2; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 4; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetAltBankHeight(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case 
MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + return 2; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 4; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetAltNumBanks(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 2; + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + return 4; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x4_16_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + return 8; + case MacroTileMode::Mode_1x8_16: + return 16; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +u32 GetAltMacrotileAspect(MacroTileMode mode) { + switch (mode) { + case MacroTileMode::Mode_1x1_16: + case MacroTileMode::Mode_1x1_16_Dup: + case MacroTileMode::Mode_1x1_8: + case MacroTileMode::Mode_1x1_4: + case MacroTileMode::Mode_1x1_2: + case MacroTileMode::Mode_1x1_2_Dup: + case MacroTileMode::Mode_1x2_16_Dup: + case MacroTileMode::Mode_1x1_16_Dup2: + case MacroTileMode::Mode_1x1_8_Dup: + case MacroTileMode::Mode_1x1_4_Dup: + case MacroTileMode::Mode_1x1_2_Dup2: + case MacroTileMode::Mode_1x1_2_Dup3: + return 1; + case MacroTileMode::Mode_1x4_16: + case MacroTileMode::Mode_1x2_16: + case MacroTileMode::Mode_1x8_16: + case MacroTileMode::Mode_1x4_16_Dup: + return 2; + default: + UNREACHABLE_MSG("Unknown macro tile mode = {}", u32(mode)); + } +} + +bool IsMacroTiled(ArrayMode array_mode) { + 
switch (array_mode) { + case ArrayMode::ArrayLinearGeneral: + case ArrayMode::ArrayLinearAligned: + case ArrayMode::Array1DTiledThin1: + case ArrayMode::Array1DTiledThick: + return false; + case ArrayMode::Array2DTiledThin1: + case ArrayMode::ArrayPrtTiledThin1: + case ArrayMode::ArrayPrt2DTiledThin1: + case ArrayMode::Array2DTiledThick: + case ArrayMode::Array2DTiledXThick: + case ArrayMode::ArrayPrtTiledThick: + case ArrayMode::ArrayPrt2DTiledThick: + case ArrayMode::ArrayPrt3DTiledThin1: + case ArrayMode::Array3DTiledThin1: + case ArrayMode::Array3DTiledThick: + case ArrayMode::Array3DTiledXThick: + case ArrayMode::ArrayPrt3DTiledThick: + return true; + default: + UNREACHABLE_MSG("Unknown array mode = {}", u32(array_mode)); + } +} + +bool IsPrt(ArrayMode array_mode) { + switch (array_mode) { + case ArrayMode::ArrayPrtTiledThin1: + case ArrayMode::ArrayPrtTiledThick: + case ArrayMode::ArrayPrt2DTiledThin1: + case ArrayMode::ArrayPrt2DTiledThick: + case ArrayMode::ArrayPrt3DTiledThin1: + case ArrayMode::ArrayPrt3DTiledThick: + return true; + case ArrayMode::ArrayLinearGeneral: + case ArrayMode::ArrayLinearAligned: + case ArrayMode::Array1DTiledThin1: + case ArrayMode::Array1DTiledThick: + case ArrayMode::Array2DTiledThin1: + case ArrayMode::Array2DTiledThick: + case ArrayMode::Array2DTiledXThick: + case ArrayMode::Array3DTiledThin1: + case ArrayMode::Array3DTiledThick: + case ArrayMode::Array3DTiledXThick: + return false; + default: + UNREACHABLE_MSG("Unknown array mode = {}", u32(array_mode)); + } +} + +u32 GetMicroTileThickness(ArrayMode array_mode) { + switch (array_mode) { + case ArrayMode::ArrayLinearGeneral: + case ArrayMode::ArrayLinearAligned: + case ArrayMode::Array1DTiledThin1: + case ArrayMode::Array2DTiledThin1: + case ArrayMode::ArrayPrtTiledThin1: + case ArrayMode::ArrayPrt2DTiledThin1: + case ArrayMode::ArrayPrt3DTiledThin1: + case ArrayMode::Array3DTiledThin1: + return 1; + case ArrayMode::Array1DTiledThick: + case ArrayMode::Array2DTiledThick: + 
case ArrayMode::Array3DTiledThick: + case ArrayMode::ArrayPrtTiledThick: + case ArrayMode::ArrayPrt2DTiledThick: + case ArrayMode::ArrayPrt3DTiledThick: + return 4; + case ArrayMode::Array2DTiledXThick: + case ArrayMode::Array3DTiledXThick: + return 8; + default: + UNREACHABLE_MSG("Unknown array mode = {}", u32(array_mode)); + } +} + +u32 GetPipeCount(PipeConfig pipe_cfg) { + switch (pipe_cfg) { + case PipeConfig::P2: + return 2; + case PipeConfig::P8_32x32_8x16: + case PipeConfig::P8_32x32_16x16: + return 8; + case PipeConfig::P16_32x32_8x16: + return 16; + default: + UNREACHABLE_MSG("Unknown pipe config = {}", u32(pipe_cfg)); + } +} + +MacroTileMode CalculateMacrotileMode(TileMode tile_mode, u32 bpp, u32 num_samples) { + ASSERT_MSG(std::has_single_bit(num_samples) && num_samples <= 16, "Invalid sample count {}", + num_samples); + ASSERT_MSG(bpp >= 1 && bpp <= 128, "Invalid bpp {}", bpp); + + const ArrayMode array_mode = GetArrayMode(tile_mode); + ASSERT_MSG(IsMacroTiled(array_mode), "Tile mode not macro tiled"); + + const MicroTileMode micro_tile_mode = GetMicroTileMode(tile_mode); + const u32 sample_split = GetSampleSplit(tile_mode); + const u32 tile_split_hw = GetTileSplit(tile_mode); + + const u32 tile_thickness = GetMicroTileThickness(array_mode); + const u32 tile_bytes_1x = bpp * MICROTILE_SIZE * MICROTILE_SIZE * tile_thickness / 8; + const u32 color_tile_split = std::max(256U, sample_split * tile_bytes_1x); + const u32 tile_split = + micro_tile_mode == MicroTileMode::Depth ? tile_split_hw : color_tile_split; + const u32 tilesplic = std::min(DRAM_ROW_SIZE, tile_split); + const u32 tile_bytes = std::min(tilesplic, num_samples * tile_bytes_1x); + const u32 mtm_idx = std::bit_width(tile_bytes / 64) - 1; + return IsPrt(array_mode) ? 
MacroTileMode(mtm_idx + 8) : MacroTileMode(mtm_idx); +} + +} // namespace AmdGpu diff --git a/src/video_core/amdgpu/tiling.h b/src/video_core/amdgpu/tiling.h new file mode 100644 index 000000000..3cf0d444d --- /dev/null +++ b/src/video_core/amdgpu/tiling.h @@ -0,0 +1,149 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/types.h" + +namespace AmdGpu { + +struct Image; + +static constexpr size_t NUM_TILE_MODES = 32; + +enum class PipeConfig : u32 { + P2 = 0, + P4_8x16 = 4, + P4_16x16 = 5, + P4_16x32 = 6, + P4_32x32 = 7, + P8_16x16_8x16 = 8, + P8_16x32_8x16 = 9, + P8_32x32_8x16 = 10, + P8_16x32_16x16 = 11, + P8_32x32_16x16 = 12, + P8_32x32_16x32 = 13, + P8_32x64_32x32 = 14, + P16_32x32_8x16 = 16, + P16_32x32_16x16 = 17, + P16 = 18, +}; + +enum class MicroTileMode : u32 { + Display = 0, + Thin = 1, + Depth = 2, + Rotated = 3, + Thick = 4, +}; + +enum class MacroTileMode : u32 { + Mode_1x4_16 = 0, + Mode_1x2_16 = 1, + Mode_1x1_16 = 2, + Mode_1x1_16_Dup = 3, + Mode_1x1_8 = 4, + Mode_1x1_4 = 5, + Mode_1x1_2 = 6, + Mode_1x1_2_Dup = 7, + Mode_1x8_16 = 8, + Mode_1x4_16_Dup = 9, + Mode_1x2_16_Dup = 10, + Mode_1x1_16_Dup2 = 11, + Mode_1x1_8_Dup = 12, + Mode_1x1_4_Dup = 13, + Mode_1x1_2_Dup2 = 14, + Mode_1x1_2_Dup3 = 15, +}; + +enum class ArrayMode : u32 { + ArrayLinearGeneral = 0, + ArrayLinearAligned = 1, + Array1DTiledThin1 = 2, + Array1DTiledThick = 3, + Array2DTiledThin1 = 4, + ArrayPrtTiledThin1 = 5, + ArrayPrt2DTiledThin1 = 6, + Array2DTiledThick = 7, + Array2DTiledXThick = 8, + ArrayPrtTiledThick = 9, + ArrayPrt2DTiledThick = 10, + ArrayPrt3DTiledThin1 = 11, + Array3DTiledThin1 = 12, + Array3DTiledThick = 13, + Array3DTiledXThick = 14, + ArrayPrt3DTiledThick = 15, +}; + +enum class TileMode : u32 { + Depth2DThin64 = 0, + Depth2DThin128 = 1, + Depth2DThin256 = 2, + Depth2DThin512 = 3, + Depth2DThin1K = 4, + Depth1DThin = 5, + Depth2DThinPrt256 = 6, + 
Depth2DThinPrt1K = 7, + DisplayLinearAligned = 8, + Display1DThin = 9, + Display2DThin = 10, + DisplayThinPrt = 11, + Display2DThinPrt = 12, + Thin1DThin = 13, + Thin2DThin = 14, + Thin3DThin = 15, + ThinThinPrt = 16, + Thin2DThinPrt = 17, + Thin3DThinPrt = 18, + Thick1DThick = 19, + Thick2DThick = 20, + Thick3DThick = 21, + ThickThickPrt = 22, + Thick2DThickPrt = 23, + Thick3DThickPrt = 24, + Thick2DXThick = 25, + Thick3DXThick = 26, + DisplayLinearGeneral = 31, +}; + +std::string_view NameOf(TileMode tile_mode); + +ArrayMode GetArrayMode(TileMode tile_mode); + +MicroTileMode GetMicroTileMode(TileMode tile_mode); + +PipeConfig GetPipeConfig(TileMode tile_mode); + +PipeConfig GetAltPipeConfig(TileMode tile_mode); + +u32 GetSampleSplit(TileMode tile_mode); + +u32 GetTileSplit(TileMode tile_mode); + +u32 GetBankWidth(MacroTileMode mode); + +u32 GetBankHeight(MacroTileMode mode); + +u32 GetNumBanks(MacroTileMode mode); + +u32 GetMacrotileAspect(MacroTileMode mode); + +u32 GetAltBankHeight(MacroTileMode mode); + +u32 GetAltNumBanks(MacroTileMode mode); + +u32 GetAltMacrotileAspect(MacroTileMode mode); + +bool IsMacroTiled(ArrayMode array_mode); + +bool IsPrt(ArrayMode array_mode); + +u32 GetMicroTileThickness(ArrayMode array_mode); + +u32 GetPipeCount(PipeConfig pipe_cfg); + +MacroTileMode CalculateMacrotileMode(TileMode tile_mode, u32 bpp, u32 num_samples); + +} // namespace AmdGpu diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index 8cbeae87a..c1e203b30 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -396,7 +396,9 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, // Avoid using ObtainBuffer here as that might give us the stream buffer. 
const BufferId buffer_id = FindBuffer(src, num_bytes); auto& buffer = slot_buffers[buffer_id]; - SynchronizeBuffer(buffer, src, num_bytes, false, false); + if (SynchronizeBuffer(buffer, src, num_bytes, false, true)) { + texture_cache.InvalidateMemoryFromGPU(dst, num_bytes); + } return buffer; }(); auto& dst_buffer = [&] -> const Buffer& { @@ -854,7 +856,7 @@ void BufferCache::ChangeRegister(BufferId buffer_id) { } } -void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, +bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer) { boost::container::small_vector copies; size_t total_size_bytes = 0; @@ -867,47 +869,47 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, total_size_bytes += range_size; }, [&] { src_buffer = UploadCopies(buffer, copies, total_size_bytes); }); - SCOPE_EXIT { - if (is_texel_buffer) { - SynchronizeBufferFromImage(buffer, device_addr, size); - } - }; - if (!src_buffer) { - return; + + if (src_buffer) { + scheduler.EndRendering(); + const auto cmdbuf = scheduler.CommandBuffer(); + const vk::BufferMemoryBarrier2 pre_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite | + vk::AccessFlagBits2::eTransferRead | + vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer.Handle(), + .offset = 0, + .size = buffer.SizeBytes(), + }; + const vk::BufferMemoryBarrier2 post_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite, + .buffer = buffer.Handle(), + .offset = 0, + .size = 
buffer.SizeBytes(), + }; + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_barrier, + }); + cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_barrier, + }); + TouchBuffer(buffer); } - scheduler.EndRendering(); - const auto cmdbuf = scheduler.CommandBuffer(); - const vk::BufferMemoryBarrier2 pre_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .srcAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite | - vk::AccessFlagBits2::eTransferRead | vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, - .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, - .buffer = buffer.Handle(), - .offset = 0, - .size = buffer.SizeBytes(), - }; - const vk::BufferMemoryBarrier2 post_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite, - .buffer = buffer.Handle(), - .offset = 0, - .size = buffer.SizeBytes(), - }; - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &pre_barrier, - }); - cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &post_barrier, - }); - TouchBuffer(buffer); + if (is_texel_buffer) { + return SynchronizeBufferFromImage(buffer, device_addr, size); + } + return false; } vk::Buffer 
BufferCache::UploadCopies(Buffer& buffer, std::span copies, @@ -944,115 +946,81 @@ vk::Buffer BufferCache::UploadCopies(Buffer& buffer, std::span c } bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) { - boost::container::small_vector image_ids; - texture_cache.ForEachImageInRegion(device_addr, size, [&](ImageId image_id, Image& image) { - if (image.info.guest_address != device_addr) { - return; - } - // Only perform sync if image is: - // - GPU modified; otherwise there are no changes to synchronize. - // - Not CPU dirty; otherwise we could overwrite CPU changes with stale GPU changes. - // - Not GPU dirty; otherwise we could overwrite GPU changes with stale image data. - if (False(image.flags & ImageFlagBits::GpuModified) || - True(image.flags & ImageFlagBits::Dirty)) { - return; - } - image_ids.push_back(image_id); - }); - if (image_ids.empty()) { + const ImageId image_id = texture_cache.FindImageFromRange(device_addr, size); + if (!image_id) { return false; } - ImageId image_id{}; - if (image_ids.size() == 1) { - // Sometimes image size might not exactly match with requested buffer size - // If we only found 1 candidate image use it without too many questions. 
- image_id = image_ids[0]; - } else { - for (s32 i = 0; i < image_ids.size(); ++i) { - Image& image = texture_cache.GetImage(image_ids[i]); - if (image.info.guest_size == size) { - image_id = image_ids[i]; - break; - } - } - if (!image_id) { - LOG_WARNING(Render_Vulkan, - "Failed to find exact image match for copy addr={:#x}, size={:#x}", - device_addr, size); - return false; - } - } Image& image = texture_cache.GetImage(image_id); ASSERT_MSG(device_addr == image.info.guest_address, "Texel buffer aliases image subresources {:x} : {:x}", device_addr, image.info.guest_address); - boost::container::small_vector copies; - u32 offset = buffer.Offset(image.info.guest_address); - const u32 num_layers = image.info.resources.layers; - const u32 max_offset = offset + size; - for (u32 m = 0; m < image.info.resources.levels; m++) { - const u32 width = std::max(image.info.size.width >> m, 1u); - const u32 height = std::max(image.info.size.height >> m, 1u); - const u32 depth = - image.info.props.is_volume ? 
std::max(image.info.size.depth >> m, 1u) : 1u; - const auto [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m]; - offset += mip_ofs; - if (offset + mip_size > max_offset) { + const u32 buf_offset = buffer.Offset(image.info.guest_address); + boost::container::small_vector buffer_copies; + u32 copy_size = 0; + for (u32 mip = 0; mip < image.info.resources.levels; mip++) { + const auto& mip_info = image.info.mips_layout[mip]; + const u32 width = std::max(image.info.size.width >> mip, 1u); + const u32 height = std::max(image.info.size.height >> mip, 1u); + const u32 depth = std::max(image.info.size.depth >> mip, 1u); + if (buf_offset + mip_info.offset + mip_info.size > buffer.SizeBytes()) { break; } - copies.push_back({ - .bufferOffset = offset, - .bufferRowLength = mip_pitch, - .bufferImageHeight = mip_height, + buffer_copies.push_back(vk::BufferImageCopy{ + .bufferOffset = mip_info.offset, + .bufferRowLength = mip_info.pitch, + .bufferImageHeight = mip_info.height, .imageSubresource{ .aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil, - .mipLevel = m, + .mipLevel = mip, .baseArrayLayer = 0, - .layerCount = num_layers, + .layerCount = image.info.resources.layers, }, .imageOffset = {0, 0, 0}, .imageExtent = {width, height, depth}, }); + copy_size += mip_info.size; } - if (!copies.empty()) { - scheduler.EndRendering(); - const vk::BufferMemoryBarrier2 pre_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, - .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, - .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, - .buffer = buffer.Handle(), - .offset = max_offset - size, - .size = size, - }; - const vk::BufferMemoryBarrier2 post_barrier = { - .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, - .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, - .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, - .dstAccessMask = 
vk::AccessFlagBits2::eMemoryRead, - .buffer = buffer.Handle(), - .offset = max_offset - size, - .size = size, - }; - auto barriers = image.GetBarriers(vk::ImageLayout::eTransferSrcOptimal, - vk::AccessFlagBits2::eTransferRead, - vk::PipelineStageFlagBits2::eTransfer, {}); - const auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &pre_barrier, - .imageMemoryBarrierCount = static_cast(barriers.size()), - .pImageMemoryBarriers = barriers.data(), - }); - cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.Handle(), - copies); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ - .dependencyFlags = vk::DependencyFlagBits::eByRegion, - .bufferMemoryBarrierCount = 1, - .pBufferMemoryBarriers = &post_barrier, - }); + if (copy_size == 0) { + return false; } + scheduler.EndRendering(); + const vk::BufferMemoryBarrier2 pre_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .srcAccessMask = vk::AccessFlagBits2::eMemoryRead, + .dstStageMask = vk::PipelineStageFlagBits2::eTransfer, + .dstAccessMask = vk::AccessFlagBits2::eTransferWrite, + .buffer = buffer.Handle(), + .offset = buf_offset, + .size = copy_size, + }; + const vk::BufferMemoryBarrier2 post_barrier = { + .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, + .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, + .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, + .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, + .buffer = buffer.Handle(), + .offset = buf_offset, + .size = copy_size, + }; + auto barriers = + image.GetBarriers(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, + vk::PipelineStageFlagBits2::eTransfer, {}); + auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + 
.bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &pre_barrier, + .imageMemoryBarrierCount = static_cast(barriers.size()), + .pImageMemoryBarriers = barriers.data(), + }); + auto& tile_manager = texture_cache.GetTileManager(); + tile_manager.TileImage(image.image, buffer_copies, buffer.Handle(), buf_offset, image.info); + cmdbuf = scheduler.CommandBuffer(); + cmdbuf.pipelineBarrier2(vk::DependencyInfo{ + .dependencyFlags = vk::DependencyFlagBits::eByRegion, + .bufferMemoryBarrierCount = 1, + .pBufferMemoryBarriers = &post_barrier, + }); return true; } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 2aa67ee42..aecc97db0 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -102,15 +102,14 @@ public: /// Retrieves a utility buffer optimized for specified memory usage. StreamBuffer& GetUtilityBuffer(MemoryUsage usage) noexcept { - switch (usage) { - case MemoryUsage::Stream: + if (usage == MemoryUsage::Stream) { return stream_buffer; - case MemoryUsage::Download: + } else if (usage == MemoryUsage::Download) { return download_buffer; - case MemoryUsage::Upload: - return staging_buffer; - case MemoryUsage::DeviceLocal: + } else if (usage == MemoryUsage::DeviceLocal) { return device_buffer; + } else { + return staging_buffer; } } @@ -200,7 +199,7 @@ private: template void ChangeRegister(BufferId buffer_id); - void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, + bool SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer); vk::Buffer UploadCopies(Buffer& buffer, std::span copies, diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index e88147eb5..486bc51dc 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -16,6 +16,7 @@ set(SHADER_FILES fs_tri.vert fsr.comp 
post_process.frag + tiling.comp ) set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) diff --git a/src/video_core/host_shaders/tiling.comp b/src/video_core/host_shaders/tiling.comp new file mode 100644 index 000000000..14bb21547 --- /dev/null +++ b/src/video_core/host_shaders/tiling.comp @@ -0,0 +1,444 @@ +// SPDX-FileCopyrightText: Copyright 2025 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 core + +#extension GL_GOOGLE_include_directive : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_shader_explicit_arithmetic_types : require + +layout (local_size_x = 64, local_size_y = 1, local_size_z = 1) in; + +// #define BITS_PER_PIXEL +// #define NUM_SAMPLES +// #define MICRO_TILE_MODE +// #define ARRAY_MODE +// #define MICRO_TILE_THICKNESS +// #define PIPE_CONFIG +// #define BANK_WIDTH +// #define BANK_HEIGHT +// #define NUM_BANKS +// #define NUM_BANK_BITS +// #define TILE_SPLIT_BYTES +// #define MACRO_TILE_ASPECT + +#define BYTES_PER_PIXEL (BITS_PER_PIXEL / 8) + +#if BITS_PER_PIXEL == 8 +#define BLOCK_TYPE uint8_t +#elif BITS_PER_PIXEL == 16 +#define BLOCK_TYPE uint16_t +#elif BITS_PER_PIXEL == 32 +#define BLOCK_TYPE uint32_t +#elif BITS_PER_PIXEL == 64 +#define BLOCK_TYPE u32vec2 +#elif BITS_PER_PIXEL == 96 +#define BLOCK_TYPE u32vec3 +#else +#define BLOCK_TYPE u32vec4 +#endif + +#if PIPE_CONFIG == ADDR_SURF_P2 +#define NUM_PIPES 2 +#define NUM_PIPE_BITS 1 +#else +#define NUM_PIPES 8 +#define NUM_PIPE_BITS 3 +#endif + +#define MICRO_TILE_WIDTH 8 +#define MICRO_TILE_HEIGHT 8 +#define MICRO_TILE_PIXELS (MICRO_TILE_WIDTH * MICRO_TILE_HEIGHT) +#define MICRO_TILE_BITS (MICRO_TILE_PIXELS * MICRO_TILE_THICKNESS * BITS_PER_PIXEL * NUM_SAMPLES) +#define MICRO_TILE_BYTES (MICRO_TILE_BITS / 8) + +#define NUM_PIPE_INTERLEAVE_BITS 8 + +#define ADDR_SURF_DISPLAY_MICRO_TILING 0 +#define ADDR_SURF_THIN_MICRO_TILING 1 +#define ADDR_SURF_DEPTH_MICRO_TILING 2 +#define ADDR_SURF_ROTATED_MICRO_TILING 3 + +#define 
ARRAY_LINEAR_GENERAL 0 +#define ARRAY_LINEAR_ALIGNED 1 +#define ARRAY_1D_TILED_THIN1 2 +#define ARRAY_1D_TILED_THICK 3 +#define ARRAY_2D_TILED_THIN1 4 +#define ARRAY_PRT_TILED_THIN1 5 +#define ARRAY_PRT_2D_TILED_THIN1 6 +#define ARRAY_2D_TILED_THICK 7 +#define ARRAY_2D_TILED_XTHICK 8 +#define ARRAY_PRT_TILED_THICK 9 +#define ARRAY_PRT_2D_TILED_THICK 10 +#define ARRAY_PRT_3D_TILED_THIN1 11 +#define ARRAY_3D_TILED_THIN1 12 +#define ARRAY_3D_TILED_THICK 13 +#define ARRAY_3D_TILED_XTHICK 14 +#define ARRAY_PRT_3D_TILED_THICK 15 + +#define ADDR_SURF_P2 0 +#define ADDR_SURF_P8_32x32_8x16 10 +#define ADDR_SURF_P8_32x32_16x16 12 + +#define BITS_PER_BYTE 8 +#define BITS_TO_BYTES(x) (((x) + (BITS_PER_BYTE-1)) / BITS_PER_BYTE) + +#define _BIT(v, b) bitfieldExtract((v), (b), 1) + +struct MipInfo { + uint size; + uint pitch; + uint height; + uint offset; +}; + +layout (set = 0, binding = 0, scalar) buffer InputBuf { + BLOCK_TYPE tiled_data[]; +}; + +layout (set = 0, binding = 1, scalar) buffer OutputBuf { + BLOCK_TYPE linear_data[]; +}; + +layout (set = 0, binding = 2, scalar) uniform TilingInfo { + uint bank_swizzle; + uint num_slices; + uint num_mips; + MipInfo mips[16]; +} info; + +uint32_t ComputePixelIndexWithinMicroTile(uint32_t x, uint32_t y, uint32_t z) { + uint32_t p0 = 0; + uint32_t p1 = 0; + uint32_t p2 = 0; + uint32_t p3 = 0; + uint32_t p4 = 0; + uint32_t p5 = 0; + uint32_t p6 = 0; + uint32_t p7 = 0; + uint32_t p8 = 0; + + uint32_t x0 = _BIT(x, 0); + uint32_t x1 = _BIT(x, 1); + uint32_t x2 = _BIT(x, 2); + uint32_t y0 = _BIT(y, 0); + uint32_t y1 = _BIT(y, 1); + uint32_t y2 = _BIT(y, 2); + uint32_t z0 = _BIT(z, 0); + uint32_t z1 = _BIT(z, 1); + uint32_t z2 = _BIT(z, 2); + +#if MICRO_TILE_MODE == ADDR_SURF_DISPLAY_MICRO_TILING + #if BITS_PER_PIXEL == 8 + p0 = x0; + p1 = x1; + p2 = x2; + p3 = y1; + p4 = y0; + p5 = y2; + #elif BITS_PER_PIXEL == 16 + p0 = x0; + p1 = x1; + p2 = x2; + p3 = y0; + p4 = y1; + p5 = y2; + #elif BITS_PER_PIXEL == 32 + p0 = x0; + p1 = x1; + p2 = 
y0; + p3 = x2; + p4 = y1; + p5 = y2; + #elif BITS_PER_PIXEL == 64 + p0 = x0; + p1 = y0; + p2 = x1; + p3 = x2; + p4 = y1; + p5 = y2; + #elif BITS_PER_PIXEL == 128 + p0 = y0; + p1 = x0; + p2 = x1; + p3 = x2; + p4 = y1; + p5 = y2; + #endif +#elif MICRO_TILE_MODE == ADDR_SURF_THIN_MICRO_TILING || MICRO_TILE_MODE == ADDR_SURF_DEPTH_MICRO_TILING + p0 = x0; + p1 = y0; + p2 = x1; + p3 = y1; + p4 = x2; + p5 = y2; +#else + #if BITS_PER_PIXEL == 8 || BITS_PER_PIXEL == 16 + p0 = x0; + p1 = y0; + p2 = x1; + p3 = y1; + p4 = z0; + p5 = z1; + #elif BITS_PER_PIXEL == 32 + p0 = x0; + p1 = y0; + p2 = x1; + p3 = z0; + p4 = y1; + p5 = z1; + #elif BITS_PER_PIXEL == 64 || BITS_PER_PIXEL == 128 + p0 = x0; + p1 = y0; + p2 = z0; + p3 = x1; + p4 = y1; + p5 = z1; + #endif + p6 = x2; + p7 = y2; + + #if MICRO_TILE_THICKNESS == 8 + p8 = z2; + #endif +#endif + + uint32_t pixel_number = + ((p0) | (p1 << 1) | (p2 << 2) | (p3 << 3) | (p4 << 4) | + (p5 << 5) | (p6 << 6) | (p7 << 7) | (p8 << 8)); + + return pixel_number; +} + +#if ARRAY_MODE == ARRAY_1D_TILED_THIN1 || ARRAY_MODE == ARRAY_1D_TILED_THICK +uint32_t ComputeSurfaceAddrFromCoordMicroTiled(uint32_t x, uint32_t y, uint32_t slice, uint32_t pitch, uint32_t height, uint32_t sample_index) { + uint32_t slice_bytes = BITS_TO_BYTES(pitch * height * MICRO_TILE_THICKNESS * BITS_PER_PIXEL * NUM_SAMPLES); + + uint32_t micro_tiles_per_row = pitch / MICRO_TILE_WIDTH; + uint32_t micro_tile_index_x = x / MICRO_TILE_WIDTH; + uint32_t micro_tile_index_y = y / MICRO_TILE_HEIGHT; + uint32_t micro_tile_index_z = slice / MICRO_TILE_THICKNESS; + + uint32_t slice_offset = micro_tile_index_z * slice_bytes; + uint32_t micro_tile_offset = (micro_tile_index_y * micro_tiles_per_row + micro_tile_index_x) * MICRO_TILE_BYTES; + + uint32_t pixel_index = ComputePixelIndexWithinMicroTile(x, y, slice); + + uint32_t sample_offset; + uint32_t pixel_offset; +#if MICRO_TILE_MODE == ADDR_SURF_DEPTH_MICRO_TILING + sample_offset = sample_index * BITS_PER_PIXEL; + pixel_offset = 
pixel_index * BITS_PER_PIXEL * NUM_SAMPLES; +#else + sample_offset = sample_index * (MICRO_TILE_BYTES * 8 / NUM_SAMPLES); + pixel_offset = pixel_index * BITS_PER_PIXEL; +#endif + + uint32_t elem_offset = (sample_offset + pixel_offset) / 8; + return slice_offset + micro_tile_offset + elem_offset; +} +#else +uint32_t ComputePipeFromCoord(uint32_t x, uint32_t y, uint32_t slice) { + uint32_t p0 = 0; + uint32_t p1 = 0; + uint32_t p2 = 0; + + uint32_t tx = x / MICRO_TILE_WIDTH; + uint32_t ty = y / MICRO_TILE_HEIGHT; + uint32_t x3 = _BIT(tx, 0); + uint32_t x4 = _BIT(tx, 1); + uint32_t x5 = _BIT(tx, 2); + uint32_t y3 = _BIT(ty, 0); + uint32_t y4 = _BIT(ty, 1); + uint32_t y5 = _BIT(ty, 2); + +#if PIPE_CONFIG == ADDR_SURF_P2 + p0 = x3 ^ y3; +#elif PIPE_CONFIG == ADDR_SURF_P8_32x32_8x16 + p0 = x4 ^ y3 ^ x5; + p1 = x3 ^ y4; + p2 = x5 ^ y5; +#elif PIPE_CONFIG == ADDR_SURF_P8_32x32_16x16 + p0 = x3 ^ y3 ^ x4; + p1 = x4 ^ y4; + p2 = x5 ^ y5; +#endif + + uint32_t pipe = p0 | (p1 << 1) | (p2 << 2); + + uint32_t pipe_swizzle = 0; +#if ARRAY_MODE == ARRAY_3D_TILED_THIN1 || ARRAY_MODE == ARRAY_3D_TILED_THICK || ARRAY_MODE == ARRAY_3D_TILED_XTHICK + pipe_swizzle += max(1, NUM_PIPES / 2 - 1) * (slice / MICRO_TILE_THICKNESS); +#endif + pipe_swizzle &= (NUM_PIPES - 1); + pipe = pipe ^ pipe_swizzle; + return pipe; +} + +uint32_t ComputeBankFromCoord(uint32_t x, uint32_t y, uint32_t slice, uint32_t tile_split_slice) { + uint32_t b0 = 0; + uint32_t b1 = 0; + uint32_t b2 = 0; + uint32_t b3 = 0; + uint32_t slice_rotation = 0; + uint32_t tile_split_rotation = 0; + + uint32_t tx = x / MICRO_TILE_WIDTH / (BANK_WIDTH * NUM_PIPES); + uint32_t ty = y / MICRO_TILE_HEIGHT / BANK_HEIGHT; + + uint32_t x3 = _BIT(tx, 0); + uint32_t x4 = _BIT(tx, 1); + uint32_t x5 = _BIT(tx, 2); + uint32_t x6 = _BIT(tx, 3); + uint32_t y3 = _BIT(ty, 0); + uint32_t y4 = _BIT(ty, 1); + uint32_t y5 = _BIT(ty, 2); + uint32_t y6 = _BIT(ty, 3); + +#if NUM_BANKS == 16 + b0 = x3 ^ y6; + b1 = x4 ^ y5 ^ y6; + b2 = x5 ^ y4; + b3 = x6 ^ 
y3; +#elif NUM_BANKS == 8 + b0 = x3 ^ y5; + b1 = x4 ^ y4 ^ y5; + b2 = x5 ^ y3; +#elif NUM_BANKS == 4 + b0 = x3 ^ y4; + b1 = x4 ^ y3; +#elif NUM_BANKS == 2 + b0 = x3 ^ y3; +#endif + + uint32_t bank = b0 | (b1 << 1) | (b2 << 2) | (b3 << 3); + +#if ARRAY_MODE == ARRAY_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_2D_TILED_THICK || ARRAY_MODE == ARRAY_2D_TILED_XTHICK + slice_rotation = ((NUM_BANKS / 2) - 1) * (slice / MICRO_TILE_THICKNESS); +#elif ARRAY_MODE == ARRAY_3D_TILED_THIN1 || ARRAY_MODE == ARRAY_3D_TILED_THICK || ARRAY_MODE == ARRAY_3D_TILED_XTHICK + slice_rotation = max(1u, (NUM_PIPES / 2) - 1) * (slice / MICRO_TILE_THICKNESS) / NUM_PIPES; +#endif + +#if ARRAY_MODE == ARRAY_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_3D_TILED_THIN1 || \ + ARRAY_MODE == ARRAY_PRT_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_3D_TILED_THIN1 + tile_split_rotation = ((NUM_BANKS / 2) + 1) * tile_split_slice; +#endif + + bank ^= info.bank_swizzle + slice_rotation; + bank ^= tile_split_rotation; + bank &= (NUM_BANKS - 1); + + return bank; +} + +uint32_t ComputeSurfaceAddrFromCoordMacroTiled(uint32_t x, uint32_t y, uint32_t slice, uint32_t pitch, uint32_t height, uint32_t sample_index) { + uint32_t pixel_index = ComputePixelIndexWithinMicroTile(x, y, slice); + + uint32_t sample_offset; + uint32_t pixel_offset; +#if MICRO_TILE_MODE == ADDR_SURF_DEPTH_MICRO_TILING + sample_offset = sample_index * BITS_PER_PIXEL; + pixel_offset = pixel_index * BITS_PER_PIXEL * NUM_SAMPLES; +#else + sample_offset = sample_index * (MICRO_TILE_BITS / NUM_SAMPLES); + pixel_offset = pixel_index * BITS_PER_PIXEL; +#endif + + uint32_t element_offset = (pixel_offset + sample_offset) / 8; + + uint32_t slices_per_tile = 1; + uint32_t tile_split_slice = 0; +#if MICRO_TILE_BYTES > TILE_SPLIT_BYTES && MICRO_TILE_THICKNESS == 1 + slices_per_tile = MICRO_TILE_BYTES / TILE_SPLIT_BYTES; + tile_split_slice = element_offset / TILE_SPLIT_BYTES; + element_offset %= TILE_SPLIT_BYTES; + #undef MICRO_TILE_BYTES + #define MICRO_TILE_BYTES 
TILE_SPLIT_BYTES +#endif + + uint32_t macro_tile_pitch = (MICRO_TILE_WIDTH * BANK_WIDTH * NUM_PIPES) * MACRO_TILE_ASPECT; + uint32_t macro_tile_height = (MICRO_TILE_HEIGHT * BANK_HEIGHT * NUM_BANKS) / MACRO_TILE_ASPECT; + + uint32_t macro_tile_bytes = MICRO_TILE_BYTES * + (macro_tile_pitch / MICRO_TILE_WIDTH) * + (macro_tile_height / MICRO_TILE_HEIGHT) / (NUM_PIPES * NUM_BANKS); + + uint32_t macro_tiles_per_row = pitch / macro_tile_pitch; + + uint32_t macro_tile_index_x = x / macro_tile_pitch; + uint32_t macro_tile_index_y = y / macro_tile_height; + uint32_t macro_tile_offset = + ((macro_tile_index_y * macro_tiles_per_row) + macro_tile_index_x) * macro_tile_bytes; + uint32_t macro_tiles_per_slice = macro_tiles_per_row * (height / macro_tile_height); + + uint32_t slice_bytes = macro_tiles_per_slice * macro_tile_bytes; + uint32_t slice_offset = + slice_bytes * (tile_split_slice + slices_per_tile * (slice / MICRO_TILE_THICKNESS)); + + uint32_t tile_row_index = (y / MICRO_TILE_HEIGHT) % BANK_HEIGHT; + uint32_t tile_column_index = ((x / MICRO_TILE_WIDTH) / NUM_PIPES) % BANK_WIDTH; + uint32_t tile_index = (tile_row_index * BANK_WIDTH) + tile_column_index; + uint32_t tile_offset = tile_index * MICRO_TILE_BYTES; + + uint32_t total_offset = slice_offset + macro_tile_offset + element_offset + tile_offset; + +#if ARRAY_MODE == ARRAY_PRT_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_TILED_THICK || \ + ARRAY_MODE == ARRAY_PRT_2D_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_2D_TILED_THICK || \ + ARRAY_MODE == ARRAY_PRT_3D_TILED_THIN1 || ARRAY_MODE == ARRAY_PRT_3D_TILED_THICK + x %= macro_tile_pitch; + y %= macro_tile_height; +#endif + + uint32_t pipe = ComputePipeFromCoord(x, y, slice); + uint32_t bank = ComputeBankFromCoord(x, y, slice, tile_split_slice); + + uint32_t pipe_interleave_mask = (1 << NUM_PIPE_INTERLEAVE_BITS) - 1; + uint32_t pipe_interleave_offset = total_offset & pipe_interleave_mask; + uint32_t offset = total_offset >> NUM_PIPE_INTERLEAVE_BITS; + + uint32_t addr = 
pipe_interleave_offset; + uint32_t pipe_bits = pipe << NUM_PIPE_INTERLEAVE_BITS; + uint32_t bank_bits = bank << (NUM_PIPE_INTERLEAVE_BITS + NUM_PIPE_BITS); + uint32_t offset_bits = offset << (NUM_PIPE_INTERLEAVE_BITS + NUM_PIPE_BITS + NUM_BANK_BITS); + + addr |= pipe_bits; + addr |= bank_bits; + addr |= offset_bits; + + return addr; +} +#endif + +uint GetMipLevel(inout uint texel) { + uint mip = 0; + uint mip_size = info.mips[mip].size / BYTES_PER_PIXEL; + while (texel >= mip_size && mip < info.num_mips) { + texel -= mip_size; + ++mip; + mip_size = info.mips[mip].size / BYTES_PER_PIXEL; + } + return mip; +} + +void main() { + uint texel = gl_GlobalInvocationID.x; + uint mip = GetMipLevel(texel); + uint pitch = info.mips[mip].pitch; + uint height = info.mips[mip].height; + uint tiled_offset = info.mips[mip].offset; + uint x = texel % pitch; + uint y = (texel / pitch) % height; + uint slice = texel / (pitch * height); + +#if ARRAY_MODE == ARRAY_1D_TILED_THIN1 || ARRAY_MODE == ARRAY_1D_TILED_THICK + tiled_offset += ComputeSurfaceAddrFromCoordMicroTiled(x, y, slice, pitch, height, 0); +#else + tiled_offset += ComputeSurfaceAddrFromCoordMacroTiled(x, y, slice, pitch, height, 0); +#endif + +#ifdef IS_TILER + tiled_data[tiled_offset / BYTES_PER_PIXEL] = linear_data[gl_GlobalInvocationID.x]; +#else + linear_data[gl_GlobalInvocationID.x] = tiled_data[tiled_offset / BYTES_PER_PIXEL]; +#endif +} diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 03c13a4cb..e4e628c69 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -253,6 +253,7 @@ bool Instance::CreateDevice() { ASSERT(add_extension(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME)); // Optional + maintenance_8 = add_extension(VK_KHR_MAINTENANCE_8_EXTENSION_NAME); depth_range_unrestricted = add_extension(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME); dynamic_state_3 = 
add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); if (dynamic_state_3) { @@ -459,6 +460,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceVertexAttributeDivisorFeatures{ .vertexAttributeInstanceRateDivisor = true, }, + vk::PhysicalDeviceMaintenance8FeaturesKHR{ + .maintenance8 = true, + }, vk::PhysicalDeviceShaderAtomicFloat2FeaturesEXT{ .shaderBufferFloat32AtomicMinMax = shader_atomic_float2_features.shaderBufferFloat32AtomicMinMax, @@ -527,6 +531,9 @@ bool Instance::CreateDevice() { if (!provoking_vertex) { device_chain.unlink(); } + if (!maintenance_8) { + device_chain.unlink(); + } if (!shader_atomic_float2) { device_chain.unlink(); } diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index c34c12589..be316f6e8 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -109,6 +109,11 @@ public: return vk12_features.shaderInt8; } + /// Returns true if VK_KHR_maintenance8 is supported + bool IsMaintenance8Supported() const { + return maintenance_8; + } + /// Returns true when VK_EXT_custom_border_color is supported bool IsCustomBorderColorSupported() const { return custom_border_color; @@ -469,6 +474,7 @@ private: bool shader_atomic_float2{}; bool workgroup_memory_explicit_layout{}; bool portability_subset{}; + bool maintenance_8{}; bool supports_memory_budget{}; u64 total_memory_budget{}; std::vector valid_heaps; diff --git a/src/video_core/renderer_vulkan/vk_presenter.h b/src/video_core/renderer_vulkan/vk_presenter.h index 8ed2052ee..ea933b21c 100644 --- a/src/video_core/renderer_vulkan/vk_presenter.h +++ b/src/video_core/renderer_vulkan/vk_presenter.h @@ -5,6 +5,7 @@ #include +#include "core/libraries/videoout/buffer.h" #include "imgui/imgui_config.h" #include "imgui/imgui_texture.h" #include "video_core/amdgpu/liverpool.h" diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp 
index fa84a6b42..ec0c38bda 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -457,7 +457,7 @@ void Rasterizer::OnSubmit() { } bool Rasterizer::BindResources(const Pipeline* pipeline) { - if (IsComputeMetaClear(pipeline)) { + if (IsComputeImageCopy(pipeline) || IsComputeMetaClear(pipeline)) { return false; } @@ -523,22 +523,82 @@ bool Rasterizer::IsComputeMetaClear(const Pipeline* pipeline) { // If a shader wants to encode HTILE, for example, from a depth image it will have to compute // proper tile address from dispatch invocation id. This address calculation contains an xor // operation so use it as a heuristic for metadata writes that are probably not clears. - if (info.has_bitwise_xor) { - return false; - } - - // Assume if a shader writes metadata without address calculation, it is a clear shader. - for (const auto& desc : info.buffers) { - const VAddr address = desc.GetSharp(info).base_address; - if (!desc.IsSpecial() && desc.is_written && texture_cache.ClearMeta(address)) { - // Assume all slices were updates - LOG_TRACE(Render_Vulkan, "Metadata update skipped"); - return true; + if (!info.has_bitwise_xor) { + // Assume if a shader writes metadata without address calculation, it is a clear shader. 
+ for (const auto& desc : info.buffers) { + const VAddr address = desc.GetSharp(info).base_address; + if (!desc.IsSpecial() && desc.is_written && texture_cache.ClearMeta(address)) { + // Assume all slices were updates + LOG_TRACE(Render_Vulkan, "Metadata update skipped"); + return true; + } } } return false; } +bool Rasterizer::IsComputeImageCopy(const Pipeline* pipeline) { + if (!pipeline->IsCompute()) { + return false; + } + + // Ensure shader only has 2 bound buffers + const auto& cs_pgm = liverpool->GetCsRegs(); + const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); + if (cs_pgm.num_thread_x.full != 64 || info.buffers.size() != 2 || !info.images.empty()) { + return false; + } + + // Those 2 buffers must both be formatted. One must be source and another destination. + const auto& desc0 = info.buffers[0]; + const auto& desc1 = info.buffers[1]; + if (!desc0.is_formatted || !desc1.is_formatted || desc0.is_written == desc1.is_written) { + return false; + } + + // Buffers must have the same size and each thread of the dispatch must copy 1 dword of data + const AmdGpu::Buffer buf0 = desc0.GetSharp(info); + const AmdGpu::Buffer buf1 = desc1.GetSharp(info); + if (buf0.GetSize() != buf1.GetSize() || cs_pgm.dim_x != (buf0.GetSize() / 256)) { + return false; + } + + // Find images the buffer alias + const auto image0_id = texture_cache.FindImageFromRange(buf0.base_address, buf0.GetSize()); + if (!image0_id) { + return false; + } + const auto image1_id = + texture_cache.FindImageFromRange(buf1.base_address, buf1.GetSize(), false); + if (!image1_id) { + return false; + } + + // Image copy must be valid + VideoCore::Image& image0 = texture_cache.GetImage(image0_id); + VideoCore::Image& image1 = texture_cache.GetImage(image1_id); + if (image0.info.guest_size != image1.info.guest_size || + image0.info.pitch != image1.info.pitch || image0.info.guest_size != buf0.GetSize() || + image0.info.num_bits != image1.info.num_bits) { + return false; + } + + // Perform 
image copy + VideoCore::Image& src_image = desc0.is_written ? image1 : image0; + VideoCore::Image& dst_image = desc0.is_written ? image0 : image1; + if (instance.IsMaintenance8Supported() || + src_image.info.props.is_depth == dst_image.info.props.is_depth) { + dst_image.CopyImage(src_image); + } else { + const auto& copy_buffer = + buffer_cache.GetUtilityBuffer(VideoCore::MemoryUsage::DeviceLocal); + dst_image.CopyImageWithBuffer(src_image, copy_buffer.Handle(), 0); + } + dst_image.flags |= VideoCore::ImageFlagBits::GpuModified; + dst_image.flags &= ~VideoCore::ImageFlagBits::Dirty; + return true; +} + void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Bindings& binding, Shader::PushData& push_data) { buffer_bindings.clear(); @@ -687,7 +747,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin if (image.binding.force_general || image.binding.is_target) { image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | - (image.info.IsDepthStencil() + (image.info.props.is_depth ? vk::AccessFlagBits2::eDepthStencilAttachmentWrite : vk::AccessFlagBits2::eColorAttachmentWrite), {}); @@ -698,7 +758,7 @@ void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindin vk::AccessFlagBits2::eShaderWrite, desc.view_info.range); } else { - const auto new_layout = image.info.IsDepthStencil() + const auto new_layout = image.info.props.is_depth ? 
vk::ImageLayout::eDepthStencilReadOnlyOptimal : vk::ImageLayout::eShaderReadOnlyOptimal; image.Transit(new_layout, vk::AccessFlagBits2::eShaderRead, @@ -823,10 +883,8 @@ void Rasterizer::Resolve() { mrt0_hint}; VideoCore::TextureCache::RenderTargetDesc mrt1_desc{liverpool->regs.color_buffers[1], mrt1_hint}; - auto& mrt0_image = - texture_cache.GetImage(texture_cache.FindImage(mrt0_desc, VideoCore::FindFlags::ExactFmt)); - auto& mrt1_image = - texture_cache.GetImage(texture_cache.FindImage(mrt1_desc, VideoCore::FindFlags::ExactFmt)); + auto& mrt0_image = texture_cache.GetImage(texture_cache.FindImage(mrt0_desc, true)); + auto& mrt1_image = texture_cache.GetImage(texture_cache.FindImage(mrt1_desc, true)); VideoCore::SubresourceRange mrt0_range; mrt0_range.base.layer = liverpool->regs.color_buffers[0].view.slice_start; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a1d59021b..65de62bb4 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -112,6 +112,7 @@ private: } bool IsComputeMetaClear(const Pipeline* pipeline); + bool IsComputeImageCopy(const Pipeline* pipeline); private: friend class VideoCore::BufferCache; diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index 4ab2e991c..c2a8478ca 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -18,10 +18,10 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { vk::ImageUsageFlags usage = vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled; - if (info.IsDepthStencil()) { + if (info.props.is_depth) { usage |= vk::ImageUsageFlagBits::eDepthStencilAttachment; } else { - if (!info.IsBlockCoded()) { + if (!info.props.is_block) { usage |= vk::ImageUsageFlagBits::eColorAttachment; } // In cases where an image is created as a render/depth 
target and cleared with compute, @@ -35,6 +35,22 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) { return usage; } +static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { + switch (type) { + case AmdGpu::ImageType::Color1D: + case AmdGpu::ImageType::Color1DArray: + return vk::ImageType::e1D; + case AmdGpu::ImageType::Color2D: + case AmdGpu::ImageType::Color2DMsaa: + case AmdGpu::ImageType::Color2DArray: + return vk::ImageType::e2D; + case AmdGpu::ImageType::Color3D: + return vk::ImageType::e3D; + default: + UNREACHABLE(); + } +} + static vk::FormatFeatureFlags2 FormatFeatureFlags(const vk::ImageUsageFlags usage_flags) { vk::FormatFeatureFlags2 feature_flags{}; if (usage_flags & vk::ImageUsageFlagBits::eTransferSrc) { @@ -132,7 +148,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, const auto supported_format = instance->GetSupportedFormat(info.pixel_format, format_features); const vk::PhysicalDeviceImageFormatInfo2 format_info{ .format = supported_format, - .type = info.type, + .type = ConvertImageType(info.type), .tiling = tiling, .usage = usage_flags, .flags = flags, @@ -141,7 +157,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, instance->GetPhysicalDevice().getImageFormatProperties2(format_info); if (image_format_properties.result == vk::Result::eErrorFormatNotSupported) { LOG_ERROR(Render_Vulkan, "image format {} type {} is not supported (flags {}, usage {})", - vk::to_string(supported_format), vk::to_string(info.type), + vk::to_string(supported_format), vk::to_string(format_info.type), vk::to_string(format_info.flags), vk::to_string(format_info.usage)); } const auto supported_samples = @@ -151,7 +167,7 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, const vk::ImageCreateInfo image_ci = { .flags = flags, - .imageType = info.type, + .imageType = ConvertImageType(info.type), .format = supported_format, .extent{ .width = 
info.size.width, @@ -168,9 +184,9 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, image.Create(image_ci); - Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {}x{}x{} {:#x}:{:#x}", - info.size.width, info.size.height, info.size.depth, info.guest_address, - info.guest_size); + Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {}x{}x{} {} {:#x}:{:#x}", + info.size.width, info.size.height, info.size.depth, + AmdGpu::NameOf(info.tile_mode), info.guest_address, info.guest_size); } boost::container::small_vector Image::GetBarriers( @@ -325,38 +341,41 @@ void Image::Upload(vk::Buffer buffer, u64 offset) { vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); } -void Image::CopyImage(const Image& src_image) { - scheduler->EndRendering(); - Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); - - auto cmdbuf = scheduler->CommandBuffer(); +void Image::CopyImage(Image& src_image) { const auto& src_info = src_image.info; - - boost::container::small_vector image_copy{}; const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels); - for (u32 m = 0; m < num_mips; ++m) { - const auto mip_w = std::max(src_info.size.width >> m, 1u); - const auto mip_h = std::max(src_info.size.height >> m, 1u); - const auto mip_d = std::max(src_info.size.depth >> m, 1u); + ASSERT(src_info.resources.layers == info.resources.layers || num_mips == 1); - image_copy.emplace_back(vk::ImageCopy{ + boost::container::small_vector image_copies; + for (u32 mip = 0; mip < num_mips; ++mip) { + const auto mip_w = std::max(src_info.size.width >> mip, 1u); + const auto mip_h = std::max(src_info.size.height >> mip, 1u); + const auto mip_d = std::max(src_info.size.depth >> mip, 1u); + + image_copies.emplace_back(vk::ImageCopy{ .srcSubresource{ - .aspectMask = src_image.aspect_mask, - .mipLevel = m, + .aspectMask = src_image.aspect_mask & 
~vk::ImageAspectFlagBits::eStencil, + .mipLevel = mip, .baseArrayLayer = 0, .layerCount = src_info.resources.layers, }, .dstSubresource{ - .aspectMask = src_image.aspect_mask, - .mipLevel = m, + .aspectMask = aspect_mask & ~vk::ImageAspectFlagBits::eStencil, + .mipLevel = mip, .baseArrayLayer = 0, .layerCount = src_info.resources.layers, }, .extent = {mip_w, mip_h, mip_d}, }); } + + scheduler->EndRendering(); + src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); + Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); + + auto cmdbuf = scheduler->CommandBuffer(); cmdbuf.copyImage(src_image.image, src_image.last_state.layout, image, last_state.layout, - image_copy); + image_copies); Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits2::eShaderRead | vk::AccessFlagBits2::eTransferRead, {}); @@ -364,32 +383,29 @@ void Image::CopyImage(const Image& src_image) { void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) { const auto& src_info = src_image.info; + const u32 num_mips = std::min(src_info.resources.levels, info.resources.levels); + ASSERT(src_info.resources.layers == info.resources.layers || num_mips == 1); - vk::BufferImageCopy buffer_image_copy = { - .bufferOffset = offset, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource = - { - .aspectMask = src_info.IsDepthStencil() ? 
vk::ImageAspectFlagBits::eDepth - : vk::ImageAspectFlagBits::eColor, - .mipLevel = 0, + boost::container::small_vector buffer_copies; + for (u32 mip = 0; mip < num_mips; ++mip) { + const auto mip_w = std::max(src_info.size.width >> mip, 1u); + const auto mip_h = std::max(src_info.size.height >> mip, 1u); + const auto mip_d = std::max(src_info.size.depth >> mip, 1u); + + buffer_copies.emplace_back(vk::BufferImageCopy{ + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource{ + .aspectMask = src_image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil, + .mipLevel = mip, .baseArrayLayer = 0, - .layerCount = 1, + .layerCount = src_info.resources.layers, }, - .imageOffset = - { - .x = 0, - .y = 0, - .z = 0, - }, - .imageExtent = - { - .width = src_info.size.width, - .height = src_info.size.height, - .depth = src_info.size.depth, - }, - }; + .imageOffset = {0, 0, 0}, + .imageExtent = {mip_w, mip_h, mip_d}, + }); + } const vk::BufferMemoryBarrier2 pre_copy_barrier = { .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, @@ -416,7 +432,6 @@ void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {}); auto cmdbuf = scheduler->CommandBuffer(); - cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -424,7 +439,7 @@ void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) }); cmdbuf.copyImageToBuffer(src_image.image, vk::ImageLayout::eTransferSrcOptimal, buffer, - buffer_image_copy); + buffer_copies); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, @@ -432,11 +447,11 @@ void Image::CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset) .pBufferMemoryBarriers = &post_copy_barrier, }); - buffer_image_copy.imageSubresource.aspectMask = - info.IsDepthStencil() ? 
vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor; + for (auto& copy : buffer_copies) { + copy.imageSubresource.aspectMask = aspect_mask & ~vk::ImageAspectFlagBits::eStencil; + } - cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, - buffer_image_copy); + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, buffer_copies); } void Image::CopyMip(const Image& src_image, u32 mip, u32 slice) { diff --git a/src/video_core/texture_cache/image.h b/src/video_core/texture_cache/image.h index b791b548b..c30edad79 100644 --- a/src/video_core/texture_cache/image.h +++ b/src/video_core/texture_cache/image.h @@ -103,7 +103,7 @@ struct Image { std::optional range, vk::CommandBuffer cmdbuf = {}); void Upload(vk::Buffer buffer, u64 offset); - void CopyImage(const Image& src_image); + void CopyImage(Image& src_image); void CopyImageWithBuffer(Image& src_image, vk::Buffer buffer, u64 offset); void CopyMip(const Image& src_image, u32 mip, u32 slice); diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index ed10a20bf..0e1f10bfe 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -2,12 +2,15 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" -#include "common/config.h" #include "core/libraries/kernel/process.h" +#include "core/libraries/videoout/buffer.h" +#include "shader_recompiler/info.h" #include "video_core/renderer_vulkan/liverpool_to_vk.h" #include "video_core/texture_cache/image_info.h" #include "video_core/texture_cache/tile.h" +#include + namespace VideoCore { using namespace Vulkan; @@ -32,30 +35,15 @@ static vk::Format ConvertPixelFormat(const VideoOutFormat format) { return {}; } -static vk::ImageType ConvertImageType(AmdGpu::ImageType type) noexcept { - switch (type) { - case AmdGpu::ImageType::Color1D: - case AmdGpu::ImageType::Color1DArray: - return vk::ImageType::e1D; - 
case AmdGpu::ImageType::Color2D: - case AmdGpu::ImageType::Color2DMsaa: - case AmdGpu::ImageType::Color2DArray: - return vk::ImageType::e2D; - case AmdGpu::ImageType::Color3D: - return vk::ImageType::e3D; - default: - UNREACHABLE(); - } -} - ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, VAddr cpu_address) noexcept { const auto& attrib = group.attrib; props.is_tiled = attrib.tiling_mode == TilingMode::Tile; - tiling_mode = props.is_tiled ? AmdGpu::TilingMode::Display_MacroTiled - : AmdGpu::TilingMode::Display_Linear; + tile_mode = + props.is_tiled ? AmdGpu::TileMode::Display2DThin : AmdGpu::TileMode::DisplayLinearAligned; + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = ConvertPixelFormat(attrib.pixel_format); - type = vk::ImageType::e2D; + type = AmdGpu::ImageType::Color2D; size.width = attrib.width; size.height = attrib.height; pitch = attrib.tiling_mode == TilingMode::Linear ? size.width : (size.width + 127) & (~127); @@ -63,26 +51,18 @@ ImageInfo::ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, ASSERT(num_bits == 32); guest_address = cpu_address; - if (!props.is_tiled) { - guest_size = pitch * size.height * 4; - } else { - if (Libraries::Kernel::sceKernelIsNeoMode()) { - guest_size = pitch * ((size.height + 127) & (~127)) * 4; - } else { - guest_size = pitch * ((size.height + 63) & (~63)) * 4; - } - } - mips_layout.emplace_back(guest_size, pitch, 0); + UpdateSize(); } ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, const AmdGpu::Liverpool::CbDbExtent& hint /*= {}*/) noexcept { props.is_tiled = buffer.IsTiled(); - tiling_mode = buffer.GetTilingMode(); + tile_mode = buffer.GetTileMode(); + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = LiverpoolToVK::SurfaceFormat(buffer.GetDataFmt(), buffer.GetNumberFmt()); num_samples = buffer.NumSamples(); num_bits = NumBitsPerBlock(buffer.GetDataFmt()); - type = vk::ImageType::e2D; + type = AmdGpu::ImageType::Color2D; size.width = 
hint.Valid() ? hint.width : buffer.Pitch(); size.height = hint.Valid() ? hint.height : buffer.Height(); size.depth = 1; @@ -94,17 +74,21 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::ColorBuffer& buffer, guest_address = buffer.Address(); const auto color_slice_sz = buffer.GetColorSliceSize(); guest_size = color_slice_sz * buffer.NumSlices(); - mips_layout.emplace_back(color_slice_sz, pitch, 0); - tiling_idx = static_cast(buffer.attrib.tile_mode_index.Value()); + mips_layout.emplace_back(guest_size, pitch, buffer.Height(), 0); alt_tile = Libraries::Kernel::sceKernelIsNeoMode() && buffer.info.alt_tile_mode; } ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slices, VAddr htile_address, const AmdGpu::Liverpool::CbDbExtent& hint, bool write_buffer) noexcept { - props.is_tiled = false; + tile_mode = buffer.GetTileMode(); + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = LiverpoolToVK::DepthFormat(buffer.z_info.format, buffer.stencil_info.format); - type = vk::ImageType::e2D; + type = AmdGpu::ImageType::Color2D; + props.is_tiled = buffer.IsTiled(); + props.is_depth = true; + props.has_stencil = + buffer.stencil_info.format != AmdGpu::Liverpool::DepthBuffer::StencilFormat::Invalid; num_samples = buffer.NumSamples(); num_bits = buffer.NumBits(); size.width = hint.Valid() ? hint.width : buffer.Pitch(); @@ -120,21 +104,22 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slice guest_address = write_buffer ? 
buffer.DepthWriteAddress() : buffer.DepthAddress(); const auto depth_slice_sz = buffer.GetDepthSliceSize(); guest_size = depth_slice_sz * num_slices; - mips_layout.emplace_back(depth_slice_sz, pitch, 0); + mips_layout.emplace_back(guest_size, pitch, buffer.Height(), 0); } ImageInfo::ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& desc) noexcept { - tiling_mode = image.GetTilingMode(); + tile_mode = image.GetTileMode(); + array_mode = AmdGpu::GetArrayMode(tile_mode); pixel_format = LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); - // Override format if image is forced to be a depth target if (desc.is_depth) { pixel_format = LiverpoolToVK::PromoteFormatToDepth(pixel_format); + props.is_depth = true; } - type = ConvertImageType(image.GetType()); + type = image.GetBaseType(); props.is_tiled = image.IsTiled(); - props.is_volume = image.GetType() == AmdGpu::ImageType::Color3D; + props.is_volume = type == AmdGpu::ImageType::Color3D; props.is_pow2 = image.pow2pad; - props.is_block = IsBlockCoded(); + props.is_block = AmdGpu::IsBlockCoded(image.GetDataFmt()); size.width = image.width + 1; size.height = image.height + 1; size.depth = props.is_volume ? 
image.depth + 1 : 1; @@ -143,94 +128,34 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& de resources.layers = image.NumLayers(); num_samples = image.NumSamples(); num_bits = NumBitsPerBlock(image.GetDataFmt()); + bank_swizzle = image.GetBankSwizzle(); guest_address = image.Address(); - mips_layout.reserve(resources.levels); - tiling_idx = image.tiling_index; alt_tile = Libraries::Kernel::sceKernelIsNeoMode() && image.alt_tile_mode; UpdateSize(); } -bool ImageInfo::IsBlockCoded() const { - switch (pixel_format) { - case vk::Format::eBc1RgbaSrgbBlock: - case vk::Format::eBc1RgbaUnormBlock: - case vk::Format::eBc1RgbSrgbBlock: - case vk::Format::eBc1RgbUnormBlock: - case vk::Format::eBc2SrgbBlock: - case vk::Format::eBc2UnormBlock: - case vk::Format::eBc3SrgbBlock: - case vk::Format::eBc3UnormBlock: - case vk::Format::eBc4SnormBlock: - case vk::Format::eBc4UnormBlock: - case vk::Format::eBc5SnormBlock: - case vk::Format::eBc5UnormBlock: - case vk::Format::eBc6HSfloatBlock: - case vk::Format::eBc6HUfloatBlock: - case vk::Format::eBc7SrgbBlock: - case vk::Format::eBc7UnormBlock: - return true; - default: - return false; - } -} - -bool ImageInfo::IsDepthStencil() const { - switch (pixel_format) { - case vk::Format::eD16Unorm: - case vk::Format::eD16UnormS8Uint: - case vk::Format::eD32Sfloat: - case vk::Format::eD32SfloatS8Uint: - return true; - default: - return false; - } -} - -bool ImageInfo::HasStencil() const { - if (pixel_format == vk::Format::eD32SfloatS8Uint || - pixel_format == vk::Format::eD24UnormS8Uint || - pixel_format == vk::Format::eD16UnormS8Uint) { - return true; - } - return false; -} - bool ImageInfo::IsCompatible(const ImageInfo& info) const { return (pixel_format == info.pixel_format && num_samples == info.num_samples && num_bits == info.num_bits); } -bool ImageInfo::IsTilingCompatible(u32 lhs, u32 rhs) const { - if (lhs == rhs) { - return true; - } - if (lhs == 0x0e && rhs == 0x0d) { - return true; - } - if (lhs == 
0x0d && rhs == 0x0e) { - return true; - } - return false; -} - void ImageInfo::UpdateSize() { mips_layout.clear(); MipInfo mip_info{}; guest_size = 0; - for (auto mip = 0u; mip < resources.levels; ++mip) { - auto bpp = num_bits; - auto mip_w = pitch >> mip; - auto mip_h = size.height >> mip; + for (s32 mip = 0; mip < resources.levels; ++mip) { + u32 mip_w = pitch >> mip; + u32 mip_h = size.height >> mip; if (props.is_block) { mip_w = (mip_w + 3) / 4; mip_h = (mip_h + 3) / 4; } mip_w = std::max(mip_w, 1u); mip_h = std::max(mip_h, 1u); - auto mip_d = std::max(size.depth >> mip, 1u); - auto thickness = 1; + u32 mip_d = std::max(size.depth >> mip, 1u); + u32 thickness = 1; if (props.is_pow2) { mip_w = std::bit_ceil(mip_w); @@ -238,35 +163,36 @@ void ImageInfo::UpdateSize() { mip_d = std::bit_ceil(mip_d); } - switch (tiling_mode) { - case AmdGpu::TilingMode::Display_Linear: { - std::tie(mip_info.pitch, mip_info.size) = - ImageSizeLinearAligned(mip_w, mip_h, bpp, num_samples); + switch (array_mode) { + case AmdGpu::ArrayMode::ArrayLinearGeneral: + case AmdGpu::ArrayMode::ArrayLinearAligned: { + std::tie(mip_info.pitch, mip_info.height, mip_info.size) = + ImageSizeLinearAligned(mip_w, mip_h, num_bits, num_samples); break; } - case AmdGpu::TilingMode::Texture_Volume: + case AmdGpu::ArrayMode::Array1DTiledThick: thickness = 4; mip_d += (-mip_d) & (thickness - 1); [[fallthrough]]; - case AmdGpu::TilingMode::Display_MicroTiled: - case AmdGpu::TilingMode::Texture_MicroTiled: { - std::tie(mip_info.pitch, mip_info.size) = - ImageSizeMicroTiled(mip_w, mip_h, thickness, bpp, num_samples); + case AmdGpu::ArrayMode::Array1DTiledThin1: { + std::tie(mip_info.pitch, mip_info.height, mip_info.size) = + ImageSizeMicroTiled(mip_w, mip_h, thickness, num_bits, num_samples); break; } - case AmdGpu::TilingMode::Display_MacroTiled: - case AmdGpu::TilingMode::Texture_MacroTiled: - case AmdGpu::TilingMode::Depth_MacroTiled: { + case AmdGpu::ArrayMode::Array2DTiledThick: + thickness = 4; + mip_d 
+= (-mip_d) & (thickness - 1); + [[fallthrough]]; + case AmdGpu::ArrayMode::Array2DTiledThin1: { ASSERT(!props.is_block); - std::tie(mip_info.pitch, mip_info.size) = ImageSizeMacroTiled( - mip_w, mip_h, thickness, bpp, num_samples, tiling_idx, mip, alt_tile); + std::tie(mip_info.pitch, mip_info.height, mip_info.size) = ImageSizeMacroTiled( + mip_w, mip_h, thickness, num_bits, num_samples, tile_mode, mip, alt_tile); break; } default: { - UNREACHABLE(); + UNREACHABLE_MSG("Unknown array mode {}", magic_enum::enum_name(array_mode)); } } - mip_info.height = mip_h; if (props.is_block) { mip_info.pitch = std::max(mip_info.pitch * 4, 32u); mip_info.height = std::max(mip_info.height * 4, 32u); @@ -283,7 +209,7 @@ s32 ImageInfo::MipOf(const ImageInfo& info) const { return -1; } - if (!IsTilingCompatible(info.tiling_idx, tiling_idx)) { + if (info.array_mode != array_mode) { return -1; } @@ -321,7 +247,7 @@ s32 ImageInfo::MipOf(const ImageInfo& info) const { } const auto mip_d = std::max(info.size.depth >> mip, 1u); - if (info.type == vk::ImageType::e3D && type == vk::ImageType::e2D) { + if (info.type == AmdGpu::ImageType::Color3D && type == AmdGpu::ImageType::Color2D) { // In case of 2D array to 3D copy, make sure we have proper number of layers. 
if (resources.layers != mip_d) { return -1; diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index 9fa3b6c3d..00f56b1c7 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -3,16 +3,36 @@ #pragma once +#include + #include "common/types.h" -#include "core/libraries/videoout/buffer.h" -#include "shader_recompiler/info.h" #include "video_core/amdgpu/liverpool.h" +#include "video_core/renderer_vulkan/vk_common.h" #include "video_core/texture_cache/types.h" -#include +namespace AmdGpu { +enum class ImageType : u64; +} + +namespace Libraries::VideoOut { +struct BufferAttributeGroup; +} + +namespace Shader { +struct ImageResource; +} namespace VideoCore { +struct ImageProperties { + u32 is_volume : 1; + u32 is_tiled : 1; + u32 is_pow2 : 1; + u32 is_block : 1; + u32 is_depth : 1; + u32 has_stencil : 1; +}; + struct ImageInfo { ImageInfo() = default; ImageInfo(const Libraries::VideoOut::BufferAttributeGroup& group, VAddr cpu_address) noexcept; @@ -23,61 +43,49 @@ struct ImageInfo { ImageInfo(const AmdGpu::Image& image, const Shader::ImageResource& desc) noexcept; bool IsTiled() const { - return tiling_mode != AmdGpu::TilingMode::Display_Linear; + return tile_mode != AmdGpu::TileMode::DisplayLinearAligned; } Extent3D BlockDim() const { - const u32 shift = props.is_block ? 2 : 0; - return Extent3D{size.width >> shift, size.height >> shift, size.depth}; + return props.is_block ? 
Extent3D{size.width >> 2, size.height >> 2, size.depth} : size; } - bool IsBlockCoded() const; - bool IsDepthStencil() const; - bool HasStencil() const; - s32 MipOf(const ImageInfo& info) const; s32 SliceOf(const ImageInfo& info, s32 mip) const; bool IsCompatible(const ImageInfo& info) const; - bool IsTilingCompatible(u32 lhs, u32 rhs) const; - void UpdateSize(); struct { VAddr cmask_addr; VAddr fmask_addr; VAddr htile_addr; - u32 htile_clear_mask{u32(-1)}; + u32 htile_clear_mask = u32(-1); } meta_info{}; - struct { - u32 is_volume : 1; - u32 is_tiled : 1; - u32 is_pow2 : 1; - u32 is_block : 1; - } props{}; // Surface properties with impact on various calculation factors - + ImageProperties props{}; vk::Format pixel_format = vk::Format::eUndefined; - vk::ImageType type = vk::ImageType::e2D; + AmdGpu::ImageType type; SubresourceExtent resources; Extent3D size{1, 1, 1}; u32 num_bits{}; u32 num_samples = 1; - u32 pitch = 0; - AmdGpu::TilingMode tiling_mode{AmdGpu::TilingMode::Display_Linear}; + u32 pitch{}; + AmdGpu::TileMode tile_mode = AmdGpu::TileMode::DisplayLinearAligned; + AmdGpu::ArrayMode array_mode = AmdGpu::ArrayMode::ArrayLinearAligned; struct MipInfo { u32 size; u32 pitch; u32 height; u32 offset; }; - boost::container::small_vector mips_layout; - VAddr guest_address{0}; - u32 guest_size{0}; - u32 tiling_idx{0}; // TODO: merge with existing! 
- bool alt_tile{false}; + boost::container::static_vector mips_layout; + VAddr guest_address{}; + u32 guest_size{}; + u8 bank_swizzle{}; + bool alt_tile{}; - VAddr stencil_addr{0}; - u32 stencil_size{0}; + VAddr stencil_addr{}; + u32 stencil_size{}; }; } // namespace VideoCore diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 2e162ce83..1b2bc3ae7 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -29,19 +29,18 @@ vk::ImageViewType ConvertImageViewType(AmdGpu::ImageType type) { } } -bool IsViewTypeCompatible(vk::ImageViewType view_type, vk::ImageType image_type) { +bool IsViewTypeCompatible(AmdGpu::ImageType view_type, AmdGpu::ImageType image_type) { switch (view_type) { - case vk::ImageViewType::e1D: - case vk::ImageViewType::e1DArray: - return image_type == vk::ImageType::e1D; - case vk::ImageViewType::e2D: - case vk::ImageViewType::e2DArray: - return image_type == vk::ImageType::e2D || image_type == vk::ImageType::e3D; - case vk::ImageViewType::eCube: - case vk::ImageViewType::eCubeArray: - return image_type == vk::ImageType::e2D; - case vk::ImageViewType::e3D: - return image_type == vk::ImageType::e3D; + case AmdGpu::ImageType::Color1D: + case AmdGpu::ImageType::Color1DArray: + return image_type == AmdGpu::ImageType::Color1D; + case AmdGpu::ImageType::Color2D: + case AmdGpu::ImageType::Color2DArray: + case AmdGpu::ImageType::Color2DMsaa: + case AmdGpu::ImageType::Color2DMsaaArray: + return image_type == AmdGpu::ImageType::Color2D || image_type == AmdGpu::ImageType::Color3D; + case AmdGpu::ImageType::Color3D: + return image_type == AmdGpu::ImageType::Color3D; default: UNREACHABLE(); } @@ -63,7 +62,7 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, const Shader::ImageReso range.base.layer = image.base_array; range.extent.levels = image.NumViewLevels(desc.is_array); range.extent.layers = image.NumViewLayers(desc.is_array); - type = 
ConvertImageViewType(image.GetViewType(desc.is_array)); + type = image.GetViewType(desc.is_array); if (!is_storage) { mapping = Vulkan::LiverpoolToVK::ComponentMapping(image.DstSelect()); @@ -73,7 +72,7 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, const Shader::ImageReso ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::ColorBuffer& col_buffer) noexcept { range.base.layer = col_buffer.view.slice_start; range.extent.layers = col_buffer.NumSlices() - range.base.layer; - type = range.extent.layers > 1 ? vk::ImageViewType::e2DArray : vk::ImageViewType::e2D; + type = range.extent.layers > 1 ? AmdGpu::ImageType::Color2DArray : AmdGpu::ImageType::Color2D; format = Vulkan::LiverpoolToVK::SurfaceFormat(col_buffer.GetDataFmt(), col_buffer.GetNumberFmt()); } @@ -86,7 +85,7 @@ ImageViewInfo::ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer, is_storage = ctl.depth_write_enable; range.base.layer = view.slice_start; range.extent.layers = view.NumSlices() - range.base.layer; - type = range.extent.layers > 1 ? vk::ImageViewType::e2DArray : vk::ImageViewType::e2D; + type = range.extent.layers > 1 ? 
AmdGpu::ImageType::Color2DArray : AmdGpu::ImageType::Color2D; } ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info_, Image& image, @@ -113,7 +112,7 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info const vk::ImageViewCreateInfo image_view_ci = { .pNext = &usage_ci, .image = image.image, - .viewType = info.type, + .viewType = ConvertImageViewType(info.type), .format = instance.GetSupportedFormat(format, image.format_features), .components = info.mapping, .subresourceRange{ @@ -124,9 +123,9 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info .layerCount = info.range.extent.layers, }, }; - if (!IsViewTypeCompatible(image_view_ci.viewType, image.info.type)) { + if (!IsViewTypeCompatible(info.type, image.info.type)) { LOG_ERROR(Render_Vulkan, "image view type {} is incompatible with image type {}", - vk::to_string(image_view_ci.viewType), vk::to_string(image.info.type)); + vk::to_string(image_view_ci.viewType), vk::to_string(image_view_ci.viewType)); } auto [view_result, view] = instance.GetDevice().createImageViewUnique(image_view_ci); diff --git a/src/video_core/texture_cache/image_view.h b/src/video_core/texture_cache/image_view.h index 6a17490bf..a0bcd157a 100644 --- a/src/video_core/texture_cache/image_view.h +++ b/src/video_core/texture_cache/image_view.h @@ -23,7 +23,7 @@ struct ImageViewInfo { ImageViewInfo(const AmdGpu::Liverpool::DepthBuffer& depth_buffer, AmdGpu::Liverpool::DepthView view, AmdGpu::Liverpool::DepthControl ctl); - vk::ImageViewType type = vk::ImageViewType::e2D; + AmdGpu::ImageType type = AmdGpu::ImageType::Color2D; vk::Format format = vk::Format::eR8G8B8A8Unorm; SubresourceRange range; vk::ComponentMapping mapping{}; @@ -45,9 +45,8 @@ struct ImageView { ImageView(ImageView&&) = default; ImageView& operator=(ImageView&&) = default; - ImageId image_id{}; - Extent3D size{0, 0, 0}; - ImageViewInfo info{}; + ImageId image_id; + ImageViewInfo info; 
vk::UniqueImageView image_view; }; diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index fa2029b8f..9f7894f1e 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -1,7 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include #include #include "common/assert.h" @@ -25,7 +24,8 @@ static constexpr u64 NumFramesBeforeRemoval = 32; TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, BufferCache& buffer_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, - blit_helper{instance, scheduler}, tile_manager{instance, scheduler} { + blit_helper{instance, scheduler}, + tile_manager{instance, scheduler, buffer_cache.GetUtilityBuffer(MemoryUsage::Stream)} { // Create basic null image at fixed image ID. const auto null_id = GetNullImage(vk::Format::eR8G8B8A8Unorm); ASSERT(null_id.index == NULL_IMAGE_ID.index); @@ -63,8 +63,8 @@ ImageId TextureCache::GetNullImage(const vk::Format format) { ImageInfo info{}; info.pixel_format = format; - info.type = vk::ImageType::e2D; - info.tiling_idx = static_cast(AmdGpu::TilingMode::Texture_MicroTiled); + info.type = AmdGpu::ImageType::Color2D; + info.tile_mode = AmdGpu::TileMode::Thin1DThin; info.num_bits = 32; info.UpdateSize(); @@ -107,8 +107,8 @@ void TextureCache::DownloadImageMemory(ImageId image_id) { .bufferImageHeight = image.info.size.height, .imageSubresource = { - .aspectMask = image.info.IsDepthStencil() ? vk::ImageAspectFlagBits::eDepth - : vk::ImageAspectFlagBits::eColor, + .aspectMask = image.info.props.is_depth ? 
vk::ImageAspectFlagBits::eDepth + : vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = image.info.resources.layers, @@ -196,11 +196,12 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi ImageId cache_image_id) { auto& cache_image = slot_images[cache_image_id]; - if (!cache_image.info.IsDepthStencil() && !requested_info.IsDepthStencil()) { + if (!cache_image.info.props.is_depth && !requested_info.props.is_depth) { return {}; } - const bool stencil_match = requested_info.HasStencil() == cache_image.info.HasStencil(); + const bool stencil_match = + requested_info.props.has_stencil == cache_image.info.props.has_stencil; const bool bpp_match = requested_info.num_bits == cache_image.info.num_bits; // If an image in the cache has less slices we need to expand it @@ -210,27 +211,27 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi case BindingType::Texture: // The guest requires a depth sampled texture, but cache can offer only Rxf. Need to // recreate the image. - recreate |= requested_info.IsDepthStencil() && !cache_image.info.IsDepthStencil(); + recreate |= requested_info.props.is_depth && !cache_image.info.props.is_depth; break; case BindingType::Storage: // If the guest is going to use previously created depth as storage, the image needs to be // recreated. (TODO: Probably a case with linear rgba8 aliasing is legit) - recreate |= cache_image.info.IsDepthStencil(); + recreate |= cache_image.info.props.is_depth; break; case BindingType::RenderTarget: // Render target can have only Rxf format. If the cache contains only Dx[S8] we need to // re-create the image. - ASSERT(!requested_info.IsDepthStencil()); - recreate |= cache_image.info.IsDepthStencil(); + ASSERT(!requested_info.props.is_depth); + recreate |= cache_image.info.props.is_depth; break; case BindingType::DepthTarget: // The guest has requested previously allocated texture to be bound as a depth target. 
// In this case we need to convert Rx float to a Dx[S8] as requested - recreate |= !cache_image.info.IsDepthStencil(); + recreate |= !cache_image.info.props.is_depth; // The guest is trying to bind a depth target and cache has it. Need to be sure that aspects // and bpp match - recreate |= cache_image.info.IsDepthStencil() && !(stencil_match && bpp_match); + recreate |= cache_image.info.props.is_depth && !(stencil_match && bpp_match); break; default: break; @@ -251,9 +252,13 @@ ImageId TextureCache::ResolveDepthOverlap(const ImageInfo& requested_info, Bindi if (cache_image.info.num_samples == 1 && new_info.num_samples == 1) { // Perform depth<->color copy using the intermediate copy buffer. - const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal); - new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0); - } else if (cache_image.info.num_samples == 1 && new_info.IsDepthStencil() && + if (instance.IsMaintenance8Supported()) { + new_image.CopyImage(cache_image); + } else { + const auto& copy_buffer = buffer_cache.GetUtilityBuffer(MemoryUsage::DeviceLocal); + new_image.CopyImageWithBuffer(cache_image, copy_buffer.Handle(), 0); + } + } else if (cache_image.info.num_samples == 1 && new_info.props.is_depth && new_info.num_samples > 1) { // Perform a rendering pass to transfer the channels of source as samples in dest. blit_helper.BlitColorToMsDepth(cache_image, new_image); @@ -294,12 +299,12 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag return {depth_image_id, -1, -1}; } - if (image_info.IsBlockCoded() && !tex_cache_image.info.IsBlockCoded()) { - // Compressed view of uncompressed image with same block size. - // We need to recreate the image with compressed format and copy. + // Compressed view of uncompressed image with same block size. 
+ if (image_info.props.is_block && !tex_cache_image.info.props.is_block) { return {ExpandImage(image_info, cache_image_id), -1, -1}; } + // Size and resources are less than or equal, use image view. if (image_info.pixel_format != tex_cache_image.info.pixel_format || image_info.guest_size <= tex_cache_image.info.guest_size) { auto result_id = merged_image_id ? merged_image_id : cache_image_id; @@ -309,16 +314,15 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag return {is_compatible ? result_id : ImageId{}, -1, -1}; } + // Size and resources are greater, expand the image. if (image_info.type == tex_cache_image.info.type && image_info.resources > tex_cache_image.info.resources) { - // Size and resources are greater, expand the image. return {ExpandImage(image_info, cache_image_id), -1, -1}; } - if (image_info.tiling_mode != tex_cache_image.info.tiling_mode) { - // Size is greater but resources are not, because the tiling mode is different. - // Likely this memory address is being reused for a different image with a different - // tiling mode. + // Size is greater but resources are not, because the tiling mode is different. + // Likely the address is reused for an image with a different tiling mode. + if (image_info.tile_mode != tex_cache_image.info.tile_mode) { if (safe_to_delete) { FreeImage(cache_image_id); } @@ -346,9 +350,9 @@ std::tuple TextureCache::ResolveOverlap(const ImageInfo& imag // Left overlap, the image from cache is a possible subresource of the image requested if (auto mip = tex_cache_image.info.MipOf(image_info); mip >= 0) { if (auto slice = tex_cache_image.info.SliceOf(image_info, mip); slice >= 0) { + // We have a larger image created and a separate one, representing a subres of it + // bound as render target. In this case we need to rebind render target. if (tex_cache_image.binding.is_target) { - // We have a larger image created and a separate one, representing a subres of - // it, bound as render target. 
In this case we need to rebind render target. tex_cache_image.binding.needs_rebind = 1u; if (merged_image_id) { GetImage(merged_image_id).binding.is_target = 1u; @@ -385,7 +389,6 @@ ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) { auto& src_image = slot_images[image_id]; auto& new_image = slot_images[new_image_id]; - src_image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {}); RefreshImage(new_image); new_image.CopyImage(src_image); @@ -400,7 +403,7 @@ ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) { return new_image_id; } -ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { +ImageId TextureCache::FindImage(BaseDesc& desc, bool exact_fmt) { const auto& info = desc.info; if (info.guest_address == 0) [[unlikely]] { @@ -420,28 +423,22 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { if (cache_image.info.guest_address != info.guest_address) { continue; } - if (False(flags & FindFlags::RelaxSize) && cache_image.info.guest_size != info.guest_size) { + if (cache_image.info.guest_size != info.guest_size) { continue; } - if (False(flags & FindFlags::RelaxDim) && cache_image.info.size != info.size) { + if (cache_image.info.size != info.size) { continue; } - if (False(flags & FindFlags::RelaxFmt) && - (!IsVulkanFormatCompatible(cache_image.info.pixel_format, info.pixel_format) || - (cache_image.info.type != info.type && info.size != Extent3D{1, 1, 1}))) { + if (!IsVulkanFormatCompatible(cache_image.info.pixel_format, info.pixel_format) || + (cache_image.info.type != info.type && info.size != Extent3D{1, 1, 1})) { continue; } - if (True(flags & FindFlags::ExactFmt) && - info.pixel_format != cache_image.info.pixel_format) { + if (exact_fmt && info.pixel_format != cache_image.info.pixel_format) { continue; } image_id = cache_id; } - if (True(flags & FindFlags::NoCreate) && !image_id) { - return {}; - } - // Try to resolve overlaps (if any) int 
view_mip{-1}; int view_slice{-1}; @@ -463,8 +460,7 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { if (image_id) { Image& image_resolved = slot_images[image_id]; - if (True(flags & FindFlags::ExactFmt) && - info.pixel_format != image_resolved.info.pixel_format) { + if (exact_fmt && info.pixel_format != image_resolved.info.pixel_format) { // Cannot reuse this image as we need the exact requested format. image_id = {}; } else if (image_resolved.info.resources < info.resources) { @@ -495,6 +491,37 @@ ImageId TextureCache::FindImage(BaseDesc& desc, FindFlags flags) { return image_id; } +ImageId TextureCache::FindImageFromRange(VAddr address, size_t size, bool ensure_valid) { + boost::container::small_vector image_ids; + ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) { + if (image.info.guest_address != address) { + return; + } + if (ensure_valid && (False(image.flags & ImageFlagBits::GpuModified) || + True(image.flags & ImageFlagBits::Dirty))) { + return; + } + image_ids.push_back(image_id); + }); + if (image_ids.size() == 1) { + // Sometimes image size might not exactly match with requested buffer size + // If we only found 1 candidate image use it without too many questions. 
+ return image_ids.back(); + } + if (!image_ids.empty()) { + for (s32 i = 0; i < image_ids.size(); ++i) { + Image& image = slot_images[image_ids[i]]; + if (image.info.guest_size == size) { + return image_ids[i]; + } + } + LOG_WARNING(Render_Vulkan, + "Failed to find exact image match for copy addr={:#x}, size={:#x}", address, + size); + } + return {}; +} + ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo& view_info) { Image& image = slot_images[image_id]; if (const ImageViewId view_id = image.FindView(view_info); view_id) { @@ -511,8 +538,7 @@ ImageView& TextureCache::FindTexture(ImageId image_id, const BaseDesc& desc) { Image& image = slot_images[image_id]; if (desc.type == BindingType::Storage) { image.flags |= ImageFlagBits::GpuModified; - if (Config::readbackLinearImages() && - image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { + if (Config::readbackLinearImages() && !image.info.props.is_tiled) { download_images.emplace(image_id); } } @@ -524,10 +550,6 @@ ImageView& TextureCache::FindRenderTarget(BaseDesc& desc) { const ImageId image_id = FindImage(desc); Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; - if (Config::readbackLinearImages() && - image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear) { - download_images.emplace(image_id); - } image.usage.render_target = 1u; UpdateImage(image_id); @@ -552,7 +574,7 @@ ImageView& TextureCache::FindDepthTarget(BaseDesc& desc) { Image& image = slot_images[image_id]; image.flags |= ImageFlagBits::GpuModified; image.usage.depth_target = 1u; - image.usage.stencil = image.info.HasStencil(); + image.usage.stencil = image.info.props.has_stencil; UpdateImage(image_id); // Register meta data for this depth buffer @@ -589,11 +611,7 @@ ImageView& TextureCache::FindDepthTarget(BaseDesc& desc) { } void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_scheduler /*= nullptr*/) { - if (False(image.flags & ImageFlagBits::Dirty)) { - 
return; - } - - if (image.info.num_samples > 1) { + if (False(image.flags & ImageFlagBits::Dirty) || image.info.num_samples > 1) { return; } @@ -644,15 +662,10 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const u32 extent_width = mip_pitch ? std::min(mip_pitch, width) : width; const u32 extent_height = mip_height ? std::min(mip_height, height) : height; - const bool is_volume = image.info.tiling_mode == AmdGpu::TilingMode::Texture_Volume; - const u32 height_aligned = mip_height && image.info.IsTiled() && !is_volume - ? std::max(mip_height, 8U) - : mip_height; - image_copy.push_back({ .bufferOffset = mip_offset, .bufferRowLength = mip_pitch, - .bufferImageHeight = height_aligned, + .bufferImageHeight = mip_height, .imageSubresource{ .aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil, .mipLevel = m, @@ -674,13 +687,10 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const VAddr image_addr = image.info.guest_address; const size_t image_size = image.info.guest_size; - const auto [vk_buffer, buf_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size); - - const auto cmdbuf = sched_ptr->CommandBuffer(); - - // The obtained buffer may be GPU modified so we need to emit a barrier to prevent RAW hazard - if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead, + const auto [in_buffer, in_offset] = buffer_cache.ObtainBufferForImage(image_addr, image_size); + if (auto barrier = in_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead, vk::PipelineStageFlagBits2::eTransfer)) { + const auto cmdbuf = sched_ptr->CommandBuffer(); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -689,7 +699,8 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule } const auto [buffer, offset] = - tile_manager.TryDetile(vk_buffer->Handle(), buf_offset, image.info); + 
!custom_scheduler ? tile_manager.DetileImage(in_buffer->Handle(), in_offset, image.info) + : std::make_pair(in_buffer->Handle(), in_offset); for (auto& copy : image_copy) { copy.bufferOffset += offset; } @@ -715,6 +726,7 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const auto image_barriers = image.GetBarriers(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, vk::PipelineStageFlagBits2::eTransfer, {}); + const auto cmdbuf = sched_ptr->CommandBuffer(); cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, @@ -911,8 +923,8 @@ void TextureCache::RunGarbageCollector() { --num_deletions; auto& image = slot_images[image_id]; const bool download = image.SafeToDownload(); - const bool linear = image.info.tiling_mode == AmdGpu::TilingMode::Display_Linear; - if (!linear && download) { + const bool tiled = image.info.IsTiled(); + if (tiled && download) { // This is a workaround for now. We can't handle non-linear image downloads. return false; } diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c4f09f6a0..b63a7abf2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -26,17 +26,6 @@ namespace VideoCore { class BufferCache; class PageManager; -enum class FindFlags { - NoCreate = 1 << 0, ///< Do not create an image if searching for one fails. - RelaxDim = 1 << 1, ///< Do not check the dimentions of image, only address. - RelaxSize = 1 << 2, ///< Do not check that the size matches exactly. - RelaxFmt = 1 << 3, ///< Do not check that format is compatible. - ExactFmt = 1 << 4, ///< Require the format to be exactly the same. 
-}; -DECLARE_ENUM_FLAG_OPERATORS(FindFlags) - -static constexpr u32 MaxInvalidateDist = 12_MB; - class TextureCache { // Default values for garbage collection static constexpr s64 DEFAULT_PRESSURE_GC_MEMORY = 1_GB + 512_MB; @@ -103,6 +92,10 @@ public: BufferCache& buffer_cache, PageManager& tracker); ~TextureCache(); + TileManager& GetTileManager() noexcept { + return tile_manager; + } + /// Invalidates any image in the logical page range. void InvalidateMemory(VAddr addr, size_t size); @@ -116,7 +109,10 @@ public: void ProcessDownloadImages(); /// Retrieves the image handle of the image with the provided attributes. - [[nodiscard]] ImageId FindImage(BaseDesc& desc, FindFlags flags = {}); + [[nodiscard]] ImageId FindImage(BaseDesc& desc, bool exact_fmt = false); + + /// Retrieves image whose address matches provided + [[nodiscard]] ImageId FindImageFromRange(VAddr address, size_t size, bool ensure_valid = true); /// Retrieves an image view with the properties of the specified image id. [[nodiscard]] ImageView& FindTexture(ImageId image_id, const BaseDesc& desc); @@ -145,6 +141,7 @@ public: [[nodiscard]] ImageId ResolveDepthOverlap(const ImageInfo& requested_info, BindingType binding, ImageId cache_img_id); + /// Creates a new image with provided image info and copies subresources from image_id [[nodiscard]] ImageId ExpandImage(const ImageInfo& info, ImageId image_id); /// Reuploads image contents. 
diff --git a/src/video_core/texture_cache/tile.h b/src/video_core/texture_cache/tile.h index 54938b801..68c9428fe 100644 --- a/src/video_core/texture_cache/tile.h +++ b/src/video_core/texture_cache/tile.h @@ -6,6 +6,10 @@ #include "common/assert.h" #include "common/types.h" +namespace AmdGpu { +enum class TileMode : u32; +} + namespace VideoCore { // clang-format off @@ -285,17 +289,17 @@ constexpr std::array macro_tile_extents_alt{ constexpr std::pair micro_tile_extent{8u, 8u}; constexpr auto hw_pipe_interleave = 256u; -constexpr std::pair GetMacroTileExtents(u32 tiling_idx, u32 bpp, u32 num_samples, - bool alt) { +constexpr std::pair GetMacroTileExtents(AmdGpu::TileMode tile_mode, u32 bpp, + u32 num_samples, bool alt) { ASSERT(num_samples <= 8); const auto samples_log = static_cast(std::log2(num_samples)); - const auto row = tiling_idx * 5; + const auto row = u32(tile_mode) * 5; const auto column = std::bit_width(bpp) - 4; // bpps are 8, 16, 32, 64, 128 return (alt ? macro_tile_extents_alt : macro_tile_extents)[samples_log][row + column]; } -constexpr std::pair ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp, - u32 num_samples) { +constexpr std::tuple ImageSizeLinearAligned(u32 pitch, u32 height, u32 bpp, + u32 num_samples) { const auto pitch_align = std::max(8u, 64u / ((bpp + 7) / 8)); auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = height; @@ -305,11 +309,11 @@ constexpr std::pair ImageSizeLinearAligned(u32 pitch, u32 height, u pitch_aligned += pitch_align; log_sz = pitch_aligned * height_aligned * num_samples; } - return {pitch_aligned, (log_sz * bpp + 7) / 8}; + return {pitch_aligned, height_aligned, (log_sz * bpp + 7) / 8}; } -constexpr std::pair ImageSizeMicroTiled(u32 pitch, u32 height, u32 thickness, u32 bpp, - u32 num_samples) { +constexpr std::tuple ImageSizeMicroTiled(u32 pitch, u32 height, u32 thickness, + u32 bpp, u32 num_samples) { const auto& [pitch_align, height_align] = micro_tile_extent; 
auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = (height + height_align - 1) & ~(height_align - 1); @@ -318,14 +322,14 @@ constexpr std::pair ImageSizeMicroTiled(u32 pitch, u32 height, u32 pitch_aligned += pitch_align; log_sz = (pitch_aligned * height_aligned * bpp * num_samples + 7) / 8; } - return {pitch_aligned, log_sz}; + return {pitch_aligned, height_aligned, log_sz}; } -constexpr std::pair ImageSizeMacroTiled(u32 pitch, u32 height, u32 thickness, u32 bpp, - u32 num_samples, u32 tiling_idx, u32 mip_n, - bool alt) { - const auto& [pitch_align, height_align] = - GetMacroTileExtents(tiling_idx, bpp, num_samples, alt); +constexpr std::tuple ImageSizeMacroTiled(u32 pitch, u32 height, u32 thickness, + u32 bpp, u32 num_samples, + AmdGpu::TileMode tile_mode, u32 mip_n, + bool alt) { + const auto [pitch_align, height_align] = GetMacroTileExtents(tile_mode, bpp, num_samples, alt); ASSERT(pitch_align != 0 && height_align != 0); bool downgrade_to_micro = false; if (mip_n > 0) { @@ -341,7 +345,7 @@ constexpr std::pair ImageSizeMacroTiled(u32 pitch, u32 height, u32 const auto pitch_aligned = (pitch + pitch_align - 1) & ~(pitch_align - 1); const auto height_aligned = (height + height_align - 1) & ~(height_align - 1); const auto log_sz = pitch_aligned * height_aligned * num_samples; - return {pitch_aligned, (log_sz * bpp + 7) / 8}; + return {pitch_aligned, height_aligned, (log_sz * bpp + 7) / 8}; } } // namespace VideoCore diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index dd6fae457..d872f8b2e 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "video_core/buffer_cache/buffer.h" #include "video_core/renderer_vulkan/vk_instance.h" #include 
"video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" @@ -8,82 +9,25 @@ #include "video_core/texture_cache/image_view.h" #include "video_core/texture_cache/tile_manager.h" -#include "video_core/host_shaders/detilers/display_micro_64bpp_comp.h" -#include "video_core/host_shaders/detilers/macro_32bpp_comp.h" -#include "video_core/host_shaders/detilers/macro_64bpp_comp.h" -#include "video_core/host_shaders/detilers/macro_8bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_128bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_16bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_32bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_64bpp_comp.h" -#include "video_core/host_shaders/detilers/micro_8bpp_comp.h" +#include "video_core/host_shaders/tiling_comp.h" -// #include #include #include namespace VideoCore { -const DetilerContext* TileManager::GetDetiler(const ImageInfo& info) const { - switch (info.tiling_mode) { - case AmdGpu::TilingMode::Texture_MicroTiled: - switch (info.num_bits) { - case 8: - return &detilers[DetilerType::Micro8]; - case 16: - return &detilers[DetilerType::Micro16]; - case 32: - return &detilers[DetilerType::Micro32]; - case 64: - return &detilers[DetilerType::Micro64]; - case 128: - return &detilers[DetilerType::Micro128]; - default: - return nullptr; - } - case AmdGpu::TilingMode::Texture_Volume: - switch (info.num_bits) { - case 8: - return &detilers[DetilerType::Macro8]; - case 32: - return &detilers[DetilerType::Macro32]; - case 64: - return &detilers[DetilerType::Macro64]; - default: - return nullptr; - } - break; - case AmdGpu::TilingMode::Display_MicroTiled: - switch (info.num_bits) { - case 64: - return &detilers[DetilerType::Display_Micro64]; - default: - return nullptr; - } - break; - default: - return nullptr; - } -} - -struct DetilerParams { - u32 num_levels; - u32 pitch0; - u32 height; - std::array sizes; +struct TilingInfo { + u32 bank_swizzle; + 
u32 num_slices; + u32 num_mips; + std::array mips; }; -TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler) - : instance{instance}, scheduler{scheduler} { - static const std::array detiler_shaders{ - HostShaders::MICRO_8BPP_COMP, HostShaders::MICRO_16BPP_COMP, - HostShaders::MICRO_32BPP_COMP, HostShaders::MICRO_64BPP_COMP, - HostShaders::MICRO_128BPP_COMP, HostShaders::MACRO_8BPP_COMP, - HostShaders::MACRO_32BPP_COMP, HostShaders::MACRO_64BPP_COMP, - HostShaders::DISPLAY_MICRO_64BPP_COMP, - }; - - boost::container::static_vector bindings{ +TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, + StreamBuffer& stream_buffer_) + : instance{instance}, scheduler{scheduler}, stream_buffer{stream_buffer_} { + const auto device = instance.GetDevice(); + const std::array bindings = {{ { .binding = 0, .descriptorType = vk::DescriptorType::eStorageBuffer, @@ -96,88 +40,52 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute, }, - }; + { + .binding = 2, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }, + }}; const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = { .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR, .bindingCount = static_cast(bindings.size()), .pBindings = bindings.data(), }; - auto desc_layout_result = instance.GetDevice().createDescriptorSetLayoutUnique(desc_layout_ci); + auto desc_layout_result = device.createDescriptorSetLayoutUnique(desc_layout_ci); ASSERT_MSG(desc_layout_result.result == vk::Result::eSuccess, "Failed to create descriptor set layout: {}", vk::to_string(desc_layout_result.result)); desc_layout = std::move(desc_layout_result.value); - const vk::PushConstantRange push_constants = { - .stageFlags = vk::ShaderStageFlagBits::eCompute, - .offset = 0, - .size = 
sizeof(DetilerParams), + const vk::DescriptorSetLayout set_layout = *desc_layout; + const vk::PipelineLayoutCreateInfo layout_info = { + .setLayoutCount = 1U, + .pSetLayouts = &set_layout, + .pushConstantRangeCount = 0U, + .pPushConstantRanges = nullptr, }; - - for (int pl_id = 0; pl_id < DetilerType::Max; ++pl_id) { - auto& ctx = detilers[pl_id]; - - const auto& module = Vulkan::Compile( - detiler_shaders[pl_id], vk::ShaderStageFlagBits::eCompute, instance.GetDevice()); - - // Set module debug name - auto module_name = magic_enum::enum_name(static_cast(pl_id)); - Vulkan::SetObjectName(instance.GetDevice(), module, module_name); - - const vk::PipelineShaderStageCreateInfo shader_ci = { - .stage = vk::ShaderStageFlagBits::eCompute, - .module = module, - .pName = "main", - }; - - const vk::DescriptorSetLayout set_layout = *desc_layout; - const vk::PipelineLayoutCreateInfo layout_info = { - .setLayoutCount = 1U, - .pSetLayouts = &set_layout, - .pushConstantRangeCount = 1, - .pPushConstantRanges = &push_constants, - }; - auto [layout_result, layout] = instance.GetDevice().createPipelineLayoutUnique(layout_info); - ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}", - vk::to_string(layout_result)); - ctx.pl_layout = std::move(layout); - - const vk::ComputePipelineCreateInfo compute_pipeline_ci = { - .stage = shader_ci, - .layout = *ctx.pl_layout, - }; - auto result = instance.GetDevice().createComputePipelineUnique( - /*pipeline_cache*/ {}, compute_pipeline_ci); - if (result.result == vk::Result::eSuccess) { - ctx.pl = std::move(result.value); - } else { - UNREACHABLE_MSG("Detiler pipeline creation failed!"); - } - - // Once pipeline is compiled, we don't need the shader module anymore - instance.GetDevice().destroyShaderModule(module); - } + auto [layout_result, layout] = device.createPipelineLayoutUnique(layout_info); + ASSERT_MSG(layout_result == vk::Result::eSuccess, "Failed to create pipeline layout: {}", + 
vk::to_string(layout_result)); + pl_layout = std::move(layout); } TileManager::~TileManager() = default; -TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /*= false*/) { - const auto usage = vk::BufferUsageFlagBits::eStorageBuffer | - (is_storage ? vk::BufferUsageFlagBits::eTransferSrc - : vk::BufferUsageFlagBits::eTransferDst); - const vk::BufferCreateInfo buffer_ci{ +TileManager::ScratchBuffer TileManager::GetScratchBuffer(u32 size) { + constexpr auto usage = + vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst; + + const vk::BufferCreateInfo buffer_ci = { .size = size, .usage = usage, }; - VmaAllocationCreateInfo alloc_info{ - .flags = !is_storage ? VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | - VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT - : static_cast(0), + const VmaAllocationCreateInfo alloc_info{ .usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, - .requiredFlags = !is_storage ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT - : static_cast(0), }; VkBuffer buffer; @@ -189,67 +97,120 @@ TileManager::ScratchBuffer TileManager::AllocBuffer(u32 size, bool is_storage /* return {buffer, allocation}; } -void TileManager::Upload(ScratchBuffer buffer, const void* data, size_t size) { - VmaAllocationInfo alloc_info{}; - vmaGetAllocationInfo(instance.GetAllocator(), buffer.second, &alloc_info); - ASSERT(size <= alloc_info.size); - void* ptr{}; - const auto result = vmaMapMemory(instance.GetAllocator(), buffer.second, &ptr); - ASSERT(result == VK_SUCCESS); - std::memcpy(ptr, data, size); - vmaUnmapMemory(instance.GetAllocator(), buffer.second); +vk::Pipeline TileManager::GetTilingPipeline(const ImageInfo& info, bool is_tiler) { + const u32 pl_id = u32(info.tile_mode) * NUM_BPPS + std::bit_width(info.num_bits) - 4; + auto& tiling_pipelines = is_tiler ? 
tilers : detilers; + if (auto pipeline = *tiling_pipelines[pl_id]; pipeline != VK_NULL_HANDLE) { + return pipeline; + } + + const auto device = instance.GetDevice(); + std::vector defines = { + fmt::format("BITS_PER_PIXEL={}", info.num_bits), + fmt::format("NUM_SAMPLES={}", info.num_samples), + fmt::format("ARRAY_MODE={}", u32(info.array_mode)), + fmt::format("MICRO_TILE_MODE={}", u32(AmdGpu::GetMicroTileMode(info.tile_mode))), + fmt::format("MICRO_TILE_THICKNESS={}", AmdGpu::GetMicroTileThickness(info.array_mode)), + }; + if (AmdGpu::IsMacroTiled(info.array_mode)) { + const auto macro_tile_mode = + AmdGpu::CalculateMacrotileMode(info.tile_mode, info.num_bits, info.num_samples); + const u32 num_banks = AmdGpu::GetNumBanks(macro_tile_mode); + defines.emplace_back( + fmt::format("PIPE_CONFIG={}", u32(AmdGpu::GetPipeConfig(info.tile_mode)))); + defines.emplace_back(fmt::format("BANK_WIDTH={}", AmdGpu::GetBankWidth(macro_tile_mode))); + defines.emplace_back(fmt::format("BANK_HEIGHT={}", AmdGpu::GetBankHeight(macro_tile_mode))); + defines.emplace_back(fmt::format("NUM_BANKS={}", num_banks)); + defines.emplace_back(fmt::format("NUM_BANK_BITS={}", std::bit_width(num_banks) - 1)); + defines.emplace_back( + fmt::format("TILE_SPLIT_BYTES={}", AmdGpu::GetTileSplit(info.tile_mode))); + defines.emplace_back( + fmt::format("MACRO_TILE_ASPECT={}", AmdGpu::GetMacrotileAspect(macro_tile_mode))); + } + if (is_tiler) { + defines.emplace_back(fmt::format("IS_TILER=1")); + } + + const auto& module = Vulkan::Compile(HostShaders::TILING_COMP, + vk::ShaderStageFlagBits::eCompute, device, defines); + const auto module_name = fmt::format("{}_{} {}", magic_enum::enum_name(info.tile_mode), + info.num_bits, is_tiler ? 
"tiler" : "detiler"); + LOG_WARNING(Render_Vulkan, "Compiling shader {}", module_name); + for (const auto& def : defines) { + LOG_WARNING(Render_Vulkan, "#define {}", def); + } + Vulkan::SetObjectName(device, module, module_name); + const vk::PipelineShaderStageCreateInfo shader_ci = { + .stage = vk::ShaderStageFlagBits::eCompute, + .module = module, + .pName = "main", + }; + const vk::ComputePipelineCreateInfo compute_pipeline_ci = { + .stage = shader_ci, + .layout = *pl_layout, + }; + auto [result, pipeline] = + device.createComputePipelineUnique(VK_NULL_HANDLE, compute_pipeline_ci); + ASSERT_MSG(result == vk::Result::eSuccess, "Detiler pipeline creation failed {}", + vk::to_string(result)); + tiling_pipelines[pl_id] = std::move(pipeline); + device.destroyShaderModule(module); + return *tiling_pipelines[pl_id]; } -void TileManager::FreeBuffer(ScratchBuffer buffer) { - vmaDestroyBuffer(instance.GetAllocator(), buffer.first, buffer.second); -} - -std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_offset, - const ImageInfo& info) { +TileManager::Result TileManager::DetileImage(vk::Buffer in_buffer, u32 in_offset, + const ImageInfo& info) { if (!info.props.is_tiled) { return {in_buffer, in_offset}; } - const auto* detiler = GetDetiler(info); - if (!detiler) { - if (info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled && - info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled && - info.tiling_mode != AmdGpu::TilingMode::Depth_MacroTiled) { - LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})", - vk::to_string(info.pixel_format), NameOf(info.tiling_mode)); + TilingInfo params{}; + params.bank_swizzle = info.bank_swizzle; + params.num_slices = info.props.is_volume ? 
info.size.depth : info.resources.layers; + params.num_mips = info.resources.levels; + for (u32 mip = 0; mip < params.num_mips; ++mip) { + auto& mip_info = params.mips[mip]; + mip_info = info.mips_layout[mip]; + if (info.props.is_block) { + mip_info.pitch = std::max((mip_info.pitch + 3) / 4, 1U); + mip_info.height = std::max((mip_info.height + 3) / 4, 1U); } - return {in_buffer, in_offset}; } - const u32 image_size = info.guest_size; + const vk::DescriptorBufferInfo params_buffer_info{ + .buffer = stream_buffer.Handle(), + .offset = stream_buffer.Copy(¶ms, sizeof(params), instance.UniformMinAlignment()), + .range = sizeof(params), + }; - // Prepare output buffer - auto out_buffer = AllocBuffer(image_size, true); - scheduler.DeferOperation([=, this]() { FreeBuffer(out_buffer); }); + const auto [out_buffer, out_allocation] = GetScratchBuffer(info.guest_size); + scheduler.DeferOperation([this, out_buffer, out_allocation]() { + vmaDestroyBuffer(instance.GetAllocator(), out_buffer, out_allocation); + }); - auto cmdbuf = scheduler.CommandBuffer(); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, *detiler->pl); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, GetTilingPipeline(info, false)); - const vk::DescriptorBufferInfo input_buffer_info{ + const vk::DescriptorBufferInfo tiled_buffer_info{ .buffer = in_buffer, .offset = in_offset, - .range = image_size, + .range = info.guest_size, }; - const vk::DescriptorBufferInfo output_buffer_info{ - .buffer = out_buffer.first, + const vk::DescriptorBufferInfo linear_buffer_info{ + .buffer = out_buffer, .offset = 0, - .range = image_size, + .range = info.guest_size, }; - std::vector set_writes{ + const std::array set_writes = {{ { .dstSet = VK_NULL_HANDLE, .dstBinding = 0, .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, - .pBufferInfo = &input_buffer_info, + .pBufferInfo = &tiled_buffer_info, }, { .dstSet = VK_NULL_HANDLE, 
@@ -257,41 +218,107 @@ std::pair TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, - .pBufferInfo = &output_buffer_info, + .pBufferInfo = &linear_buffer_info, }, - }; - cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *detiler->pl_layout, 0, - set_writes); + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 2, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &params_buffer_info, + }, + }}; + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pl_layout, 0, set_writes); - DetilerParams params; - params.num_levels = info.resources.levels; - params.pitch0 = info.pitch >> (info.props.is_block ? 2u : 0u); - params.height = info.size.height; - if (info.tiling_mode == AmdGpu::TilingMode::Texture_Volume || - info.tiling_mode == AmdGpu::TilingMode::Display_MicroTiled) { - if (info.resources.levels != 1) { - LOG_ERROR(Render_Vulkan, "Unexpected mipmaps for volume and display tilings {}", - info.resources.levels); + const auto dim_x = (info.guest_size / (info.num_bits / 8)) / 64; + cmdbuf.dispatch(dim_x, 1, 1); + return {out_buffer, 0}; +} + +void TileManager::TileImage(vk::Image in_image, std::span buffer_copies, + vk::Buffer out_buffer, u32 out_offset, const ImageInfo& info) { + if (!info.props.is_tiled) { + for (auto& copy : buffer_copies) { + copy.bufferOffset += out_offset; } - const auto tiles_per_row = info.pitch / 8u; - const auto tiles_per_slice = tiles_per_row * ((info.size.height + 7u) / 8u); - params.sizes[0] = tiles_per_row; - params.sizes[1] = tiles_per_slice; - } else { - ASSERT(info.resources.levels <= params.sizes.size()); - std::memset(&params.sizes, 0, sizeof(params.sizes)); - for (int m = 0; m < info.resources.levels; ++m) { - params.sizes[m] = info.mips_layout[m].size + (m > 0 ? 
params.sizes[m - 1] : 0); + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.copyImageToBuffer(in_image, vk::ImageLayout::eTransferSrcOptimal, out_buffer, + buffer_copies); + return; + } + + TilingInfo params{}; + params.bank_swizzle = info.bank_swizzle; + params.num_slices = info.props.is_volume ? info.size.depth : info.resources.layers; + params.num_mips = static_cast(buffer_copies.size()); + for (u32 mip = 0; mip < params.num_mips; ++mip) { + auto& mip_info = params.mips[mip]; + mip_info = info.mips_layout[mip]; + if (info.props.is_block) { + mip_info.pitch = std::max((mip_info.pitch + 3) / 4, 1U); + mip_info.height = std::max((mip_info.height + 3) / 4, 1U); } } - cmdbuf.pushConstants(*detiler->pl_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(params), - &params); + const vk::DescriptorBufferInfo params_buffer_info{ + .buffer = stream_buffer.Handle(), + .offset = stream_buffer.Copy(&params, sizeof(params), instance.UniformMinAlignment()), + .range = sizeof(params), + }; - ASSERT((image_size % 64) == 0); - const auto num_tiles = image_size / (64 * (info.num_bits / 8)); - cmdbuf.dispatch(num_tiles, 1, 1); - return {out_buffer.first, 0}; + const auto [temp_buffer, temp_allocation] = GetScratchBuffer(info.guest_size); + scheduler.DeferOperation([this, temp_buffer, temp_allocation]() { + vmaDestroyBuffer(instance.GetAllocator(), temp_buffer, temp_allocation); + }); + + const auto cmdbuf = scheduler.CommandBuffer(); + cmdbuf.copyImageToBuffer(in_image, vk::ImageLayout::eTransferSrcOptimal, temp_buffer, + buffer_copies); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, GetTilingPipeline(info, true)); + + const vk::DescriptorBufferInfo tiled_buffer_info{ + .buffer = out_buffer, + .offset = out_offset, + .range = info.guest_size, + }; + + const vk::DescriptorBufferInfo linear_buffer_info{ + .buffer = temp_buffer, + .offset = 0, + .range = info.guest_size, + }; + + const std::array set_writes = {{ + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 0, + 
.dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &tiled_buffer_info, + }, + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 1, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eStorageBuffer, + .pBufferInfo = &linear_buffer_info, + }, + { + .dstSet = VK_NULL_HANDLE, + .dstBinding = 2, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &params_buffer_info, + }, + }}; + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pl_layout, 0, set_writes); + + const auto dim_x = (info.guest_size / (info.num_bits / 8)) / 64; + cmdbuf.dispatch(dim_x, 1, 1); } } // namespace VideoCore diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h index adda16b3d..dc897a31e 100644 --- a/src/video_core/texture_cache/tile_manager.h +++ b/src/video_core/texture_cache/tile_manager.h @@ -4,56 +4,42 @@ #pragma once #include "common/types.h" +#include "video_core/amdgpu/tiling.h" #include "video_core/buffer_cache/buffer.h" namespace VideoCore { -class TextureCache; struct ImageInfo; - -enum DetilerType : u32 { - Micro8, - Micro16, - Micro32, - Micro64, - Micro128, - - Macro8, - Macro32, - Macro64, - - Display_Micro64, - - Max -}; - -struct DetilerContext { - vk::UniquePipeline pl; - vk::UniquePipelineLayout pl_layout; -}; +class StreamBuffer; class TileManager { + static constexpr size_t NUM_BPPS = 5; + public: using ScratchBuffer = std::pair; + using Result = std::pair; - TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler); + explicit TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, + StreamBuffer& stream_buffer); ~TileManager(); - std::pair TryDetile(vk::Buffer in_buffer, u32 in_offset, - const ImageInfo& info); + void TileImage(vk::Image in_image, std::span buffer_copies, + vk::Buffer out_buffer, u32 out_offset, 
const ImageInfo& info); - ScratchBuffer AllocBuffer(u32 size, bool is_storage = false); - void Upload(ScratchBuffer buffer, const void* data, size_t size); - void FreeBuffer(ScratchBuffer buffer); + Result DetileImage(vk::Buffer in_buffer, u32 in_offset, const ImageInfo& info); private: - const DetilerContext* GetDetiler(const ImageInfo& info) const; + vk::Pipeline GetTilingPipeline(const ImageInfo& info, bool is_tiler); + ScratchBuffer GetScratchBuffer(u32 size); private: const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; + StreamBuffer& stream_buffer; vk::UniqueDescriptorSetLayout desc_layout; - std::array detilers; + vk::UniquePipelineLayout pl_layout; + std::array detilers{}; + std::array tilers{}; }; } // namespace VideoCore