shader_recompiler: Replace texel buffers with in-shader buffer format interpretation (#2363)

* shader_recompiler: Replace texel buffers with in-shader buffer format interpretation

* shader_recompiler: Move 10/11-bit float conversion to functions and address some comments.

* vulkan: Remove VK_KHR_maintenance5 as it is no longer needed for buffer views.

* shader_recompiler: Add helpers for composites and bitfields in pack/unpack.

* shader_recompiler: Use initializer_list for bitfield insert helper.
This commit is contained in:
squidbus
2025-02-06 20:40:49 -08:00
committed by GitHub
parent 78b4f10cc6
commit cfe249debe
35 changed files with 1037 additions and 562 deletions

View File

@@ -352,12 +352,9 @@ vk::ComponentMapping ComponentMapping(AmdGpu::CompMapping comp_mapping) {
};
}
static constexpr vk::FormatFeatureFlags2 BufferRead =
vk::FormatFeatureFlagBits2::eUniformTexelBuffer | vk::FormatFeatureFlagBits2::eVertexBuffer;
static constexpr vk::FormatFeatureFlags2 BufferWrite =
vk::FormatFeatureFlagBits2::eStorageTexelBuffer |
vk::FormatFeatureFlagBits2::eStorageReadWithoutFormat |
vk::FormatFeatureFlagBits2::eStorageWriteWithoutFormat;
// Texel buffer feature flags are not needed as format is interpreted in-shader.
static constexpr vk::FormatFeatureFlags2 BufferRead = vk::FormatFeatureFlagBits2::eVertexBuffer;
static constexpr vk::FormatFeatureFlags2 BufferWrite = static_cast<vk::FormatFeatureFlags2>(0);
static constexpr vk::FormatFeatureFlags2 ImageRead = vk::FormatFeatureFlagBits2::eTransferSrc |
vk::FormatFeatureFlagBits2::eTransferDst |
vk::FormatFeatureFlagBits2::eSampledImage;

View File

@@ -55,15 +55,6 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
.stageFlags = vk::ShaderStageFlagBits::eCompute,
});
}
for (const auto& tex_buffer : info->texture_buffers) {
bindings.push_back({
.binding = binding++,
.descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
: vk::DescriptorType::eUniformTexelBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
});
}
for (const auto& image : info->images) {
bindings.push_back({
.binding = binding++,

View File

@@ -375,15 +375,6 @@ void GraphicsPipeline::BuildDescSetLayout() {
.stageFlags = gp_stage_flags,
});
}
for (const auto& tex_buffer : stage->texture_buffers) {
bindings.push_back({
.binding = binding++,
.descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
: vk::DescriptorType::eUniformTexelBuffer,
.descriptorCount = 1,
.stageFlags = gp_stage_flags,
});
}
for (const auto& image : stage->images) {
bindings.push_back({
.binding = binding++,

View File

@@ -268,7 +268,6 @@ bool Instance::CreateDevice() {
null_descriptor =
feature_chain.get<vk::PhysicalDeviceRobustness2FeaturesEXT>().nullDescriptor;
}
maintenance5 = add_extension(VK_KHR_MAINTENANCE_5_EXTENSION_NAME);
custom_border_color = add_extension(VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME);
depth_clip_control = add_extension(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME);
vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME);
@@ -376,9 +375,6 @@ bool Instance::CreateDevice() {
.maintenance4 = true,
},
// Other extensions
vk::PhysicalDeviceMaintenance5FeaturesKHR{
.maintenance5 = true,
},
vk::PhysicalDeviceCustomBorderColorFeaturesEXT{
.customBorderColors = true,
.customBorderColorWithoutFormat = true,
@@ -414,9 +410,6 @@ bool Instance::CreateDevice() {
if (!maintenance4) {
device_chain.unlink<vk::PhysicalDeviceMaintenance4FeaturesKHR>();
}
if (!maintenance5) {
device_chain.unlink<vk::PhysicalDeviceMaintenance5FeaturesKHR>();
}
if (!custom_border_color) {
device_chain.unlink<vk::PhysicalDeviceCustomBorderColorFeaturesEXT>();
}

View File

@@ -114,11 +114,6 @@ public:
return null_descriptor;
}
/// Returns true when VK_KHR_maintenance5 is supported.
bool IsMaintenance5Supported() const {
return maintenance5;
}
/// Returns true when VK_KHR_fragment_shader_barycentric is supported.
bool IsFragmentShaderBarycentricSupported() const {
return fragment_shader_barycentric;
@@ -209,11 +204,6 @@ public:
return properties.limits.minStorageBufferOffsetAlignment;
}
/// Returns the minimum required alignment for texel buffers
vk::DeviceSize TexelBufferMinAlignment() const {
return properties.limits.minTexelBufferOffsetAlignment;
}
/// Returns the minimum alignment required for accessing host-mapped device memory
vk::DeviceSize NonCoherentAtomSize() const {
return properties.limits.nonCoherentAtomSize;
@@ -229,11 +219,6 @@ public:
return properties.limits.maxComputeSharedMemorySize;
}
/// Returns the maximum supported elements in a texel buffer
u32 MaxTexelBufferElements() const {
return properties.limits.maxTexelBufferElements;
}
/// Returns the maximum sampler LOD bias.
float MaxSamplerLodBias() const {
return properties.limits.maxSamplerLodBias;
@@ -317,7 +302,6 @@ private:
bool dynamic_color_write_mask{};
bool vertex_input_dynamic_state{};
bool null_descriptor{};
bool maintenance5{};
bool list_restart{};
bool legacy_vertex_attributes{};
bool shader_stencil_export{};

View File

@@ -29,8 +29,6 @@ using Shader::VsOutput;
constexpr static std::array DescriptorHeapSizes = {
vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192},
vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024},
vk::DescriptorPoolSize{vk::DescriptorType::eUniformTexelBuffer, 128},
vk::DescriptorPoolSize{vk::DescriptorType::eStorageTexelBuffer, 128},
vk::DescriptorPoolSize{vk::DescriptorType::eSampledImage, 8192},
vk::DescriptorPoolSize{vk::DescriptorType::eSampler, 1024},
};

View File

@@ -435,28 +435,6 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
if (pipeline->IsCompute()) {
const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute);
// Most of the time when a metadata is updated with a shader it gets cleared. It means
// we can skip the whole dispatch and update the tracked state instead. Also, it is not
// intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we
// will need its full emulation anyways. For cases of metadata read a warning will be
// logged.
const auto IsMetaUpdate = [&](const auto& desc) {
const auto sharp = desc.GetSharp(info);
const VAddr address = sharp.base_address;
if (desc.is_written) {
// Assume all slices were updated
if (texture_cache.ClearMeta(address)) {
LOG_TRACE(Render_Vulkan, "Metadata update skipped");
return true;
}
} else {
if (texture_cache.IsMeta(address)) {
LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)");
}
}
return false;
};
// Assume if a shader reads and writes metas at the same time, it is a copy shader.
bool meta_read = false;
for (const auto& desc : info.buffers) {
@@ -469,23 +447,26 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
}
}
for (const auto& desc : info.texture_buffers) {
if (!desc.is_written) {
const VAddr address = desc.GetSharp(info).base_address;
meta_read = texture_cache.IsMeta(address);
}
}
// Most of the time when a metadata is updated with a shader it gets cleared. It means
// we can skip the whole dispatch and update the tracked state instead. Also, it is not
// intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we
// will need its full emulation anyways. For cases of metadata read a warning will be
// logged.
if (!meta_read) {
for (const auto& desc : info.buffers) {
if (IsMetaUpdate(desc)) {
return false;
}
}
for (const auto& desc : info.texture_buffers) {
if (IsMetaUpdate(desc)) {
return false;
const auto sharp = desc.GetSharp(info);
const VAddr address = sharp.base_address;
if (desc.is_written) {
// Assume all slices were updated
if (texture_cache.ClearMeta(address)) {
LOG_TRACE(Render_Vulkan, "Metadata update skipped");
return false;
}
} else {
if (texture_cache.IsMeta(address)) {
LOG_WARNING(Render_Vulkan,
"Unexpected metadata read by a CS shader (buffer)");
}
}
}
}
@@ -541,19 +522,6 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
}
}
texbuffer_bindings.clear();
for (const auto& desc : stage.texture_buffers) {
const auto vsharp = desc.GetSharp(stage);
if (vsharp.base_address != 0 && vsharp.GetSize() > 0 &&
vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) {
const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, vsharp.GetSize());
texbuffer_bindings.emplace_back(buffer_id, vsharp);
} else {
texbuffer_bindings.emplace_back(VideoCore::BufferId{}, vsharp);
}
}
// Bind a SSBO to act as shared memory in case of not being able to use a workgroup buffer
// (e.g. when the compute shared memory is bigger than the GPU's shared memory)
if (stage.has_emulated_shared_memory) {
@@ -601,8 +569,9 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE);
}
} else {
const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(
vsharp.base_address, vsharp.GetSize(), desc.is_written, false, buffer_id);
const auto [vk_buffer, offset] =
buffer_cache.ObtainBuffer(vsharp.base_address, vsharp.GetSize(), desc.is_written,
desc.is_formatted, buffer_id);
const u32 alignment =
is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment();
const u32 offset_aligned = Common::AlignDown(offset, alignment);
@@ -617,6 +586,9 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
vk::PipelineStageFlagBits2::eAllCommands)) {
buffer_barriers.emplace_back(*barrier);
}
if (desc.is_written && desc.is_formatted) {
texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, vsharp.GetSize());
}
}
set_writes.push_back({
@@ -630,56 +602,6 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
});
++binding.buffer;
}
for (u32 i = 0; i < texbuffer_bindings.size(); i++) {
const auto& [buffer_id, vsharp] = texbuffer_bindings[i];
const auto& desc = stage.texture_buffers[i];
// Fallback format for null buffer view; never used in valid buffer case.
const auto data_fmt = vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid
? vsharp.GetDataFmt()
: AmdGpu::DataFormat::Format8;
const u32 fmt_stride = AmdGpu::NumBits(data_fmt) >> 3;
vk::BufferView buffer_view;
if (buffer_id) {
const u32 alignment = instance.TexelBufferMinAlignment();
const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer(
vsharp.base_address, vsharp.GetSize(), desc.is_written, true, buffer_id);
const u32 buf_stride = vsharp.GetStride();
ASSERT_MSG(buf_stride % fmt_stride == 0,
"Texel buffer stride must match format stride");
const u32 offset_aligned = Common::AlignDown(offset, alignment);
const u32 adjust = offset - offset_aligned;
ASSERT(adjust % fmt_stride == 0);
push_data.AddTexelOffset(binding.buffer, buf_stride / fmt_stride, adjust / fmt_stride);
buffer_view = vk_buffer->View(offset_aligned, vsharp.GetSize() + adjust,
desc.is_written, data_fmt, vsharp.GetNumberFmt());
if (auto barrier =
vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite
: vk::AccessFlagBits2::eShaderRead,
vk::PipelineStageFlagBits2::eAllCommands)) {
buffer_barriers.emplace_back(*barrier);
}
if (desc.is_written) {
texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, vsharp.GetSize());
}
} else if (instance.IsNullDescriptorSupported()) {
buffer_view = VK_NULL_HANDLE;
} else {
buffer_view =
null_buffer.View(0, fmt_stride, desc.is_written, data_fmt, vsharp.GetNumberFmt());
}
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding.unified++,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = desc.is_written ? vk::DescriptorType::eStorageTexelBuffer
: vk::DescriptorType::eUniformTexelBuffer,
.pTexelBufferView = &buffer_views.emplace_back(buffer_view),
});
++binding.buffer;
}
}
void Rasterizer::BindTextures(const Shader::Info& stage, Shader::Backend::Bindings& binding,

View File

@@ -120,8 +120,6 @@ private:
using BufferBindingInfo = std::pair<VideoCore::BufferId, AmdGpu::Buffer>;
boost::container::static_vector<BufferBindingInfo, 32> buffer_bindings;
using TexBufferBindingInfo = std::pair<VideoCore::BufferId, AmdGpu::Buffer>;
boost::container::static_vector<TexBufferBindingInfo, 32> texbuffer_bindings;
using ImageBindingInfo = std::pair<VideoCore::ImageId, VideoCore::TextureCache::TextureDesc>;
boost::container::static_vector<ImageBindingInfo, 64> image_bindings;
};

View File

@@ -19,9 +19,9 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info,
auto& buffer_cache = rasterizer.GetBufferCache();
// Copy shader defines three formatted buffers as inputs: control, source, and destination.
const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info);
const auto src_buf_sharp = info.texture_buffers[1].GetSharp(info);
const auto dst_buf_sharp = info.texture_buffers[2].GetSharp(info);
const auto ctl_buf_sharp = info.buffers[0].GetSharp(info);
const auto src_buf_sharp = info.buffers[1].GetSharp(info);
const auto dst_buf_sharp = info.buffers[2].GetSharp(info);
const auto buf_stride = src_buf_sharp.GetStride();
ASSERT(buf_stride == dst_buf_sharp.GetStride());
@@ -95,12 +95,10 @@ static bool ExecuteCopyShaderHLE(const Shader::Info& info,
}
// Obtain buffers for the total source and destination ranges.
const auto [src_buf, src_buf_offset] =
buffer_cache.ObtainBuffer(src_buf_sharp.base_address + src_offset_min,
src_offset_max - src_offset_min, false, false);
const auto [dst_buf, dst_buf_offset] =
buffer_cache.ObtainBuffer(dst_buf_sharp.base_address + dst_offset_min,
dst_offset_max - dst_offset_min, true, false);
const auto [src_buf, src_buf_offset] = buffer_cache.ObtainBuffer(
src_buf_sharp.base_address + src_offset_min, src_offset_max - src_offset_min, false);
const auto [dst_buf, dst_buf_offset] = buffer_cache.ObtainBuffer(
dst_buf_sharp.base_address + dst_offset_min, dst_offset_max - dst_offset_min, true);
// Apply found buffer base.
const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start);