Implement shader resource tables

Frodo Baggins 2024-09-24 19:26:48 -07:00
parent 8b139ff5fa
commit c4300dd9ac
30 changed files with 892 additions and 117 deletions

View File

@ -15,6 +15,10 @@ if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_definitions(_DEBUG)
endif()
project(shadPS4)
# Forcing PIE makes sure that the base address is high enough so that it doesn't clash with the PS4 memory.
@ -590,6 +594,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/frontend/structured_control_flow.h
src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp
src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp
src/shader_recompiler/ir/passes/identity_removal_pass.cpp
src/shader_recompiler/ir/passes/ir_passes.h
src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp

View File

@ -13,6 +13,15 @@ DecoderImpl::DecoderImpl() {
DecoderImpl::~DecoderImpl() = default;
std::string DecoderImpl::disassembleInst(ZydisDecodedInstruction& inst,
ZydisDecodedOperand* operands, u64 address) {
const int bufLen = 256;
char szBuffer[bufLen];
ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible,
szBuffer, sizeof(szBuffer), address, ZYAN_NULL);
return szBuffer;
}
void DecoderImpl::printInstruction(void* code, u64 address) {
ZydisDecodedInstruction instruction;
ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT_VISIBLE];
@ -27,11 +36,8 @@ void DecoderImpl::printInstruction(void* code, u64 address) {
void DecoderImpl::printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands,
u64 address) {
const int bufLen = 256;
char szBuffer[bufLen];
ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible,
szBuffer, sizeof(szBuffer), address, ZYAN_NULL);
fmt::print("instruction: {}\n", szBuffer);
std::string s = disassembleInst(inst, operands, address);
fmt::print("instruction: {}\n", s);
}
ZyanStatus DecoderImpl::decodeInstruction(ZydisDecodedInstruction& inst,

View File

@ -14,6 +14,8 @@ public:
DecoderImpl();
~DecoderImpl();
std::string disassembleInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands,
u64 address);
void printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, u64 address);
void printInstruction(void* code, u64 address);
ZyanStatus decodeInstruction(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands,

src/common/hash.h Normal file
View File

@ -0,0 +1,14 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include "common/types.h"
[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) {
return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}
[[nodiscard]] inline u32 HashCombine(const u32 seed, const u32 hash) {
return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}
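A minimal usage sketch (hypothetical inputs, not part of this commit) showing how the combiner folds several hashes into one key, boost::hash_combine style:

u64 key = HashCombine(0ULL, vs_hash); // seed with the first hash
key = HashCombine(key, fs_hash);      // then fold in the rest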

View File

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
@ -146,9 +147,14 @@ void EmitGetGotoVariable(EmitContext&) {
UNREACHABLE_MSG("Unreachable instruction");
}
Id EmitReadConst(EmitContext& ctx) {
return ctx.u32_zero_value;
UNREACHABLE_MSG("Unreachable instruction");
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) {
u32 flatbuf_off_dw = inst->Flags<u32>();
ASSERT(ctx.srt_flatbuf.binding >= 0);
ASSERT(flatbuf_off_dw > 0);
Id index = ctx.ConstU32(flatbuf_off_dw);
auto& buffer = ctx.srt_flatbuf;
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
return ctx.OpLoad(ctx.U32[1], ptr);
}
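For intuition, a rough C++ analogue of the load this emits (the flattened SRT is bound as a uint array in a UBO, and ReadConst becomes an indexed load at the offset the flattening pass baked into the instruction's flags):

u32 ReadConstEquivalent(std::span<const u32> srt_flatbuf, u32 flatbuf_off_dw) {
    return srt_flatbuf[flatbuf_off_dw]; // OpAccessChain through member 0, then OpLoad
}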
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {

View File

@ -62,7 +62,7 @@ void EmitSetVectorRegister(EmitContext& ctx);
void EmitSetGotoVariable(EmitContext& ctx);
void EmitGetGotoVariable(EmitContext& ctx);
void EmitSetScc(EmitContext& ctx);
Id EmitReadConst(EmitContext& ctx);
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst);
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);

View File

@ -4,12 +4,14 @@
#include "common/assert.h"
#include "common/div_ceil.h"
#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
#include "shader_recompiler/ir/passes/srt.h"
#include "video_core/amdgpu/types.h"
#include <boost/container/static_vector.hpp>
#include <fmt/format.h>
#include <numbers>
#include <string_view>
namespace Shader::Backend::SPIRV {
namespace {
@ -435,14 +437,16 @@ void EmitContext::DefinePushDataBlock() {
void EmitContext::DefineBuffers() {
boost::container::small_vector<Id, 8> type_ids;
const auto define_struct = [&](Id record_array_type, bool is_instance_data) {
const auto define_struct = [&](Id record_array_type, bool is_instance_data,
std::optional<std::string_view> explicit_name = {}) {
const Id struct_type{TypeStruct(record_array_type)};
if (std::ranges::find(type_ids, record_array_type.value, &Id::value) != type_ids.end()) {
return struct_type;
}
Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
const auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage)
: fmt::format("{}_cbuf_block_f32", stage);
auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage)
: fmt::format("{}_cbuf_block_f32", stage);
name = explicit_name.value_or(name);
Name(struct_type, name);
Decorate(struct_type, spv::Decoration::Block);
MemberName(struct_type, 0, "data");
@ -451,6 +455,29 @@ void EmitContext::DefineBuffers() {
return struct_type;
};
if (info.has_readconst) {
const Id data_type = U32[1];
const auto storage_class = spv::StorageClass::Uniform;
const Id pointer_type = TypePointer(storage_class, data_type);
const Id record_array_type{
TypeArray(U32[1], ConstU32(static_cast<u32>(info.flattened_ud_buf.num_dwords())))};
const Id struct_type{define_struct(record_array_type, false, "srt_flatbuf_ty")};
const Id struct_pointer_type{TypePointer(storage_class, struct_type)};
const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, "srt_flatbuf_ubo");
srt_flatbuf = {
.id = id,
.binding = binding.buffer++,
.pointer_type = pointer_type,
};
interfaces.push_back(id);
}
for (const auto& desc : info.buffers) {
const auto sharp = desc.GetSharp(info);
const bool is_storage = desc.IsStorage(sharp);
@ -471,7 +498,7 @@ void EmitContext::DefineBuffers() {
if (is_storage && !desc.is_written) {
Decorate(id, spv::Decoration::NonWritable);
}
Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sgpr_base));
Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sharp_idx));
buffers.push_back({
.id = id,
@ -495,7 +522,7 @@ void EmitContext::DefineTextureBuffers() {
const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sgpr_base));
Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sharp_idx));
texture_buffers.push_back({
.id = id,
.binding = binding.buffer++,
@ -582,7 +609,7 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) {
}
Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) {
const auto image = ctx.info.ReadUd<AmdGpu::Image>(desc.sgpr_base, desc.dword_offset);
const auto image = ctx.info.ReadUdSharp<AmdGpu::Image>(desc.sharp_idx);
const auto format = desc.is_atomic ? GetFormat(image) : spv::ImageFormat::Unknown;
const u32 sampled = desc.is_storage ? 2 : 1;
switch (desc.type) {
@ -618,8 +645,7 @@ void EmitContext::DefineImagesAndSamplers() {
const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}_{}{}_{:02x}", stage, "img", image_desc.sgpr_base,
image_desc.dword_offset));
Name(id, fmt::format("{}_{}{}", stage, "img", image_desc.sharp_idx));
images.push_back({
.data_types = &data_types,
.id = id,
@ -643,8 +669,7 @@ void EmitContext::DefineImagesAndSamplers() {
const Id id{AddGlobalVariable(sampler_pointer_type, spv::StorageClass::UniformConstant)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}_{}{}_{:02x}", stage, "samp", samp_desc.sgpr_base,
samp_desc.dword_offset));
Name(id, fmt::format("{}_{}{}", stage, "samp", samp_desc.sharp_idx));
samplers.push_back(id);
interfaces.push_back(id);
}

View File

@ -228,6 +228,7 @@ public:
Bindings& binding;
boost::container::small_vector<BufferDefinition, 16> buffers;
boost::container::small_vector<TextureBufferDefinition, 8> texture_buffers;
BufferDefinition srt_flatbuf;
boost::container::small_vector<TextureDefinition, 8> images;
boost::container::small_vector<Id, 4> samplers;

View File

@ -10,6 +10,10 @@ static constexpr u32 SQ_SRC_LITERAL = 0xFF;
void Translator::EmitScalarMemory(const GcnInst& inst) {
switch (inst.opcode) {
// SMRD
case Opcode::S_LOAD_DWORD:
return S_LOAD_DWORD(1, inst);
case Opcode::S_LOAD_DWORDX2:
return S_LOAD_DWORD(2, inst);
case Opcode::S_LOAD_DWORDX4:
return S_LOAD_DWORD(4, inst);
case Opcode::S_LOAD_DWORDX8:

View File

@ -388,7 +388,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
IR::VectorReg dst_reg{attrib.dest_vgpr};
// Read the V# of the attribute to figure out component number and type.
const auto buffer = info.ReadUd<AmdGpu::Buffer>(attrib.sgpr_base, attrib.dword_offset);
const auto buffer = info.ReadUdReg<AmdGpu::Buffer>(attrib.sgpr_base, attrib.dword_offset);
for (u32 i = 0; i < 4; i++) {
const IR::F32 comp = [&] {
switch (buffer.GetSwizzle(i)) {
@ -418,8 +418,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
if (step_rate == Info::VsInput::OverStepRate0 ||
step_rate == Info::VsInput::OverStepRate1) {
info.buffers.push_back({
.sgpr_base = attrib.sgpr_base,
.dword_offset = attrib.dword_offset,
.sharp_idx = info.srt_info.reserve_sharp(attrib.sgpr_base, attrib.dword_offset, 4),
.used_types = IR::Type::F32,
.is_instance_data = true,
});

View File

@ -2,6 +2,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <algorithm>
#include <span>
#include <boost/container/small_vector.hpp>
#include <boost/container/static_vector.hpp>
@ -10,11 +11,13 @@
#include "shader_recompiler/backend/bindings.h"
#include "shader_recompiler/frontend/copy_shader.h"
#include "shader_recompiler/ir/attribute.h"
#include "shader_recompiler/ir/passes/srt.h"
#include "shader_recompiler/ir/reg.h"
#include "shader_recompiler/ir/type.h"
#include "shader_recompiler/params.h"
#include "shader_recompiler/runtime_info.h"
#include "video_core/amdgpu/resource.h"
#include "xbyak/xbyak.h"
namespace Shader {
@ -36,8 +39,7 @@ constexpr u32 NUM_TEXTURE_TYPES = 7;
struct Info;
struct BufferResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
IR::Type used_types;
AmdGpu::Buffer inline_cbuf;
bool is_gds_buffer{};
@ -53,8 +55,7 @@ struct BufferResource {
using BufferResourceList = boost::container::small_vector<BufferResource, 16>;
struct TextureBufferResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
AmdGpu::NumberFormat nfmt;
bool is_written{};
@ -63,8 +64,7 @@ struct TextureBufferResource {
using TextureBufferResourceList = boost::container::small_vector<TextureBufferResource, 16>;
struct ImageResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
AmdGpu::ImageType type;
AmdGpu::NumberFormat nfmt;
bool is_storage{};
@ -77,8 +77,7 @@ struct ImageResource {
using ImageResourceList = boost::container::small_vector<ImageResource, 16>;
struct SamplerResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
AmdGpu::Sampler inline_sampler{};
u32 associated_image : 4;
u32 disable_aniso : 1;
@ -180,6 +179,9 @@ struct Info {
ImageResourceList images;
SamplerResourceList samplers;
PersistentSrtInfo srt_info;
FlattenedUserDataBuffer flattened_ud_buf;
std::span<const u32> user_data;
Stage stage;
@ -199,14 +201,23 @@ struct Info {
bool uses_fp64{};
bool uses_step_rates{};
bool translation_failed{}; // indicates that shader has unsupported instructions
bool has_readconst{};
u8 mrt_mask{0u};
// just for logging, TODO delete
size_t perm_idx;
explicit Info(Stage stage_, ShaderParams params)
: stage{stage_}, pgm_hash{params.hash}, pgm_base{params.Base()},
user_data{params.user_data} {}
template <typename T>
T ReadUd(u32 ptr_index, u32 dword_offset) const noexcept {
inline T ReadUdSharp(u32 sharp_idx) const noexcept {
return flattened_ud_buf.ReadUdSharp<T>(sharp_idx);
}
template <typename T>
T ReadUdReg(u32 ptr_index, u32 dword_offset) const noexcept {
T data;
const u32* base = user_data.data();
if (ptr_index != IR::NumScalarRegs) {
@ -228,7 +239,8 @@ struct Info {
}
void AddBindings(Backend::Bindings& bnd) const {
const auto total_buffers = buffers.size() + texture_buffers.size();
const auto total_buffers =
buffers.size() + texture_buffers.size() + (has_readconst ? 1 : 0);
bnd.buffer += total_buffers;
bnd.unified += total_buffers + images.size() + samplers.size();
bnd.user_data += ud_mask.NumRegs();
@ -245,22 +257,33 @@ struct Info {
}
return {vertex_offset, instance_offset};
}
void RefreshFlatBuf() {
flattened_ud_buf.resize(srt_info.flattened_bufsize_dw);
ASSERT(user_data.size() <= NumUserDataRegs);
std::memcpy(flattened_ud_buf.data(), user_data.data(), user_data.size_bytes());
// Run the JIT program to walk the SRT and write the leaves to a flat buffer
PFN_SrtWalker pfn = srt_info.walker.getCode<PFN_SrtWalker>();
if (pfn) {
pfn(user_data.data(), flattened_ud_buf.data());
}
}
};
constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept {
return inline_cbuf ? inline_cbuf : info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
return inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
}
constexpr AmdGpu::Buffer TextureBufferResource::GetSharp(const Info& info) const noexcept {
return info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
return info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
}
constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept {
return info.ReadUd<AmdGpu::Image>(sgpr_base, dword_offset);
return info.ReadUdSharp<AmdGpu::Image>(sharp_idx);
}
constexpr AmdGpu::Sampler SamplerResource::GetSharp(const Info& info) const noexcept {
return inline_sampler ? inline_sampler : info.ReadUd<AmdGpu::Sampler>(sgpr_base, dword_offset);
return inline_sampler ? inline_sampler : info.ReadUdSharp<AmdGpu::Sampler>(sharp_idx);
}
} // namespace Shader
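To make the flat buffer layout concrete: the first 16 dwords mirror the raw user data registers, and everything past NumUserDataRegs holds leaves written by the JIT walker. A hedged usage sketch (the sharp index 20 is hypothetical; real indices come from reserve_sharp or the flattening pass):

info.RefreshFlatBuf(); // copy user data regs, then run the SRT walker
const auto buf = info.ReadUdSharp<AmdGpu::Buffer>(20); // a flattened leaf past the raw regs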

View File

@ -118,6 +118,10 @@ std::string DumpBlock(const Block& block, const std::map<const Block*, size_t>&
} else {
ret += fmt::format(" {}", op); // '%00000 = ' -> 1 + 5 + 3 = 9 spaces
}
if (op == Opcode::ReadConst) {
ret += fmt::format(" (flags={}) ", inst.Flags<u32>());
}
const size_t arg_count{inst.NumArgs()};
for (size_t arg_index = 0; arg_index < arg_count; ++arg_index) {
const Value arg{inst.Arg(arg_index)};

View File

@ -11,34 +11,37 @@
namespace Shader::IR {
template <typename Pred>
auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t<Pred, const Inst*> {
// Use typename Instruction so the function can be used to return either const or mutable
// Insts depending on the context.
template <typename Instruction, typename Pred>
auto BreadthFirstSearch(Instruction* inst,
Pred&& pred) -> std::invoke_result_t<Pred, Instruction*> {
// Most often, the instruction is already the one desired.
if (const std::optional result = pred(inst)) {
if (std::optional result = pred(inst)) {
return result;
}
// Breadth-first search visiting the right most arguments first
boost::container::small_vector<const Inst*, 2> visited;
std::queue<const Inst*> queue;
boost::container::small_vector<Instruction*, 2> visited;
std::queue<Instruction*> queue;
queue.push(inst);
while (!queue.empty()) {
// Pop one instruction from the queue
const Inst* const inst{queue.front()};
Instruction* inst{queue.front()};
queue.pop();
if (const std::optional result = pred(inst)) {
if (std::optional result = pred(inst)) {
// This is the instruction we were looking for
return result;
}
// Visit the right most arguments first
for (size_t arg = inst->NumArgs(); arg--;) {
const Value arg_value{inst->Arg(arg)};
Value arg_value{inst->Arg(arg)};
if (arg_value.IsImmediate()) {
continue;
}
// Queue instruction if it hasn't been visited
const Inst* const arg_inst{arg_value.InstRecursive()};
Instruction* arg_inst{arg_value.InstRecursive()};
if (std::ranges::find(visited, arg_inst) == visited.end()) {
visited.push_back(arg_inst);
queue.push(arg_inst);
@ -50,8 +53,17 @@ auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t<P
}
template <typename Pred>
auto BreadthFirstSearch(const Value& value, Pred&& pred)
-> std::invoke_result_t<Pred, const Inst*> {
auto BreadthFirstSearch(const Value& value,
Pred&& pred) -> std::invoke_result_t<Pred, const Inst*> {
if (value.IsImmediate()) {
// Nothing to do with immediates
return std::nullopt;
}
return BreadthFirstSearch(value.InstRecursive(), pred);
}
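A short sketch of how the new mutable overload is used (some_value stands for any IR::Value; this is the pattern the flattening pass below relies on, where flags must be writable):

const auto pred = [](Inst* inst) -> std::optional<Inst*> {
    if (inst->GetOpcode() == Opcode::GetUserData) {
        return inst;
    }
    return std::nullopt;
};
if (const auto root = BreadthFirstSearch(some_value, pred)) {
    (*root)->SetFlags<u32>(0); // mutable access is the point of the non-const variant
}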
template <typename Pred>
auto BreadthFirstSearch(Value value, Pred&& pred) -> std::invoke_result_t<Pred, Inst*> {
if (value.IsImmediate()) {
// Nothing to do with immediates
return std::nullopt;

View File

@ -0,0 +1,267 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <unordered_map>
#include <boost/container/flat_map.hpp>
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
#include "common/config.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "common/singleton.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/breadth_first_search.h"
#include "shader_recompiler/ir/opcodes.h"
#include "shader_recompiler/ir/passes/srt.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/ir/reg.h"
#include "shader_recompiler/ir/srt_gvn_table.h"
#include "shader_recompiler/ir/value.h"
#include "src/common/arch.h"
#include "src/common/decoder.h"
using namespace Xbyak::util;
// TODO make sure no problems with identity and Insts being used in maps
static void DumpSrtProgram(const Shader::Info& info, const u8* code, size_t codesize) {
#ifdef ARCH_X86_64
using namespace Common::FS;
const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
if (!std::filesystem::exists(dump_dir)) {
std::filesystem::create_directories(dump_dir);
}
const auto filename =
fmt::format("{}_{:#018x}_{}.srtprogram.txt", info.stage, info.pgm_hash, info.perm_idx);
const auto file = IOFile{dump_dir / filename, FileAccessMode::Write, FileType::TextFile};
u64 address = reinterpret_cast<u64>(code);
u64 code_end = address + codesize;
ZydisDecodedInstruction instruction;
ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
ZyanStatus status = ZYAN_STATUS_SUCCESS;
while (address < code_end && ZYAN_SUCCESS(Common::Decoder::Instance()->decodeInstruction(
instruction, operands, reinterpret_cast<void*>(address)))) {
std::string s =
Common::Decoder::Instance()->disassembleInst(instruction, operands, address);
s += "\n";
file.WriteString(s);
address += instruction.length;
}
#endif
}
namespace {
class SrtCodegen : public Xbyak::CodeGenerator {
public:
SrtCodegen() : CodeGenerator(1_MB) {}
};
using namespace Shader;
struct PassInfo {
// map offset to inst
using PtrUserList = boost::container::flat_map<u32, Shader::IR::Inst*>;
Optimization::SrtGvnTable gvn_table;
// keys are GetUserData or ReadConst instructions that are used as pointers
std::unordered_map<IR::Inst*, PtrUserList> pointer_uses;
// GetUserData instructions corresponding to sgpr_base of SRT roots
boost::container::small_flat_map<IR::ScalarReg, IR::Inst*, 1> srt_roots;
// pick a single inst for a given value number
std::unordered_map<u32, IR::Inst*> vn_to_inst;
// Bumped during codegen to assign offsets to readconsts
u32 dst_off_dw;
PtrUserList* GetUsesAsPointer(IR::Inst* inst) {
auto it = pointer_uses.find(inst);
if (it != pointer_uses.end()) {
return &it->second;
}
return nullptr;
}
// Return a single instruction that this instruction is identical to, according
// to value number
// The "original" is arbitrary. Here it's the first instruction found for a given value number
IR::Inst* DeduplicateInstruction(IR::Inst* inst) {
auto it = vn_to_inst.try_emplace(gvn_table.GetValueNumber(inst), inst);
return it.first->second;
}
};
} // namespace
namespace Shader::Optimization {
namespace {
static IR::Value GetReadConstOff(const IR::Inst* inst) {
ASSERT(inst->GetOpcode() == IR::Opcode::ReadConst);
return inst->Arg(1);
}
static IR::ScalarReg GetUserDataSgprBase(const IR::Inst* inst) {
ASSERT(inst->GetOpcode() == IR::Opcode::GetUserData);
return inst->Arg(0).ScalarReg();
}
static inline void PushPtr(Xbyak::CodeGenerator& c, u32 off_dw) {
c.push(rdi);
c.mov(rdi, ptr[rdi + (off_dw << 2)]);
c.mov(r10, 0xFFFFFFFFFFFFULL);
c.and_(rdi, r10);
}
static inline void PopPtr(Xbyak::CodeGenerator& c) {
c.pop(rdi);
}
static void VisitPointer(u32 off_dw, IR::Inst* subtree, PassInfo& pass_info,
Xbyak::CodeGenerator& c) {
PushPtr(c, off_dw);
PassInfo::PtrUserList* use_list = pass_info.GetUsesAsPointer(subtree);
ASSERT(use_list);
// First copy all the src data from this tree level
// That way, all data that was contiguous in the guest SRT is also contiguous in the
// flattened buffer.
// TODO src and dst are contiguous. Optimize with wider loads/stores
// TODO if this subtree is dynamically indexed, don't compact it (keep it sparse)
for (auto [src_off_dw, use] : *use_list) {
c.mov(r10d, ptr[rdi + (src_off_dw << 2)]);
c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r10d);
use->SetFlags<u32>(pass_info.dst_off_dw);
pass_info.dst_off_dw++;
}
// Then visit any children used as pointers
for (const auto [src_off_dw, use] : *use_list) {
if (pass_info.GetUsesAsPointer(use)) {
VisitPointer(src_off_dw, use, pass_info, c);
}
}
PopPtr(c);
}
static void GenerateSrtProgram(Info& info, PassInfo& pass_info) {
Xbyak::CodeGenerator& c = *Common::Singleton<SrtCodegen>::Instance();
if (info.srt_info.srt_reservations.empty() && pass_info.srt_roots.empty()) {
return;
}
pass_info.dst_off_dw = NumUserDataRegs;
// Special case for V# step rate buffers in fetch shader
for (const auto& res : info.srt_info.srt_reservations) {
// get pointer to V# (guest pointers are 64-bit, so load the full qword)
c.mov(r10, ptr[rdi + (res.sgpr_base << 2)]);
u32 src_off = res.dword_offset << 2;
for (u32 j = 0; j < res.num_dwords; j++) {
c.mov(r11d, ptr[r10 + src_off]);
c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r11d);
src_off += 4;
++pass_info.dst_off_dw;
}
}
ASSERT(pass_info.dst_off_dw == info.srt_info.flattened_bufsize_dw);
for (const auto& [sgpr_base, root] : pass_info.srt_roots) {
VisitPointer(static_cast<u32>(sgpr_base), root, pass_info, c);
}
c.ret();
c.ready();
size_t codesize = c.getSize();
info.srt_info.walker = SmallCodeArray(c.getCode(), codesize);
if (Config::dumpShaders()) {
DumpSrtProgram(info, c.getCode(), codesize);
}
c.reset();
info.srt_info.flattened_bufsize_dw = pass_info.dst_off_dw;
}
} // namespace
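The finished program is entered through PFN_SrtWalker, so under sysv_abi the user data pointer arrives in rdi and the flat destination in rsi, which is why the generated code above addresses everything relative to those two registers. A hypothetical driver, mirroring what Info::RefreshFlatBuf does:

std::array<u32, 16> regs{}; // the raw user data SGPRs
std::vector<u32> flat(info.srt_info.flattened_bufsize_dw);
if (auto pfn = info.srt_info.walker.getCode<PFN_SrtWalker>()) {
    pfn(regs.data(), flat.data()); // rdi = user_data, rsi = flat_dst
}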
void FlattenExtendedUserdataPass(IR::Program& program) {
Shader::Info& info = program.info;
PassInfo pass_info;
// traverse at end and assign offsets to duplicate readconsts, using
// vn_to_inst as the source
boost::container::small_vector<IR::Inst*, 32> all_readconsts;
for (auto r_it = program.post_order_blocks.rbegin(); r_it != program.post_order_blocks.rend();
r_it++) {
IR::Block* block = *r_it;
for (IR::Inst& inst : *block) {
if (inst.GetOpcode() == IR::Opcode::ReadConst) {
if (!GetReadConstOff(&inst).IsImmediate()) {
continue;
}
all_readconsts.push_back(&inst);
if (pass_info.DeduplicateInstruction(&inst) != &inst) {
// This is a duplicate of a readconst we've already visited
continue;
}
IR::Inst* ptr_composite = inst.Arg(0).InstRecursive();
const auto pred = [](IR::Inst* inst) -> std::optional<IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
}
return std::nullopt;
};
auto base0 = IR::BreadthFirstSearch(ptr_composite->Arg(0), pred);
auto base1 = IR::BreadthFirstSearch(ptr_composite->Arg(1), pred);
ASSERT_MSG(base0 && base1, "ReadConst not from constant memory");
IR::Inst* ptr_lo = base0.value();
ptr_lo = pass_info.DeduplicateInstruction(ptr_lo);
auto ptr_uses_kv =
pass_info.pointer_uses.try_emplace(ptr_lo, PassInfo::PtrUserList{});
PassInfo::PtrUserList& user_list = ptr_uses_kv.first->second;
user_list[GetReadConstOff(&inst).U32()] = &inst;
if (ptr_lo->GetOpcode() == IR::Opcode::GetUserData) {
IR::ScalarReg ud_reg = GetUserDataSgprBase(ptr_lo);
pass_info.srt_roots[ud_reg] = ptr_lo;
}
}
}
}
GenerateSrtProgram(info, pass_info);
// Assign offsets to duplicate readconsts
for (IR::Inst* readconst : all_readconsts) {
ASSERT(pass_info.vn_to_inst.contains(pass_info.gvn_table.GetValueNumber(readconst)));
IR::Inst* original = pass_info.DeduplicateInstruction(readconst);
readconst->SetFlags<u32>(original->Flags<u32>());
}
info.RefreshFlatBuf();
}
} // namespace Shader::Optimization

View File

@ -12,6 +12,7 @@ void SsaRewritePass(IR::BlockList& program);
void IdentityRemovalPass(IR::BlockList& program);
void DeadCodeEliminationPass(IR::Program& program);
void ConstantPropagationPass(IR::BlockList& program);
void FlattenExtendedUserdataPass(IR::Program& program);
void ResourceTrackingPass(IR::Program& program);
void CollectShaderInfoPass(IR::Program& program);
void LowerSharedMemToRegisters(IR::Program& program);

View File

@ -13,12 +13,7 @@
namespace Shader::Optimization {
namespace {
struct SharpLocation {
u32 sgpr_base;
u32 dword_offset;
auto operator<=>(const SharpLocation&) const = default;
};
using SharpLocation = u32;
bool IsBufferAtomic(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
@ -155,9 +150,7 @@ public:
if (desc.is_gds_buffer && existing.is_gds_buffer) {
return true;
}
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset &&
desc.inline_cbuf == existing.inline_cbuf;
return desc.sharp_idx == existing.sharp_idx && desc.inline_cbuf == existing.inline_cbuf;
})};
auto& buffer = buffer_resources[index];
buffer.used_types |= desc.used_types;
@ -167,8 +160,7 @@ public:
u32 Add(const TextureBufferResource& desc) {
const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
return desc.sharp_idx == existing.sharp_idx;
})};
auto& buffer = texture_buffer_resources[index];
buffer.is_written |= desc.is_written;
@ -177,8 +169,7 @@ public:
u32 Add(const ImageResource& desc) {
const u32 index{Add(image_resources, desc, [&desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
return desc.sharp_idx == existing.sharp_idx;
})};
auto& image = image_resources[index];
image.is_storage |= desc.is_storage;
@ -187,8 +178,7 @@ public:
u32 Add(const SamplerResource& desc) {
const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
return desc.sharp_idx == existing.sharp_idx;
})};
return index;
}
@ -259,48 +249,25 @@ std::pair<const IR::Inst*, bool> TryDisableAnisoLod0(const IR::Inst* inst) {
return {prod2, true};
}
SharpLocation TrackSharp(const IR::Inst* inst) {
SharpLocation TrackSharp(const IR::Inst* inst, const Shader::Info& info) {
// Search until we find a potential sharp source.
const auto pred0 = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
}
return std::nullopt;
};
const auto result = IR::BreadthFirstSearch(inst, pred0);
const auto result = IR::BreadthFirstSearch(inst, pred);
ASSERT_MSG(result, "Unable to track sharp source");
inst = result.value();
// If it's from user data, not much else to do.
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return SharpLocation{
.sgpr_base = u32(IR::ScalarReg::Max),
.dword_offset = u32(inst->Arg(0).ScalarReg()),
};
return static_cast<u32>(inst->Arg(0).ScalarReg());
} else {
ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst,
"Sharp load not from constant memory");
return inst->Flags<u32>();
}
ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory");
// Retrieve offset from base.
const u32 dword_offset = inst->Arg(1).U32();
const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive();
// Retrieve SGPR pair that holds sbase
const auto pred1 = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
ASSERT(inst->GetOpcode() != IR::Opcode::ReadConst);
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return inst->Arg(0).ScalarReg();
}
return std::nullopt;
};
const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1);
const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1);
ASSERT_MSG(base0 && base1, "Nested resource loads not supported");
// Return retrieved location.
return SharpLocation{
.sgpr_base = u32(base0.value()),
.dword_offset = dword_offset,
};
}
s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
@ -327,8 +294,7 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
cbuf = std::bit_cast<AmdGpu::Buffer>(buffer);
// Assign a binding to this sharp.
return descriptors.Add(BufferResource{
.sgpr_base = std::numeric_limits<u32>::max(),
.dword_offset = 0,
.sharp_idx = std::numeric_limits<u32>::max(),
.used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
.inline_cbuf = cbuf,
});
@ -341,11 +307,10 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) {
IR::Inst* handle = inst.Arg(0).InstRecursive();
IR::Inst* producer = handle->Arg(0).InstRecursive();
const auto sharp = TrackSharp(producer);
buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
const auto sharp = TrackSharp(producer, info);
buffer = info.ReadUdSharp<AmdGpu::Buffer>(sharp);
binding = descriptors.Add(BufferResource{
.sgpr_base = sharp.sgpr_base,
.dword_offset = sharp.dword_offset,
.sharp_idx = sharp,
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
.is_written = IsBufferStore(inst),
});
@ -404,11 +369,10 @@ void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
Descriptors& descriptors) {
const IR::Inst* handle = inst.Arg(0).InstRecursive();
const IR::Inst* producer = handle->Arg(0).InstRecursive();
const auto sharp = TrackSharp(producer);
const auto buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
const auto sharp = TrackSharp(producer, info);
const auto buffer = info.ReadUdSharp<AmdGpu::Buffer>(sharp);
const s32 binding = descriptors.Add(TextureBufferResource{
.sgpr_base = sharp.sgpr_base,
.dword_offset = sharp.dword_offset,
.sharp_idx = sharp,
.nfmt = buffer.GetNumberFmt(),
.is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
});
@ -630,7 +594,8 @@ void PatchImageSampleInstruction(IR::Block& block, IR::Inst& inst, Info& info,
inst.ReplaceUsesWith(new_inst);
}
void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors,
const Shader::FlattenedUserDataBuffer& sharp_buf) {
const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
const auto opcode = inst->GetOpcode();
if (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
@ -647,9 +612,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer;
// Read image sharp.
const auto tsharp = TrackSharp(tsharp_handle);
const auto tsharp = TrackSharp(tsharp_handle, info);
const auto inst_info = inst.Flags<IR::TextureInstInfo>();
auto image = info.ReadUd<AmdGpu::Image>(tsharp.sgpr_base, tsharp.dword_offset);
auto image = info.ReadUdSharp<AmdGpu::Image>(tsharp);
if (!image.Valid()) {
LOG_ERROR(Render_Vulkan, "Shader compiled with unbound image!");
image = AmdGpu::Image::Null();
@ -658,8 +623,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
const bool is_storage = IsImageStorageInstruction(inst);
const auto type = image.IsPartialCubemap() ? AmdGpu::ImageType::Color2DArray : image.GetType();
u32 image_binding = descriptors.Add(ImageResource{
.sgpr_base = tsharp.sgpr_base,
.dword_offset = tsharp.dword_offset,
.sharp_idx = tsharp,
.type = type,
.nfmt = image.GetNumberFmt(),
.is_storage = is_storage,
@ -763,6 +727,7 @@ void PatchDataRingInstruction(IR::Block& block, IR::Inst& inst, Info& info,
void ResourceTrackingPass(IR::Program& program) {
// Iterate resource instructions and patch them after finding the sharp.
auto& info = program.info;
Descriptors descriptors{info};
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {

View File

@ -63,6 +63,9 @@ void Visit(Info& info, IR::Inst& inst) {
case IR::Opcode::LaneId:
info.uses_lane_id = true;
break;
case IR::Opcode::ReadConst:
info.has_readconst = true;
break;
default:
break;
}

View File

@ -0,0 +1,126 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <vector>
#include <boost/align/aligned_allocator.hpp>
#include <boost/align/aligned_delete.hpp>
#include <boost/container/map.hpp>
#include <boost/container/set.hpp>
#include <boost/container/small_vector.hpp>
#include <memory>
#include "common/alignment.h"
#include "common/assert.h"
#include "common/types.h"
#include "xbyak/xbyak.h"
namespace Shader {
class FlattenedUserDataBuffer {
public:
template <typename T>
T ReadUdSharp(u32 sharp_idx) const noexcept {
return *reinterpret_cast<const T*>(&buf[sharp_idx]);
}
size_t num_dwords() const {
return buf.size();
}
size_t size_bytes() const {
return buf.size() * sizeof(u32);
}
u32* data() {
return buf.data();
}
const u32* data() const {
return buf.data();
}
void resize(size_t new_size_dw) {
buf.resize(new_size_dw);
}
private:
std::vector<u32> buf;
};
using PFN_SrtWalker = void PS4_SYSV_ABI (*)(const u32* /*user_data*/, u32* /*flat_dst*/);
// Utility for copying a simple relocatable function from a Xbyak code generator to manage memory
// separately
class SmallCodeArray {
public:
SmallCodeArray() : bufsize(0), codebuf(nullptr) {}
SmallCodeArray& operator=(SmallCodeArray&& other) = default;
SmallCodeArray(SmallCodeArray&& other) = default;
SmallCodeArray& operator=(const SmallCodeArray& other) {
*this = SmallCodeArray(other.codebuf.get(), other.bufsize);
return *this;
}
SmallCodeArray(const SmallCodeArray& other) {
*this = other;
}
SmallCodeArray(const u8* code, size_t codesize) : SmallCodeArray() {
size_t pagesize = Xbyak::inner::getPageSize();
bufsize = Common::AlignUp(codesize, pagesize);
if (bufsize > 0) {
auto fn = reinterpret_cast<u8*>(boost::alignment::aligned_alloc(pagesize, bufsize));
ASSERT(fn);
codebuf = aligned_unique_ptr(fn);
memcpy(codebuf.get(), code, codesize);
Xbyak::CodeArray::protect(codebuf.get(), bufsize, Xbyak::CodeArray::PROTECT_RE);
}
}
~SmallCodeArray() {
if (codebuf && bufsize > 0) { // guard against moved-from objects
Xbyak::CodeArray::protect(codebuf.get(), bufsize, Xbyak::CodeArray::PROTECT_RW);
}
}
template <class F>
F getCode() const {
return reinterpret_cast<F>(codebuf.get());
}
private:
using aligned_unique_ptr = std::unique_ptr<u8, boost::alignment::aligned_delete>;
size_t bufsize;
aligned_unique_ptr codebuf;
};
struct PersistentSrtInfo {
PersistentSrtInfo() : flattened_bufsize_dw(/*NumUserDataRegs*/ 16) {}
// Special case when fetch shader uses step rates.
struct SrtSharpReservation {
u32 sgpr_base;
u32 dword_offset;
u32 num_dwords;
};
SmallCodeArray walker;
boost::container::small_vector<SrtSharpReservation, 2> srt_reservations;
u32 flattened_bufsize_dw;
// Special case for fetch shaders because we don't generate IR to read from step rate buffers,
// so we won't see usage with GetUserData/ReadConst.
// Reserve space in the flattened buffer for a sharp ahead of time
u32 reserve_sharp(u32 sgpr_base, u32 dword_offset, u32 num_dwords) {
u32 rv = flattened_bufsize_dw;
srt_reservations.emplace_back(sgpr_base, dword_offset, num_dwords);
flattened_bufsize_dw += num_dwords;
return rv;
}
};
} // namespace Shader
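A self-contained sketch of the SmallCodeArray lifecycle (toy program, not from this commit): emit with Xbyak, copy into separately owned RX memory, call, and let the destructor flip the pages back to RW before they are freed:

using namespace Xbyak::util;
Xbyak::CodeGenerator gen;
gen.mov(eax, 42);
gen.ret();
gen.ready();
Shader::SmallCodeArray code(gen.getCode(), gen.getSize());
const int v = code.getCode<int (*)()>()(); // runs the copied buffer; v == 42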

View File

@ -0,0 +1,157 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <unordered_map>
#include <boost/container/set.hpp>
#include <boost/container/small_vector.hpp>
#include "common/assert.h"
#include "common/hash.h"
#include "common/types.h"
#include "shader_recompiler/ir/breadth_first_search.h"
#include "shader_recompiler/ir/opcodes.h"
#include "shader_recompiler/ir/value.h"
namespace Shader::Optimization {
// Does global value numbering on a subset of instructions that are used
// for loads from shader resource tables.
// Inspiration from spirv-opt
class SrtGvnTable {
public:
using ValueNumberTable = std::unordered_map<IR::Value, u32>;
using ValueNum = u32;
SrtGvnTable() : value_numbers(), next_num(0) {}
u32 GetValueNumber(IR::Inst* inst) {
return GetValueNumber(IR::Value{inst});
}
u32 GetValueNumber(IR::Value v) {
v = v.Resolve();
if (auto it = value_numbers.find(v); it != value_numbers.end()) {
return it->second;
}
if (auto inst = v.TryInstRecursive()) {
return ComputeInstValueNumber(inst);
}
return NextValueNumber(v);
}
private:
u32 ComputeInstValueNumber(IR::Inst* inst) {
ASSERT(!value_numbers.contains(IR::Value(inst))); // callers must check the map first
if (inst->MayHaveSideEffects()) {
return NextValueNumber(IR::Value(inst));
}
u32 vn;
switch (inst->GetOpcode()) {
case IR::Opcode::Phi: {
// hack to get to parity with main
// Need to fix ssa_rewrite pass to remove certain phis
std::optional<IR::Value> source = TryRemoveTrivialPhi(inst);
if (!source) {
const auto pred = [](IR::Inst* inst) -> std::optional<IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::CompositeConstructU32x2 ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
}
return std::nullopt;
};
source = IR::BreadthFirstSearch(inst, pred).transform([](auto inst) {
return IR::Value{inst};
});
ASSERT(source);
}
vn = GetValueNumber(source.value());
value_numbers[IR::Value(inst)] = vn;
break;
}
case IR::Opcode::GetUserData:
case IR::Opcode::CompositeConstructU32x2:
case IR::Opcode::ReadConst: {
InstVector iv = MakeInstVector(inst);
if (auto it = iv_to_vn.find(iv); it != iv_to_vn.end()) {
vn = it->second;
value_numbers[IR::Value(inst)] = vn;
} else {
vn = NextValueNumber(IR::Value(inst));
iv_to_vn.emplace(std::move(iv), vn);
}
break;
}
default:
vn = NextValueNumber(IR::Value(inst));
break;
}
return vn;
}
u32 NextValueNumber(IR::Value v) {
u32 rv = next_num++;
value_numbers[v] = rv;
return rv;
}
ValueNumberTable value_numbers;
u32 next_num;
using InstVector = boost::container::small_vector<u32, 8>;
InstVector MakeInstVector(IR::Inst* inst) {
ASSERT(inst->GetOpcode() != IR::Opcode::Identity);
InstVector iv;
iv.reserve(2 + inst->NumArgs());
iv.push_back(static_cast<u32>(inst->GetOpcode()));
iv.push_back(inst->Flags<u32>());
for (size_t i = 0; i < inst->NumArgs(); i++) {
iv.push_back(GetValueNumber(inst->Arg(i)));
}
return iv;
}
// Temp workaround for something like this:
// [0000555558a5baf8] %297 = Phi [ %24, {Block $1} ], [ %297, {Block $5} ] (uses: 4)
// [0000555558a4e038] %305 = CompositeConstructU32x2 %297, %296 (uses: 4)
// [0000555558a4e0a8] %306 = ReadConst %305, #0 (uses: 2)
// Should probably be fixed in ssa_rewrite
std::optional<IR::Value> TryRemoveTrivialPhi(IR::Inst* phi) {
IR::Value single_source{};
for (size_t i = 0; i < phi->NumArgs(); i++) {
IR::Value v = phi->Arg(i).Resolve();
if (v == IR::Value(phi)) {
continue;
}
if (!single_source.IsEmpty() && single_source != v) {
return std::nullopt;
}
single_source = v;
}
ASSERT(!single_source.IsEmpty());
phi->ReplaceUsesWith(single_source);
return single_source;
}
struct HashInstVector {
size_t operator()(const InstVector& iv) const {
u32 h = 0;
for (auto vn : iv) {
h = HashCombine(vn, h);
}
return h;
}
};
std::unordered_map<InstVector, u32, HashInstVector> iv_to_vn;
};
} // namespace Shader::Optimization
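Usage sketch (the two instruction pointers are hypothetical): structurally identical ReadConst chains resolve to the same value number, which is what lets FlattenExtendedUserdataPass deduplicate them:

Shader::Optimization::SrtGvnTable gvn;
const u32 vn_a = gvn.GetValueNumber(read_const_a);
const u32 vn_b = gvn.GetValueNumber(read_const_b);
// vn_a == vn_b whenever both load the same offset through equivalent pointer chains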

View File

@ -1,7 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <string>
#include <cstddef>
#include <string_view>
#include "common/hash.h"
#include "shader_recompiler/ir/value.h"
namespace Shader::IR {
@ -97,3 +99,52 @@ bool Value::operator!=(const Value& other) const {
}
} // namespace Shader::IR
namespace std {
std::size_t hash<Shader::IR::Value>::operator()(const Shader::IR::Value& v) const {
using namespace Shader::IR;
u64 h = HashCombine(static_cast<u64>(v.type), 0ULL);
switch (v.type) {
case Type::Void:
return h;
case Type::Opaque:
return reinterpret_cast<u64>(v.InstRecursive());
case Type::ScalarReg:
return HashCombine(static_cast<u64>(v.sreg), h);
case Type::VectorReg:
return HashCombine(static_cast<u64>(v.vreg), h);
case Type::Attribute:
return HashCombine(static_cast<u64>(v.attribute), h);
case Type::U1:
return HashCombine(static_cast<u64>(v.imm_u1), h);
case Type::U8:
return HashCombine(static_cast<u64>(v.imm_u8), h);
case Type::U16:
case Type::F16:
return HashCombine(static_cast<u64>(v.imm_u16), h);
case Type::U32:
case Type::F32:
return HashCombine(static_cast<u64>(v.imm_u32), h);
case Type::U64:
case Type::F64:
return HashCombine(static_cast<u64>(v.imm_u64), h);
case Type::U32x2:
case Type::U32x3:
case Type::U32x4:
case Type::F16x2:
case Type::F16x3:
case Type::F16x4:
case Type::F32x2:
case Type::F32x3:
case Type::F32x4:
case Type::F64x2:
case Type::F64x3:
case Type::F64x4:
default:
break;
}
UNREACHABLE_MSG("Invalid type {}", v.type);
}
} // namespace std
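With the specialization defined, IR::Value can key standard hash containers directly, which is what SrtGvnTable's ValueNumberTable relies on (some_inst stands for any live IR::Inst*):

std::unordered_map<Shader::IR::Value, u32> value_numbers;
value_numbers[Shader::IR::Value{some_inst}] = 0;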

View File

@ -29,6 +29,7 @@ class Value {
public:
Value() noexcept = default;
explicit Value(IR::Inst* value) noexcept;
explicit Value(const IR::Inst* value) noexcept;
explicit Value(IR::ScalarReg reg) noexcept;
explicit Value(IR::VectorReg reg) noexcept;
explicit Value(IR::Attribute value) noexcept;
@ -82,6 +83,8 @@ private:
f64 imm_f64;
const char* string_literal;
};
friend class std::hash<Value>;
};
static_assert(static_cast<u32>(IR::Type::Void) == 0, "memset relies on IR::Type being zero");
static_assert(std::is_trivially_copyable_v<Value>);
@ -364,3 +367,10 @@ inline const char* Value::StringLiteral() const {
}
} // namespace Shader::IR
namespace std {
template <>
struct hash<Shader::IR::Value> {
std::size_t operator()(const Shader::IR::Value& v) const;
};
} // namespace std

View File

@ -1,6 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/config.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/frontend/control_flow_graph.h"
#include "shader_recompiler/frontend/decode.h"
#include "shader_recompiler/frontend/structured_control_flow.h"
@ -51,6 +54,28 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
Common::ObjectPool<Gcn::Block> gcn_block_pool{64};
Gcn::CFG cfg{gcn_block_pool, program.ins_list};
// TODO remove all this debug dumping; optionally restrict to one shader,
// e.g. info.pgm_hash == 0x6fd3463f
bool dump_ir = true;
bool extra_id_removal = true;
auto dumpMatchingIR = [&](std::string_view phase) {
if (dump_ir) {
if (Config::dumpShaders()) {
std::string s = IR::DumpProgram(program);
using namespace Common::FS;
const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
if (!std::filesystem::exists(dump_dir)) {
std::filesystem::create_directories(dump_dir);
}
const auto filename = fmt::format("{}_{:#018x}_{}.{}.ir.txt", info.stage,
info.pgm_hash, info.perm_idx, phase);
const auto file = IOFile{dump_dir / filename, FileAccessMode::Write};
file.WriteString(s);
}
}
};
// Structurize control flow graph and create program.
program.syntax_list = Shader::Gcn::BuildASL(pools.inst_pool, pools.block_pool, cfg,
program.info, runtime_info, profile);
@ -58,16 +83,28 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front());
// Run optimization passes
dumpMatchingIR("pre_ssa");
Shader::Optimization::SsaRewritePass(program.post_order_blocks);
dumpMatchingIR("pre_const_prop");
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks);
if (program.info.stage != Stage::Compute) {
Shader::Optimization::LowerSharedMemToRegisters(program);
}
Shader::Optimization::RingAccessElimination(program, runtime_info, program.info.stage);
dumpMatchingIR("pre_hoist_pre_id");
// Shader::Optimization::IdentityRemovalPass(program.blocks); // temp
if (extra_id_removal) {
Shader::Optimization::IdentityRemovalPass(program.blocks); // temp
}
dumpMatchingIR("pre_flatten");
Shader::Optimization::FlattenExtendedUserdataPass(program);
dumpMatchingIR("pre_resource_tracking");
Shader::Optimization::ResourceTrackingPass(program);
Shader::Optimization::IdentityRemovalPass(program.blocks);
dumpMatchingIR("pre_dce");
Shader::Optimization::DeadCodeEliminationPass(program);
Shader::Optimization::CollectShaderInfoPass(program);
dumpMatchingIR("final");
return program;
}

View File

@ -8,6 +8,7 @@
#include "common/types.h"
#include "shader_recompiler/backend/bindings.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/passes/srt.h"
namespace Shader {
@ -52,6 +53,9 @@ struct StageSpecialization {
Backend::Bindings start_)
: info{&info_}, runtime_info{runtime_info_}, start{start_} {
u32 binding{};
if (info->has_readconst) {
binding++;
}
ForEachSharp(binding, buffers, info->buffers,
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
spec.stride = sharp.GetStride();
@ -90,6 +94,12 @@ struct StageSpecialization {
return false;
}
u32 binding{};
if (info->has_readconst != other.info->has_readconst) {
return false;
}
if (info->has_readconst) {
binding++;
}
for (u32 i = 0; i < buffers.size(); i++) {
if (other.bitset[binding++] && buffers[i] != other.buffers[i]) {
return false;

View File

@ -156,7 +156,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
continue;
}
const auto& buffer = vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
const auto& buffer = vs_info.ReadUdReg<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
@ -301,6 +301,13 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
cmdbuf.updateBuffer(buffer->Handle(), buf_barrier.offset, num_bytes, value);
}
std::pair<Buffer*, u32> BufferCache::ObtainHostUBO(VAddr host_addr, u32 size) {
static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
ASSERT(size <= StreamThreshold);
const u64 offset = stream_buffer.Copy(host_addr, size, instance.UniformMinAlignment());
return {&stream_buffer, offset};
}
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer, BufferId buffer_id) {
// For small uniform buffers that have not been modified by gpu

View File

@ -84,6 +84,8 @@ public:
/// Writes a value to GPU buffer.
void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
[[nodiscard]] std::pair<Buffer*, u32> ObtainHostUBO(VAddr host_addr, u32 size);
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
bool is_texel_buffer = false,

View File

@ -25,6 +25,15 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
u32 binding{};
boost::container::small_vector<vk::DescriptorSetLayoutBinding, 32> bindings;
if (info->has_readconst) {
bindings.push_back({
.binding = binding++,
.descriptorType = vk::DescriptorType::eUniformBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
});
}
for (const auto& buffer : info->buffers) {
const auto sharp = buffer.GetSharp(*info);
bindings.push_back({

View File

@ -60,7 +60,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
}
const auto buffer =
vs_info->ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
vs_info->ReadUdReg<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
@ -327,6 +327,15 @@ void GraphicsPipeline::BuildDescSetLayout() {
if (!stage) {
continue;
}
if (stage->has_readconst) {
bindings.push_back({
.binding = binding++,
.descriptorType = vk::DescriptorType::eUniformBuffer,
.descriptorCount = 1,
.stageFlags = gp_stage_flags,
});
}
for (const auto& buffer : stage->buffers) {
const auto sharp = buffer.GetSharp(*stage);
bindings.push_back({

View File

@ -4,6 +4,7 @@
#include <ranges>
#include "common/config.h"
#include "common/hash.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/backend/spirv/emit_spirv.h"
@ -22,10 +23,6 @@ namespace Vulkan {
using Shader::VsOutput;
[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) {
return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}
constexpr static std::array DescriptorHeapSizes = {
vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192},
vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024},
@ -351,7 +348,7 @@ bool PipelineCache::RefreshGraphicsKey() {
continue;
}
const auto& buffer =
vs_info->ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
vs_info->ReadUdReg<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
@ -399,6 +396,7 @@ vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info,
perm_idx != 0 ? "(permutation)" : "");
DumpShader(code, info.pgm_hash, info.stage, perm_idx, "bin");
info.perm_idx = perm_idx;
const auto ir_program = Shader::TranslateProgram(code, pools, info, runtime_info, profile);
const auto spv = Shader::Backend::SPIRV::EmitSPIRV(profile, runtime_info, ir_program, binding);
DumpShader(spv, info.pgm_hash, info.stage, perm_idx, "spv");
@ -424,7 +422,8 @@ std::tuple<const Shader::Info*, vk::ShaderModule, u64> PipelineCache::GetProgram
}
Program* program = it_pgm->second;
const auto& info = program->info;
auto& info = program->info;
info.RefreshFlatBuf();
const auto spec = Shader::StageSpecialization(info, runtime_info, binding);
size_t perm_idx = program->modules.size();
vk::ShaderModule module{};

View File

@ -57,6 +57,23 @@ void Pipeline::BindBuffers(VideoCore::BufferCache& buffer_cache,
}
}
// Bind the flattened user data buffer as a UBO so it's accessible to the shader
if (stage.has_readconst) {
const auto [vk_buffer, offset] =
buffer_cache.ObtainHostUBO(reinterpret_cast<VAddr>(stage.flattened_ud_buf.data()),
stage.flattened_ud_buf.size_bytes());
buffer_infos.emplace_back(vk_buffer->Handle(), offset, stage.flattened_ud_buf.size_bytes());
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding.unified++,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eUniformBuffer,
.pBufferInfo = &buffer_infos.back(),
});
++binding.buffer;
}
// Second pass to re-bind buffers that were updated after binding
for (u32 i = 0; i < buffer_bindings.size(); i++) {
const auto& [buffer_id, vsharp] = buffer_bindings[i];

View File

@ -12,6 +12,10 @@
#include "video_core/texture_cache/texture_cache.h"
#include "vk_rasterizer.h"
// windows.h defines MemoryBarrier as a macro, which breaks uses of the identifier below
#ifdef MemoryBarrier
#undef MemoryBarrier
#endif
namespace Vulkan {
Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,