Mirror of https://github.com/shadps4-emu/shadPS4.git
shader_recompiler: Implement most integer image atomics, workgroup barriers and shared memory load/store (#231)
* shader_recompiler: Add LDEXP
* shader_recompiler: Add most image integer atomic ops
* shader_recompiler: Implement shared memory load/store
* shader_recompiler: More image atomics
* externals: Update sirit
* clang format
* cmake: Add missing files
* shader_recompiler: Fix some atomic bugs
* shader_recompiler: Vs outputs
* shader_recompiler: Shared mem has side-effects, fix format component order
* shader_recompiler: Inline constant buffer impl
* video_core: Fix regressions
* Work
* Fixup a few things

@@ -324,8 +324,8 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
     case IR::Opcode::BitFieldUExtract:
         FoldWhenAllImmediates(inst, [](u32 base, u32 shift, u32 count) {
             if (static_cast<size_t>(shift) + static_cast<size_t>(count) > 32) {
-                throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract,
-                                 base, shift, count);
+                UNREACHABLE_MSG("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldUExtract,
+                                base, shift, count);
             }
             return (base >> shift) & ((1U << count) - 1);
         });

@@ -336,8 +336,8 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
             const size_t left_shift{32 - back_shift};
             const size_t right_shift{static_cast<size_t>(32 - count)};
             if (back_shift > 32 || left_shift >= 32 || right_shift >= 32) {
-                throw LogicError("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract,
-                                 base, shift, count);
+                UNREACHABLE_MSG("Undefined result in {}({}, {}, {})", IR::Opcode::BitFieldSExtract,
+                                base, shift, count);
             }
             return static_cast<u32>((base << left_shift) >> right_shift);
         });

@@ -345,8 +345,8 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
     case IR::Opcode::BitFieldInsert:
         FoldWhenAllImmediates(inst, [](u32 base, u32 insert, u32 offset, u32 bits) {
             if (bits >= 32 || offset >= 32) {
-                throw LogicError("Undefined result in {}({}, {}, {}, {})",
-                                 IR::Opcode::BitFieldInsert, base, insert, offset, bits);
+                UNREACHABLE_MSG("Undefined result in {}({}, {}, {}, {})",
+                                IR::Opcode::BitFieldInsert, base, insert, offset, bits);
             }
             return (base & ~(~(~0u << bits) << offset)) | (insert << offset);
         });

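The three folds above evaluate GCN bitfield operations at shader-compile time once every operand is an immediate, with the out-of-range case now flagged as unreachable instead of thrown. A minimal standalone sketch of the folded arithmetic (the free functions below are illustrative, not part of the recompiler; `count < 32` is assumed, as the guard above enforces):

#include <cassert>
#include <cstdint>

using u32 = std::uint32_t;

// Semantics of BitFieldUExtract: take `count` bits of `base` starting at bit `shift`.
u32 BitFieldUExtract(u32 base, u32 shift, u32 count) {
    return (base >> shift) & ((1U << count) - 1);
}

// Semantics of BitFieldInsert: overwrite `bits` bits of `base` at `offset` with `insert`.
u32 BitFieldInsert(u32 base, u32 insert, u32 offset, u32 bits) {
    return (base & ~(~(~0u << bits) << offset)) | (insert << offset);
}

int main() {
    assert(BitFieldUExtract(0xABCD1234, 8, 8) == 0x12);           // byte 1 of the word
    assert(BitFieldInsert(0xFFFF0000, 0xAB, 8, 8) == 0xFFFFAB00); // splice 0xAB into byte 1
    return 0;
}
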
@@ -89,6 +89,17 @@ bool IsImageInstruction(const IR::Inst& inst) {
     case IR::Opcode::ImageGradient:
     case IR::Opcode::ImageRead:
     case IR::Opcode::ImageWrite:
+    case IR::Opcode::ImageAtomicIAdd32:
+    case IR::Opcode::ImageAtomicSMin32:
+    case IR::Opcode::ImageAtomicUMin32:
+    case IR::Opcode::ImageAtomicSMax32:
+    case IR::Opcode::ImageAtomicUMax32:
+    case IR::Opcode::ImageAtomicInc32:
+    case IR::Opcode::ImageAtomicDec32:
+    case IR::Opcode::ImageAtomicAnd32:
+    case IR::Opcode::ImageAtomicOr32:
+    case IR::Opcode::ImageAtomicXor32:
+    case IR::Opcode::ImageAtomicExchange32:
         return true;
     default:
         return false;

@@ -99,6 +110,17 @@ bool IsImageStorageInstruction(const IR::Inst& inst) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::ImageWrite:
     case IR::Opcode::ImageRead:
+    case IR::Opcode::ImageAtomicIAdd32:
+    case IR::Opcode::ImageAtomicSMin32:
+    case IR::Opcode::ImageAtomicUMin32:
+    case IR::Opcode::ImageAtomicSMax32:
+    case IR::Opcode::ImageAtomicUMax32:
+    case IR::Opcode::ImageAtomicInc32:
+    case IR::Opcode::ImageAtomicDec32:
+    case IR::Opcode::ImageAtomicAnd32:
+    case IR::Opcode::ImageAtomicOr32:
+    case IR::Opcode::ImageAtomicXor32:
+    case IR::Opcode::ImageAtomicExchange32:
         return true;
     default:
         return false;

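This second classifier mirrors the first but drives binding type: an image that is read, written, or atomically updated through these opcodes must be bound as a storage image, since sampled bindings are read-only. A hedged sketch of that mapping in Vulkan terms (illustrative helper, not the project's code):

#include <vulkan/vulkan_core.h>

// Illustrative: atomics and writes require a storage image binding;
// purely sampled images can keep a sampled-image descriptor.
VkDescriptorType ChooseImageDescriptorType(bool is_storage) {
    return is_storage ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
}
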
@@ -115,7 +137,8 @@ public:
     u32 Add(const BufferResource& desc) {
         const u32 index{Add(buffer_resources, desc, [&desc](const auto& existing) {
             return desc.sgpr_base == existing.sgpr_base &&
-                   desc.dword_offset == existing.dword_offset;
+                   desc.dword_offset == existing.dword_offset &&
+                   desc.inline_cbuf == existing.inline_cbuf;
         })};
         auto& buffer = buffer_resources[index];
         ASSERT(buffer.stride == desc.stride && buffer.num_records == desc.num_records);

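The `Add` called here is a find-or-append overload: it scans the existing resource list with a caller-supplied equality predicate and only appends when no match exists, so identical sharps share one binding. A minimal sketch of that helper's assumed shape (not necessarily the recompiler's exact code):

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

using u32 = std::uint32_t;

// Find-or-append: return the index of the first element matching `pred`,
// inserting `desc` at the end when nothing matches.
template <typename Descriptors, typename Descriptor, typename Func>
u32 Add(Descriptors& descriptors, const Descriptor& desc, Func&& pred) {
    const auto it = std::find_if(descriptors.begin(), descriptors.end(), pred);
    if (it != descriptors.end()) {
        return static_cast<u32>(std::distance(descriptors.begin(), it));
    }
    descriptors.push_back(desc);
    return static_cast<u32>(descriptors.size() - 1);
}
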
@@ -196,20 +219,70 @@ SharpLocation TrackSharp(const IR::Inst* inst) {
     };
 }

+static constexpr size_t MaxUboSize = 65536;
+
+s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
+                        AmdGpu::Buffer& cbuf) {
+
+    // Assuming V# is in UD s[32:35]
+    // The next pattern:
+    //  s_getpc_b64     s[32:33]
+    //  s_add_u32       s32, <const>, s32
+    //  s_addc_u32      s33, 0, s33
+    //  s_mov_b32       s35, <const>
+    //  s_movk_i32      s34, <const>
+    //  buffer_load_format_xyz v[8:10], v1, s[32:35], 0 ...
+    // is used to define an inline constant buffer
+
+    IR::Inst* handle = inst.Arg(0).InstRecursive();
+    IR::Inst* p0 = handle->Arg(0).InstRecursive();
+    if (p0->GetOpcode() != IR::Opcode::IAdd32 || !p0->Arg(0).IsImmediate() ||
+        !p0->Arg(1).IsImmediate()) {
+        return -1;
+    }
+    IR::Inst* p1 = handle->Arg(1).InstRecursive();
+    if (p1->GetOpcode() != IR::Opcode::IAdd32) {
+        return -1;
+    }
+    if (!handle->Arg(3).IsImmediate() || !handle->Arg(2).IsImmediate()) {
+        return -1;
+    }
+    // We have found this pattern. Build the sharp.
+    std::array<u64, 2> buffer;
+    buffer[0] = info.pgm_base + p0->Arg(0).U32() + p0->Arg(1).U32();
+    buffer[1] = handle->Arg(2).U32() | handle->Arg(3).U64() << 32;
+    cbuf = std::bit_cast<AmdGpu::Buffer>(buffer);
+    // Assign a binding to this sharp.
+    return descriptors.Add(BufferResource{
+        .sgpr_base = std::numeric_limits<u32>::max(),
+        .dword_offset = 0,
+        .stride = cbuf.GetStride(),
+        .num_records = u32(cbuf.num_records),
+        .used_types = BufferDataType(inst),
+        .inline_cbuf = cbuf,
+        .is_storage = IsBufferStore(inst) || cbuf.GetSize() > MaxUboSize,
+    });
+}
+
 void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
                             Descriptors& descriptors) {
-    static constexpr size_t MaxUboSize = 65536;
-    IR::Inst* producer = inst.Arg(0).InstRecursive();
-    const auto sharp = TrackSharp(producer);
-    const auto buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
-    const u32 binding = descriptors.Add(BufferResource{
-        .sgpr_base = sharp.sgpr_base,
-        .dword_offset = sharp.dword_offset,
-        .stride = buffer.GetStride(),
-        .num_records = u32(buffer.num_records),
-        .used_types = BufferDataType(inst),
-        .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize,
-    });
+    s32 binding{};
+    AmdGpu::Buffer buffer;
+    if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) {
+        IR::Inst* handle = inst.Arg(0).InstRecursive();
+        IR::Inst* producer = handle->Arg(0).InstRecursive();
+        const auto sharp = TrackSharp(producer);
+        buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
+        binding = descriptors.Add(BufferResource{
+            .sgpr_base = sharp.sgpr_base,
+            .dword_offset = sharp.dword_offset,
+            .stride = buffer.GetStride(),
+            .num_records = u32(buffer.num_records),
+            .used_types = BufferDataType(inst),
+            .is_storage = IsBufferStore(inst) || buffer.GetSize() > MaxUboSize,
+        });
+    }

     const auto inst_info = inst.Flags<IR::BufferInstInfo>();
     IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
     // Replace handle with binding index in buffer resource list.

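TryHandleInlineCbuf rebuilds a V# that the guest computed in code rather than reading from user data: word 0 is the PC-relative base address recovered from the s_add_u32/s_addc_u32 pair, word 1 packs num_records and the format dword, and std::bit_cast reinterprets the pair as a 128-bit buffer descriptor. A toy illustration of that last step, with a deliberately simplified field layout (the real AmdGpu::Buffer encodes the full GCN V# bitfields, and bitfield packing is implementation-defined):

#include <array>
#include <bit>
#include <cstdint>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

// Simplified stand-in for a 128-bit V#; field widths are illustrative only.
struct ToyBuffer {
    u64 base_address : 48; // words 0-1: virtual base address
    u64 stride : 14;       // bytes per record
    u64 flags : 2;
    u32 num_records;       // word 2
    u32 format_dword;      // word 3: dst_sel / nfmt / dfmt fields
};
static_assert(sizeof(ToyBuffer) == 16);

int main() {
    std::array<u64, 2> words{};
    words[0] = 0x100002000;                  // base computed from s_getpc_b64 + adds
    words[1] = 0x400u | (u64{0x1234} << 32); // num_records | format dword << 32
    const auto vsharp = std::bit_cast<ToyBuffer>(words);
    return vsharp.num_records == 0x400 ? 0 : 1;
}
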
@@ -217,7 +290,10 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
     ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
     if (inst_info.is_typed) {
         ASSERT(inst_info.nfmt == AmdGpu::NumberFormat::Float &&
-               inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32);
+               (inst_info.dmft == AmdGpu::DataFormat::Format32_32_32_32 ||
+                inst_info.dmft == AmdGpu::DataFormat::Format32_32_32 ||
+                inst_info.dmft == AmdGpu::DataFormat::Format32_32 ||
+                inst_info.dmft == AmdGpu::DataFormat::Format32));
     }
     if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer ||
         inst.GetOpcode() == IR::Opcode::ReadConstBufferU32) {

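The assertion previously accepted only the four-component 32-bit format; it now admits any of the 32-bit-per-channel variants, which differ only in component count. Purely as an illustration of the accepted set (hypothetical helper, not project code):

// Hypothetical helper: component counts for the formats the relaxed assert accepts.
enum class DataFormat { Format32, Format32_32, Format32_32_32, Format32_32_32_32 };

constexpr int NumComponents(DataFormat dfmt) {
    switch (dfmt) {
    case DataFormat::Format32:          return 1;
    case DataFormat::Format32_32:       return 2;
    case DataFormat::Format32_32_32:    return 3;
    case DataFormat::Format32_32_32_32: return 4;
    }
    return 0;
}
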
@@ -16,6 +16,16 @@ void Visit(Info& info, IR::Inst& inst) {
         info.stores.Set(inst.Arg(0).Attribute(), inst.Arg(2).U32());
         break;
     }
+    case IR::Opcode::LoadSharedS8:
+    case IR::Opcode::LoadSharedU8:
+    case IR::Opcode::WriteSharedU8:
+        info.uses_shared_u8 = true;
+        break;
+    case IR::Opcode::LoadSharedS16:
+    case IR::Opcode::LoadSharedU16:
+    case IR::Opcode::WriteSharedU16:
+        info.uses_shared_u16 = true;
+        break;
     case IR::Opcode::QuadShuffle:
         info.uses_group_quad = true;
         break;

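These flags record that a shader touches 8- or 16-bit shared (LDS) memory, so the backend can declare the corresponding narrow-type support only when needed, or fall back to emulation where the host lacks it. As a hedged sketch of the emulation idea, a byte load can be synthesized from the containing 32-bit shared word (hypothetical helper, shown host-side for clarity):

#include <cstdint>

using u32 = std::uint32_t;

// Hypothetical lowering of LoadSharedU8 for backends restricted to 32-bit
// shared memory: fetch the containing dword, then shift out the addressed byte.
u32 EmulatedLoadSharedU8(const u32* shared_mem, u32 byte_addr) {
    const u32 word = shared_mem[byte_addr >> 2];      // containing 32-bit word
    return (word >> ((byte_addr & 3u) * 8u)) & 0xFFu; // select the byte lane
}
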
@@ -32,6 +32,7 @@ struct SccFlagTag : FlagTag {};
 struct ExecFlagTag : FlagTag {};
 struct VccFlagTag : FlagTag {};
 struct VccLoTag : FlagTag {};
+struct SccLoTag : FlagTag {};
 struct VccHiTag : FlagTag {};

 struct GotoVariable : FlagTag {

@@ -44,7 +45,7 @@ struct GotoVariable : FlagTag {
 };

 using Variant = std::variant<IR::ScalarReg, IR::VectorReg, GotoVariable, SccFlagTag, ExecFlagTag,
-                             VccFlagTag, VccLoTag, VccHiTag>;
+                             VccFlagTag, SccLoTag, VccLoTag, VccHiTag>;
 using ValueMap = std::unordered_map<IR::Block*, IR::Value>;

 struct DefTable {

@@ -83,6 +84,13 @@ struct DefTable {
         exec_flag.insert_or_assign(block, value);
     }

+    const IR::Value& Def(IR::Block* block, SccLoTag) {
+        return scc_lo_flag[block];
+    }
+    void SetDef(IR::Block* block, SccLoTag, const IR::Value& value) {
+        scc_lo_flag.insert_or_assign(block, value);
+    }
+
     const IR::Value& Def(IR::Block* block, VccLoTag) {
         return vcc_lo_flag[block];
     }

@@ -108,6 +116,7 @@ struct DefTable {
     ValueMap scc_flag;
     ValueMap exec_flag;
     ValueMap vcc_flag;
+    ValueMap scc_lo_flag;
     ValueMap vcc_lo_flag;
     ValueMap vcc_hi_flag;
 };

@@ -124,6 +133,10 @@ IR::Opcode UndefOpcode(const VccLoTag&) noexcept {
     return IR::Opcode::UndefU32;
 }

+IR::Opcode UndefOpcode(const SccLoTag&) noexcept {
+    return IR::Opcode::UndefU32;
+}
+
 IR::Opcode UndefOpcode(const VccHiTag&) noexcept {
     return IR::Opcode::UndefU32;
 }

@@ -321,6 +334,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
     case IR::Opcode::SetVcc:
         pass.WriteVariable(VccFlagTag{}, block, inst.Arg(0));
         break;
+    case IR::Opcode::SetSccLo:
+        pass.WriteVariable(SccLoTag{}, block, inst.Arg(0));
+        break;
     case IR::Opcode::SetVccLo:
         pass.WriteVariable(VccLoTag{}, block, inst.Arg(0));
         break;

@@ -350,6 +366,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
     case IR::Opcode::GetVcc:
         inst.ReplaceUsesWith(pass.ReadVariable(VccFlagTag{}, block));
         break;
+    case IR::Opcode::GetSccLo:
+        inst.ReplaceUsesWith(pass.ReadVariable(SccLoTag{}, block));
+        break;
     case IR::Opcode::GetVccLo:
         inst.ReplaceUsesWith(pass.ReadVariable(VccLoTag{}, block));
         break;

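Taken together, the SCC_LO additions follow the pass's existing recipe for routing a GCN status register through SSA construction: a tag type, a slot in the Variant and DefTable, an undef opcode, and Set/Get handling in VisitInst. A minimal sketch of the per-block def-table idea in isolation (types simplified; the real pass resolves missing reads through predecessor blocks, inserting phis as in Braun-style SSA construction):

#include <string>
#include <unordered_map>

struct Block;              // simplified stand-ins
using Value = std::string; // the real pass stores IR::Value

// WriteVariable records the value a block assigns to SCC_LO; ReadVariable
// returns the block-local def, and the real pass recursively consults
// predecessors (inserting phi nodes) when no local def exists.
struct SccLoDefs {
    std::unordered_map<Block*, Value> defs;

    void WriteVariable(Block* block, Value value) {
        defs.insert_or_assign(block, std::move(value));
    }
    const Value* ReadVariable(Block* block) const {
        const auto it = defs.find(block);
        return it != defs.end() ? &it->second : nullptr; // nullptr: ask predecessors
    }
};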