shader_recompiler: Optimize common buffer access pattern

IndecisiveTurtle 2025-06-26 01:43:50 +03:00
parent 6fa5f51702
commit e741c3cc10


@@ -522,6 +522,22 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
 IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
                                const AmdGpu::Buffer& buffer, u32 stride) {
     const auto inst_info = inst.Flags<IR::BufferInstInfo>();
+    const u32 inst_offset = inst_info.inst_offset.Value();
+    const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
+    const auto data_format = is_inst_typed
+                                 ? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value())
+                                 : buffer.GetDataFmt();
+    const u32 shift = BufferAddressShift(inst, data_format);
+    const u32 mask = (1 << shift) - 1;
+    // If the address calculation is of the form "index * const_stride + offset" with offset
+    // constant and both const_stride and offset divisible by the element size, apply the shift directly.
+    if (inst_info.index_enable && !inst_info.offset_enable && !buffer.swizzle_enable &&
+        !buffer.add_tid_enable && (stride & mask) == 0 && (inst_offset & mask) == 0) {
+        // buffer_offset = index * (const_stride >> shift) + (inst_offset >> shift)
+        const IR::U32 index = IR::U32{inst.Arg(1)};
+        return ir.IAdd(ir.IMul(index, ir.Imm32(stride >> shift)), ir.Imm32(inst_offset >> shift));
+    }
     // index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
     IR::U32 index = ir.Imm32(0U);
@@ -538,7 +554,7 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
         index = ir.IAdd(index, thread_id);
     }
     // offset = (inst_offen ? vgpr_offset : 0) + inst_offset
-    IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
+    IR::U32 offset = ir.Imm32(inst_offset);
     if (inst_info.offset_enable) {
         const IR::U32 vgpr_offset = inst_info.index_enable
                                         ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
@@ -571,12 +587,6 @@ IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const In
         // buffer_offset = index * const_stride + offset
         buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
     }
-    const auto is_inst_typed = inst_info.inst_data_fmt != AmdGpu::DataFormat::FormatInvalid;
-    const auto data_format = is_inst_typed
-                                 ? AmdGpu::RemapDataFormat(inst_info.inst_data_fmt.Value())
-                                 : buffer.GetDataFmt();
-    const u32 shift = BufferAddressShift(inst, data_format);
     if (shift != 0) {
         buffer_offset = ir.ShiftRightLogical(buffer_offset, ir.Imm32(shift));
     }