update comments for current implementation

This commit is contained in:
Frodo Baggins 2024-12-13 23:26:09 -08:00
parent b0dafd0501
commit 0b75adb7c8
5 changed files with 137 additions and 114 deletions

View File

@ -8,20 +8,21 @@
namespace Shader {
struct TessellationDataConstantBuffer {
u32 m_lsStride;
u32 m_hsCpStride; // HullStateConstants::m_cpStride != 0 ? HullStateConstants::m_cpStride :
u32 ls_stride;
u32 hs_cp_stride; // HullStateConstants::m_cpStride != 0 ? HullStateConstants::m_cpStride :
// ls_stride
u32 m_hsNumPatch; // num patches submitted in threadgroup
u32 m_hsOutputBase; // HullStateConstants::m_numInputCP::m_cpStride != 0 ?
u32 num_patches; // num patches submitted in threadgroup
u32 hs_output_base; // HullStateConstants::m_numInputCP::m_cpStride != 0 ?
// HullStateConstants::m_numInputCP * ls_stride * num_patches : 0
u32 m_patchConstSize; // 16 * num_patch_attrs
u32 m_patchConstBase; // hs_output_base + patch_output_size
u32 m_patchOutputSize; // output_cp_stride * num_output_cp
f32 m_offChipTessellationFactorThreshold;
u32 m_firstEdgeTessFactorIndex;
// basically 0 when passthrough
u32 patch_const_size; // 16 * num_patch_attrs
u32 patch_const_base; // hs_output_base + patch_output_size
u32 patch_output_size; // output_cp_stride * num_output_cp_per_patch
f32 off_chip_tessellation_factor_threshold;
u32 first_edge_tess_factor_index;
};
// TODO comment
// Assign names to dword fields of TessellationDataConstantBuffer
enum class TessConstantAttribute : u32 {
LsStride,
HsCpStride,

View File

@ -129,7 +129,7 @@ void Translator::EmitPrologue() {
// [8:12]: output control point id
ir.SetVectorReg(IR::VectorReg::V1,
ir.GetAttributeU32(IR::Attribute::PackedHullInvocationInfo));
// TODO need PrimitiveId also like TES?
// TODO need PrimitiveId also like TES? Haven't seen it yet, but probably V2
break;
}
case LogicalStage::TessellationEval:
@ -140,12 +140,8 @@ void Translator::EmitPrologue() {
// V2 is similar to PrimitiveID but not the same. It seems to only be used in
// compiler-generated address calculations. It's probably the patch id within the
// patches running locally on a given VGT (or CU, whichever is the granularity of LDS
// memory). So it is probably equal to PrimitiveID % #patches_per_VGT (or per CU).
// We should be able to safely set V2 to 0, along with other special values read from the
// tess constants buffer, since in the recompiled Vulkan shaders a thread can only
// read/write control points and patch const attributes within the local patch. This (V2)
// and other special values of TessellationDataConstantBuffer are (probably) just an
// implementation detail from the ps4 shader compiler and only used for addressing.
// memory)
// Set to 0. See explanation in comment describing hull/domain passes
ir.SetVectorReg(IR::VectorReg::V2, ir.Imm32(0u));
// V3 is the actual PrimitiveID as intended by the shader author.
ir.SetVectorReg(IR::VectorReg::V3, ir.GetAttributeU32(IR::Attribute::PrimitiveId));

View File

@ -1,6 +1,5 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <numeric>
#include "common/assert.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/attribute.h"
@ -9,31 +8,17 @@
#include "shader_recompiler/ir/opcodes.h"
#include "shader_recompiler/ir/pattern_matching.h"
#include "shader_recompiler/ir/program.h"
// TODO delete
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/runtime_info.h"
namespace Shader::Optimization {
static void DumpIR(IR::Program& program, std::string phase) {
std::string s = IR::DumpProgram(program);
using namespace Common::FS;
const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
if (!std::filesystem::exists(dump_dir)) {
std::filesystem::create_directories(dump_dir);
}
const auto filename =
fmt::format("{}_{:#018x}.{}.ir.txt", program.info.stage, program.info.pgm_hash, phase);
const auto file = IOFile{dump_dir / filename, FileAccessMode::Write};
file.WriteString(s);
};
/**
* Tessellation shaders pass outputs to the next shader using LDS.
* The Hull shader stage receives input control points stored in LDS.
*
* These passes attempt to resolve LDS accesses to attribute accesses and correctly
* write to the tessellation factor tables.
*
* The LDS layout is:
* - TCS inputs for patch 0
* - TCS inputs for patch 1
@ -43,66 +28,111 @@ static void DumpIR(IR::Program& program, std::string phase) {
* - TCS outputs for patch 1
* - TCS outputs for patch 2
* - ...
* - Per-patch TCS outputs for patch 0
* - Per-patch TCS outputs for patch 1
* - Per-patch TCS outputs for patch 2
* - PatchConst TCS outputs for patch 0
* - PatchConst TCS outputs for patch 1
* - PatchConst TCS outputs for patch 2
*
*
* If the Hull stage does not write any new control points the driver will
* optimize LDS layout so input and output control point spaces overlap.
* (Passthrough)
*
* Tessellation factors are stored in the per-patch TCS output block
* as well as a factor V# that is automatically bound by the driver.
* The gnm driver requires a V# holding special constants to be bound
* for reads by the shader.
* The Hull and Domain shaders read values from this buffer which
* contain size and offset information required to address input, output,
* or PatchConst attributes within the current patch.
* See the TessellationDataConstantBuffer struct to see the layout of this V#.
*
* This pass attempts to resolve LDS accesses to attribute accesses and correctly
* write to the tessellation factor tables. For the latter we replace the
* buffer store instruction to factor writes according to their offset.
* Tessellation factors are stored to a special tessellation factor V# that is automatically bound
* by the driver. This is the input to the fixed function tessellator that actually subdivides the
* domain. We translate these to writes to SPIR-V builtins for tessellation factors in the Hull
* shader.
* The offset into the tess factor buffer determines which factor the shader is writing.
* Additionally, most hull shaders seem to redundantly write tess factors to PatchConst
* attributes, even if dead in the domain shader. We just treat these as generic PatchConst writes.
*
* LDS stores can either be output control point writes or per-patch data writes.
* This is detected by looking at how the address is formed. In any case the calculation
* will be of the form a * b + c. For output control points a = output_control_point_id
* while for per-patch writes a = patch_id.
* LDS reads in the Hull shader can be from input control points, and in the Domain shader can
* be hs output control points (output from the perspective of the Hull shader) and patchconst
* values.
* LDS stores in the Hull shader can either be output control point writes or per-patch
* (PatchConst) data writes. The Domain shader exports attributes using EXP instructions, unless
* it's followed by the geometry stage (but we haven't seen this yet), so nothing special there.
* The address calculations can vary significantly and can't be easily pattern matched. We are at
* the mercy of instruction selection the ps4 compiler wanted to use.
* Generally though, they could look something like this:
* Input control point:
* addr = PatchIdInVgt * input_cp_stride * #input_cp_per_patch + index * input_cp_stride
* + attr# * 16 + component
* Output control point:
* addr = #patches * input_cp_stride * #input_cp_per_patch
* + PatchIdInVgt * output_patch_stride + InvocationID * output_cp_stride
* + attr# * 16 + component
* Per patch output:
* addr = #patches * input_cp_stride * #input_cp_per_patch
* + #patches * output_patch_stride
* + PatchIdInVgt * per_patch_output_stride + attr# * 16 + component
*
* Both patch_id and output_control_point_id are packed in VGPR1 by the driver and shader
* uses V_BFE_U32 to extract them. We use the starting bit_pos to determine which is which.
* output_patch_stride and output_cp_stride are usually compile-time constants in the gcn shader.
*
* LDS reads are more tricky as amount of different calculations performed can vary.
* The final result, if output control point space is distinct, is of the form:
* patch_id * input_control_point_stride * num_control_points_per_input_patch + a
* The value "a" can be anything in the range of [0, input_control_point_stride]
* Hull shaders can probably also read output control points corresponding to other threads, like
* shared memory (but we haven't seen this yet).
* ^ This is an UNREACHABLE for now. We may need to insert additional barriers if this happens.
* They should also be able to read PatchConst values,
* although not sure if this happens in practice.
*
* This pass does not attempt to deduce the exact attribute referenced by "a" but rather
* only using "a" itself index into input attributes. Those are defined as an array in the shader
* layout (location = 0) in vec4[num_control_points_per_input_patch] attr[];
* ...
* float value = attr[a / in_stride][(a % in_stride) >> 4][(a & 0xF) >> 2];
* To determine which type of attribute (input, output, patchconst) we check the users of
* TessConstants V# reads to deduce which type of attribute a given load/store to LDS
* is touching.
*
* This requires knowing in_stride which is not provided to us by the guest.
* To deduce it we perform a breadth first search on the arguments of a DS_READ*
* looking for a buffer load with offset = 0. This will be the buffer holding tessellation
* constants and it contains the value of in_stride we can read at compile time.
* In the Hull shader, both the PatchId within the VGT group (PatchIdInVgt) and the output control
* point id (InvocationId) are packed in VGPR1 by the driver like
* V1 = InvocationId << 8 | PatchIdInVgt
* The shader typically uses V_BFE_(U|S)32 to extract them. We use the starting bit_pos to determine
* which is which.
*
* NOTE: This pass must be run before constant propagation as it relies on relatively specific
* pattern matching that might be mutated by that optimization pass.
* This pass does not attempt to deduce the exact attribute referenced in a LDS load/store.
* Instead, it feeds the address in the LDS load/store to the get/set Insts we use for TCS in/out's,
* TES in's, and PatchConst in/out's.
*
* TCS/TES Input attributes:
* We define input attributes using an array in the shader roughly like this:
* // equivalent GLSL in TCS
* layout (location = 0) in vec4 in_attrs[][NUM_INPUT_ATTRIBUTES];
*
* Here the NUM_INPUT_ATTRIBUTES is derived from the ls_stride member of the TessConstants V#.
* We divide ls_stride (in bytes) by 16 to get the number of vec4 attributes.
* For TES, the number of attributes comes from hs_cp_stride / 16.
* The first (outer) dimension is unsized but corresponds to the number of vertices in the hs input
* patch (for Hull) or the hs output patch (for Domain).
*
* For input reads in TCS or TES, we emit SPIR-V like:
* float value = in_attrs[addr / ls_stride][(addr % ls_stride) >> 4][(addr & 0xF) >> 2];
*
* For output writes, we assume the control point index is InvocationId, since high level languages
* impose that restriction (although maybe it's technically possible on hardware). So SPIR-V looks
* like this:
* layout (location = 0) out vec4 out_attrs[][NUM_OUTPUT_ATTRIBUTES];
* out_attrs[InvocationId][(addr % hs_cp_stride) >> 4][(addr & 0xF) >> 2] = value;
*
* NUM_OUTPUT_ATTRIBUTES is derived by hs_cp_stride / 16, so it can link with the TES in_attrs
* variable.
*
* Another challenge is the fact that the GCN shader needs to address attributes from LDS as a whole
* which contains the attributes from many patches. On the other hand, higher level shading
* languages restrict attribute access to the patch of the current thread, which is naturally a
* restriction in SPIR-V also.
* The addresses the ps4 compiler generates for loads/stores and the fact that LDS holds many
* patches' attributes are just implementation details of the ps4 driver/compiler. To deal with
* this, we can replace certain TessConstant V# reads with 0, which only contribute to the base
* address of the current patch's attributes in LDS and not the indexes within the local patch.
*
* (A perfect implementation might need emulation of the VGTs in mesh/compute, loading/storing
* attributes to buffers and not caring about whether they are hs input, hs output, or patchconst
* attributes)
*
* TODO: need to be careful about reading from output arrays at idx other than InvocationID
* Need SPIRV OpControlBarrier
* "Wait for all active invocations within the specified Scope to reach the current point of
* execution."
* Must be placed in uniform control flow
*/
// Addr calculations look something like this, but can vary wildly due to decisions made by
// the ps4 compiler (instruction selection, etc)
// Input control point:
// PrimitiveId * input_cp_stride * #cp_per_input_patch + index * input_cp_stride + (attr# * 16 +
// component)
// Output control point
// #patches * input_cp_stride * #cp_per_input_patch + PrimitiveId * output_patch_stride +
// InvocationID * output_cp_stride + (attr# * 16 + component)
// Per patch output:
// #patches * input_cp_stride * #cp_per_input_patch + #patches * output_patch_stride +
// + PrimitiveId * per_patch_output_stride + (attr# * 16 + component)
namespace {
using namespace Shader::Optimiation::PatternMatching;
@ -162,15 +192,9 @@ std::optional<TessSharpLocation> FindTessConstantSharp(IR::Inst* read_const_buff
// is used as an addend to skip the region for input control points, and similarly
// NumPatch * hs_cp_stride * #output_cp_in_patch is used to skip the region
// for output control points.
// The Input CP, Output CP, and PatchConst regions are laid out in that order for the
// entire thread group, so seeing the TcsNumPatches attribute used in an addr calc should
// increment the "region counter" by 1 for the given Load/WriteShared
//
// TODO this will break if AMD compiler used distributive property like
// TODO: this will break if AMD compiler used distributive property like
// TcsNumPatches * (ls_stride * #input_cp_in_patch + hs_cp_stride * #output_cp_in_patch)
//
// TODO can we just look at address post-constant folding, pull out all the constants
// and find the interval it's inside of? (phis are still a problem here)
class TessConstantUseWalker {
public:
void MarkTessAttributeUsers(IR::Inst* read_const_buffer, TessConstantAttribute attr) {
@ -306,7 +330,6 @@ static bool TryOptimizeAddendInModulo(IR::Value addend, u32 stride, std::vector<
// If any addend is divisible by stride, then we can replace it with 0 in the attribute
// or component index calculation
static IR::U32 TryOptimizeAddressModulo(IR::U32 addr, u32 stride, IR::IREmitter& ir) {
#if 1
std::vector<IR::U32> addends;
if (TryOptimizeAddendInModulo(addr, stride, addends)) {
addr = ir.Imm32(0);
@ -314,10 +337,11 @@ static IR::U32 TryOptimizeAddressModulo(IR::U32 addr, u32 stride, IR::IREmitter&
addr = ir.IAdd(addr, addend);
}
}
#endif
return addr;
}
// TODO: can optimize div in control point index similarly to mod
// Read a TCS input (InputCP region) or TES input (OutputCP region)
static IR::F32 ReadTessInputComponent(IR::U32 addr, const u32 stride, IR::IREmitter& ir,
u32 off_dw) {
@ -340,8 +364,6 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
for (IR::Block* block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {
IR::IREmitter ir{*block,
IR::Block::InstructionList::s_iterator_to(inst)}; // TODO sink this
const auto opcode = inst.GetOpcode();
switch (opcode) {
case IR::Opcode::StoreBufferU32:
@ -352,6 +374,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
if (!info.globally_coherent) {
break;
}
IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
const auto GetValue = [&](IR::Value data) -> IR::F32 {
if (auto* inst = data.TryInstRecursive();
inst && inst->GetOpcode() == IR::Opcode::BitCastU32F32) {
@ -389,7 +412,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
}
return IR::PatchFactor(gcn_factor_idx);
default:
// TODO point domain types haven't been seen so far
// Point domain types haven't been seen so far
UNREACHABLE_MSG("Unhandled tess type");
}
};
@ -412,6 +435,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
case IR::Opcode::WriteSharedU32:
case IR::Opcode::WriteSharedU64:
case IR::Opcode::WriteSharedU128: {
IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32
? 1
: (opcode == IR::Opcode::WriteSharedU64 ? 2 : 4);
@ -457,6 +481,7 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
case IR::Opcode::LoadSharedU32: {
case IR::Opcode::LoadSharedU64:
case IR::Opcode::LoadSharedU128:
IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
const IR::U32 addr{inst.Arg(0)};
AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info);
const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32
@ -515,16 +540,15 @@ void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
ir.SetTcsGenericAttribute(attr_read, ir.Imm32(attr_no), ir.Imm32(comp));
}
}
// TODO: wrap rest of program with if statement when passthrough?
// copy passthrough attributes ...
// We could wrap the rest of the program in an if stmt
// CopyInputAttrsToOutputs(); // pseudocode
// if (InvocationId == 0) {
// program ...
// PatchConstFunction();
// }
// But as long as we treat invocation ID as 0 for all threads, shouldn't matter functionally
}
}
// TODO refactor
void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) {
Info& info = program.info;
@ -628,8 +652,7 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
auto sharp_location = FindTessConstantSharp(&inst);
if (sharp_location && sharp_location->ptr_base == info.tess_consts_ptr_base &&
sharp_location->dword_off == info.tess_consts_dword_offset) {
// Replace the load with a special attribute load (for readability and
// easier pattern matching)
// The shader is reading from the TessConstants V#
IR::Value index = inst.Arg(1);
ASSERT_MSG(index.IsImmediate(),
@ -643,10 +666,10 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
case TessConstantAttribute::LsStride:
// If not, we may need to make this runtime state for TES
ASSERT(info.l_stage == LogicalStage::TessellationControl);
inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.m_lsStride));
inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.ls_stride));
break;
case TessConstantAttribute::HsCpStride:
inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.m_hsCpStride));
inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.hs_cp_stride));
break;
case TessConstantAttribute::HsNumPatch:
case TessConstantAttribute::HsOutputBase:
@ -659,12 +682,14 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
// See the explanation for why we set V2 to 0 when emitting the prologue.
inst.ReplaceUsesWithAndRemove(IR::Value(0u));
break;
// PatchConstSize:
// PatchOutputSize:
// OffChipTessellationFactorThreshold:
// FirstEdgeTessFactorIndex:
default:
case Shader::TessConstantAttribute::PatchConstSize:
case Shader::TessConstantAttribute::PatchOutputSize:
case Shader::TessConstantAttribute::OffChipTessellationFactorThreshold:
case Shader::TessConstantAttribute::FirstEdgeTessFactorIndex:
// May need to replace PatchConstSize and PatchOutputSize with 0
break;
default:
UNREACHABLE_MSG("Read past end of TessConstantsBuffer");
}
}
}
@ -675,8 +700,7 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
// PatchConst attributes and tess factors. PatchConst should be easy, turn those into a single
// vec4 array like in/out attrs. Not sure about tess factors.
if (info.l_stage == LogicalStage::TessellationControl) {
// Replace the BFEs on V1 (packed with patch id and output cp id) for easier pattern
// matching
// Replace the BFEs on V1 (packed with patch id within VGT and output cp id)
for (IR::Block* block : program.blocks) {
for (auto it = block->Instructions().begin(); it != block->Instructions().end(); it++) {
IR::Inst& inst = *it;
@ -686,6 +710,8 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
MatchU32(0), MatchU32(8))
.Match(IR::Value{&inst})) {
IR::IREmitter emit(*block, it);
// This is the patch id within the VGT, not the actual PrimitiveId
// in the draw
IR::Value replacement(0u);
inst.ReplaceUsesWithAndRemove(replacement);
} else if (M_BITFIELDUEXTRACT(
@ -698,7 +724,7 @@ void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) {
IR::Value replacement;
if (runtime_info.hs_info.IsPassthrough()) {
// Deal with annoying pattern in BB where InvocationID use makes no
// sense (in addr calculation for patchconst write)
// sense (in addr calculation for patchconst or tess factor write)
replacement = ir.Imm32(0);
} else {
replacement = ir.GetAttributeU32(IR::Attribute::InvocationId);

View File

@ -98,7 +98,7 @@ struct VertexRuntimeInfo {
}
void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) {
hs_output_cp_stride = tess_constants.m_hsCpStride;
hs_output_cp_stride = tess_constants.hs_cp_stride;
}
};
@ -135,9 +135,9 @@ struct HullRuntimeInfo {
}
void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) {
ls_stride = tess_constants.m_lsStride;
hs_output_cp_stride = tess_constants.m_hsCpStride;
hs_output_base = tess_constants.m_hsOutputBase;
ls_stride = tess_constants.ls_stride;
hs_output_cp_stride = tess_constants.hs_cp_stride;
hs_output_base = tess_constants.hs_output_base;
}
};

View File

@ -101,7 +101,7 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_
const auto params = Liverpool::GetParams(*pgm);
const auto& hull_info = program_cache.at(params.hash)->info;
hull_info.ReadTessConstantBuffer(tess_constants);
info.ls_info.ls_stride = tess_constants.m_lsStride;
info.ls_info.ls_stride = tess_constants.ls_stride;
}
break;
}