From 917e02f99749ff05f2a1b336b108cc012af5f2aa Mon Sep 17 00:00:00 2001
From: Frodo Baggins
Date: Wed, 23 Oct 2024 23:20:23 -0700
Subject: [PATCH] WIP Tessellation partial implementation. Squash commits

---
 CMakeLists.txt | 4 +
 src/common/logging/backend.cpp | 3 +-
 src/core/libraries/gnmdriver/gnmdriver.cpp | 11 +-
 .../spirv/emit_spirv_context_get_set.cpp | 55 +-
 .../backend/spirv/emit_spirv_instructions.h | 2 +-
 .../backend/spirv/spirv_emit_context.cpp | 88 ++-
 src/shader_recompiler/frontend/tessellation.h | 24 +
 .../frontend/translate/scalar_alu.cpp | 26 +-
 .../frontend/translate/scalar_flow.cpp | 7 +-
 .../frontend/translate/translate.h | 2 +-
 src/shader_recompiler/info.h | 20 +
 src/shader_recompiler/ir/attribute.cpp | 24 +-
 src/shader_recompiler/ir/attribute.h | 25 +-
 src/shader_recompiler/ir/ir_emitter.cpp | 4 +-
 src/shader_recompiler/ir/ir_emitter.h | 5 +-
 src/shader_recompiler/ir/microinstruction.cpp | 1 +
 .../ir/passes/constant_propagation_pass.cpp | 3 +
 .../ir/passes/constant_propogation.h | 4 +
 .../ir/passes/hull_shader_transform.cpp | 740 +++++++++++++++++-
 src/shader_recompiler/ir/passes/ir_passes.h | 8 +-
 .../ir/passes/ring_access_elimination.cpp | 83 +-
 src/shader_recompiler/ir/value.h | 2 +-
 src/shader_recompiler/recompiler.cpp | 57 +-
 src/shader_recompiler/recompiler.h | 2 +-
 src/shader_recompiler/runtime_info.h | 49 +-
 src/video_core/amdgpu/liverpool.h | 27 +-
 .../renderer_vulkan/vk_graphics_pipeline.cpp | 12 +-
 .../renderer_vulkan/vk_graphics_pipeline.h | 1 +
 .../renderer_vulkan/vk_instance.cpp | 17 +-
 src/video_core/renderer_vulkan/vk_instance.h | 4 +-
 .../renderer_vulkan/vk_pipeline_cache.cpp | 62 +-
 .../renderer_vulkan/vk_pipeline_cache.h | 2 +-
 .../renderer_vulkan/vk_rasterizer.cpp | 6 +-
 33 files changed, 1131 insertions(+), 249 deletions(-)
 create mode 100644 src/shader_recompiler/frontend/tessellation.h
 create mode 100644 src/shader_recompiler/ir/passes/constant_propogation.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ecab3e02d..1b8844e44 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,10 @@ if (NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
 
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+    add_compile_definitions(_DEBUG)
+endif()
+
 project(shadPS4)
 
 # Forcing PIE makes sure that the base address is high enough so that it doesn't clash with the PS4 memory.
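Note: the _DEBUG definition added above only takes effect for Debug builds; presumably it is there so that debug-only checks (for example the DEBUG_ASSERT used later in this patch) are compiled in. A minimal sketch of that common pattern, assuming a project gates such a macro on _DEBUG — the project's actual macro lives in its own assert header and may differ:

#include <cassert>

#ifdef _DEBUG
#define DEBUG_ASSERT(expr) assert(expr) // checked only when _DEBUG is defined
#else
#define DEBUG_ASSERT(expr) ((void)0)    // compiled out in non-Debug builds
#endif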
diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp index 082ce4221..7802977f5 100644 --- a/src/common/logging/backend.cpp +++ b/src/common/logging/backend.cpp @@ -62,8 +62,7 @@ private: class FileBackend { public: explicit FileBackend(const std::filesystem::path& filename) - : file{std::filesystem::path("/dev/null"), FS::FileAccessMode::Write, - FS::FileType::TextFile} {} + : file{filename, FS::FileAccessMode::Write, FS::FileType::TextFile} {} ~FileBackend() = default; diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index dbf085fb3..efd7cf531 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -1642,7 +1642,6 @@ s32 PS4_SYSV_ABI sceGnmSetGsShader(u32* cmdbuf, u32 size, const u32* gs_regs) { s32 PS4_SYSV_ABI sceGnmSetHsShader(u32* cmdbuf, u32 size, const u32* hs_regs, u32 param4) { LOG_TRACE(Lib_GnmDriver, "called"); - if (!cmdbuf || size < 0x1E) { return -1; } @@ -1660,11 +1659,19 @@ s32 PS4_SYSV_ABI sceGnmSetHsShader(u32* cmdbuf, u32 size, const u32* hs_regs, u3 cmdbuf = PM4CmdSetData::SetShReg(cmdbuf, 0x108u, hs_regs[0], 0u); // SPI_SHADER_PGM_LO_HS cmdbuf = PM4CmdSetData::SetShReg(cmdbuf, 0x10au, hs_regs[2], hs_regs[3]); // SPI_SHADER_PGM_RSRC1_HS/SPI_SHADER_PGM_RSRC2_HS + // This is wrong but just stash them here for now + // Should read the tess constants buffer instead, which is bound as V#, into runtime_info. + // HsConstants member of HsProgram is used to derive TessellationDataConstantBuffer, its members + // dont correspond to real registers + cmdbuf = PM4CmdSetData::SetShReg(cmdbuf, 0x11cu, hs_regs[4], hs_regs[5], hs_regs[6], hs_regs[7], + hs_regs[8], hs_regs[9], hs_regs[10], hs_regs[11], hs_regs[12], + hs_regs[13]); // TODO comment cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x286u, hs_regs[5], - hs_regs[5]); // VGT_HOS_MAX_TESS_LEVEL + hs_regs[6]); // VGT_HOS_MAX_TESS_LEVEL cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x2dbu, hs_regs[4]); // VGT_TF_PARAM cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x2d6u, param4); // VGT_LS_HS_CONFIG + // right padding? 
WriteTrailingNop<11>(cmdbuf); return ORBIS_OK; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index f1e173371..497a62e97 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -47,15 +47,24 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { } } -Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { +Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, Id array_index, u32 element) { if (IR::IsParam(attr)) { const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; const auto& info{ctx.output_params.at(index)}; ASSERT(info.num_components > 0); - if (info.num_components == 1) { + Id base = info.id; + boost::container::small_vector indices; + if (ctx.l_stage == LogicalStage::TessellationControl) { + indices.push_back(array_index); + } + if (info.num_components > 1) { + indices.push_back(ctx.ConstU32(element)); + } + + if (indices.empty()) { return info.id; } else { - return ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element)); + return ctx.OpAccessChain(info.pointer_type, info.id, indices); } } if (IR::IsMrt(attr)) { @@ -84,6 +93,10 @@ Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { } } +Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { + return OutputAttrPointer(ctx, attr, {}, element); +} + std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr) { if (IR::IsParam(attr)) { const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; @@ -175,12 +188,11 @@ Id EmitReadStepRate(EmitContext& ctx, int rate_idx) { rate_idx == 0 ? ctx.u32_zero_value : ctx.u32_one_value)); } -Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { +Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { if (IR::IsPosition(attr)) { ASSERT(attr == IR::Attribute::Position0); const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ - ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, ctx.ConstU32(index), ctx.ConstU32(0u))}; + const auto pointer{ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, index, ctx.ConstU32(0u))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -190,7 +202,7 @@ Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)}; const auto param = ctx.input_params.at(param_id).id; const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, ctx.ConstU32(index))}; + const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -198,9 +210,27 @@ Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u UNREACHABLE(); } -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { if (ctx.info.stage == Stage::Geometry) { return EmitGetAttributeForGeometry(ctx, attr, comp, 
index); + } else if (ctx.info.l_stage == LogicalStage::TessellationControl || + ctx.info.l_stage == LogicalStage::TessellationEval) { + if (IR::IsTessCoord(attr)) { + const u32 component = attr == IR::Attribute::TessellationEvaluationPointU ? 0 : 1; + const auto component_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); + const auto pointer{ + ctx.OpAccessChain(component_ptr, ctx.tess_coord, ctx.ConstU32(component))}; + return ctx.OpLoad(ctx.F32[1], pointer); + } else if (IR::IsParam(attr)) { + const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)}; + const auto param = ctx.input_params.at(param_id).id; + const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); + const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)}; + const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); + return ctx.OpLoad(ctx.F32[1], + ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); + } + UNREACHABLE(); } if (IR::IsParam(attr)) { @@ -276,6 +306,7 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1[1], ctx.front_facing), ctx.u32_one_value, ctx.u32_zero_value); case IR::Attribute::PrimitiveId: + case IR::Attribute::TessPatchIdInVgt: // TODO see why this isnt DCEd ASSERT(ctx.info.l_stage == LogicalStage::Geometry || ctx.info.l_stage == LogicalStage::TessellationControl || ctx.info.l_stage == LogicalStage::TessellationEval); @@ -301,7 +332,13 @@ void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 elemen LOG_WARNING(Render_Vulkan, "Ignoring pos1 export"); return; } - const Id pointer{OutputAttrPointer(ctx, attr, element)}; + + Id pointer; + if (ctx.l_stage == LogicalStage::TessellationControl) { + pointer = OutputAttrPointer(ctx, attr, ctx.OpLoad(ctx.U32[1], ctx.invocation_id), element); + } else { + pointer = OutputAttrPointer(ctx, attr, element); + } const auto component_type{OutputAttrComponentType(ctx, attr)}; if (component_type.second) { ctx.OpStore(pointer, ctx.OpBitcast(component_type.first, value)); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 29ffb916a..c7fa672b9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -85,7 +85,7 @@ Id EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addres Id EmitBufferAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index); +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp); Id EmitGetPatch(EmitContext& ctx, IR::Patch patch); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 81376c4f0..06e42cae8 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -388,13 +388,52 @@ void EmitContext::DefineInputs() { } case 
LogicalStage::TessellationControl: { invocation_id = - DefineVariable(U32[3], spv::BuiltIn::InvocationId, spv::StorageClass::Input); + DefineVariable(U32[1], spv::BuiltIn::InvocationId, spv::StorageClass::Input); patch_vertices = DefineVariable(U32[1], spv::BuiltIn::PatchVertices, spv::StorageClass::Input); + primitive_id = DefineVariable(U32[1], spv::BuiltIn::PrimitiveId, spv::StorageClass::Input); + + for (u32 i = 0; i < IR::NumParams; i++) { + const IR::Attribute param{IR::Attribute::Param0 + i}; + if (!info.loads.GetAny(param)) { + continue; + } + const u32 num_components = info.loads.NumComponents(param); + // The input vertex count isn't statically known, so make length 32 (what glslang does) + const Id type{TypeArray(F32[4], ConstU32(32u))}; + const Id id{DefineInput(type, i)}; + Name(id, fmt::format("in_attr{}", i)); + input_params[i] = {id, input_f32, F32[1], 4}; + } break; } case LogicalStage::TessellationEval: { tess_coord = DefineInput(F32[3], std::nullopt, spv::BuiltIn::TessCoord); + primitive_id = DefineVariable(U32[1], spv::BuiltIn::PrimitiveId, spv::StorageClass::Input); + + for (u32 i = 0; i < IR::NumParams; i++) { + const IR::Attribute param{IR::Attribute::Param0 + i}; + if (!info.loads.GetAny(param)) { + continue; + } + const u32 num_components = info.loads.NumComponents(param); + // The input vertex count isn't statically known, so make length 32 (what glslang does) + const Id type{TypeArray(F32[4], ConstU32(32u))}; + const Id id{DefineInput(type, i)}; + Name(id, fmt::format("in_attr{}", i)); + input_params[i] = {id, input_f32, F32[1], 4}; + } + + u32 patch_base_location = runtime_info.vs_info.hs_output_cp_stride >> 4; + for (size_t index = 0; index < 30; ++index) { + if (!(info.uses_patches & (1U << index))) { + continue; + } + const Id id{DefineInput(F32[4], patch_base_location + index)}; + Decorate(id, spv::Decoration::Patch); + Name(id, fmt::format("patch_in{}", index)); + patches[index] = id; + } break; } default: @@ -405,6 +444,9 @@ void EmitContext::DefineInputs() { void EmitContext::DefineOutputs() { switch (l_stage) { case LogicalStage::Vertex: { + // No point in defining builtin outputs (i.e. position) unless next stage is fragment? 
+ // Might cause problems linking with tcs + output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); const bool has_extra_pos_stores = info.stores.Get(IR::Attribute::Position1) || info.stores.Get(IR::Attribute::Position2) || @@ -442,16 +484,58 @@ void EmitContext::DefineOutputs() { DefineOutput(type, std::nullopt, spv::BuiltIn::TessLevelInner); Decorate(output_tess_level_inner, spv::Decoration::Patch); } + + for (u32 i = 0; i < IR::NumParams; i++) { + const IR::Attribute param{IR::Attribute::Param0 + i}; + if (!info.stores.GetAny(param)) { + continue; + } + const u32 num_components = info.stores.NumComponents(param); + // The input vertex count isn't statically known, so make length 32 (what glslang does) + const Id type{TypeArray(F32[4], ConstU32(runtime_info.hs_info.output_control_points))}; + const Id id{DefineOutput(type, i)}; + Name(id, fmt::format("out_attr{}", i)); + output_params[i] = {id, output_f32, F32[1], 4}; + } + + u32 patch_base_location = runtime_info.hs_info.hs_output_cp_stride >> 4; for (size_t index = 0; index < 30; ++index) { if (!(info.uses_patches & (1U << index))) { continue; } - const Id id{DefineOutput(F32[4], index)}; + const Id id{DefineOutput(F32[4], patch_base_location + index)}; Decorate(id, spv::Decoration::Patch); + Name(id, fmt::format("patch_out{}", index)); patches[index] = id; } break; } + case LogicalStage::TessellationEval: { + // TODO copied from logical vertex, figure this out + output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); + const bool has_extra_pos_stores = info.stores.Get(IR::Attribute::Position1) || + info.stores.Get(IR::Attribute::Position2) || + info.stores.Get(IR::Attribute::Position3); + if (has_extra_pos_stores) { + const Id type{TypeArray(F32[1], ConstU32(8U))}; + clip_distances = + DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output); + cull_distances = + DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output); + } + for (u32 i = 0; i < IR::NumParams; i++) { + const IR::Attribute param{IR::Attribute::Param0 + i}; + if (!info.stores.GetAny(param)) { + continue; + } + const u32 num_components = info.stores.NumComponents(param); + const Id id{DefineOutput(F32[num_components], i)}; + Name(id, fmt::format("out_attr{}", i)); + output_params[i] = + GetAttributeInfo(AmdGpu::NumberFormat::Float, id, num_components, true); + } + break; + } case LogicalStage::Fragment: for (u32 i = 0; i < IR::NumRenderTargets; i++) { const IR::Attribute mrt{IR::Attribute::RenderTarget0 + i}; diff --git a/src/shader_recompiler/frontend/tessellation.h b/src/shader_recompiler/frontend/tessellation.h new file mode 100644 index 000000000..97e298486 --- /dev/null +++ b/src/shader_recompiler/frontend/tessellation.h @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/types.h" + +namespace Shader { + +struct TessellationDataConstantBuffer { + u32 m_lsStride; + u32 m_hsCpStride; // HullStateConstants::m_cpStride != 0 ? HullStateConstants::m_cpStride : + // ls_stride + u32 m_hsNumPatch; // num patches submitted in threadgroup + u32 m_hsOutputBase; // HullStateConstants::m_numInputCP::m_cpStride != 0 ? 
+ // HullStateConstants::m_numInputCP * ls_stride * num_patches : 0 + u32 m_patchConstSize; // 16 * num_patch_attrs + u32 m_patchConstBase; // hs_output_base + patch_output_size + u32 m_patchOutputSize; // output_cp_stride * num_output_cp + f32 m_offChipTessellationFactorThreshold; + u32 m_firstEdgeTessFactorIndex; +}; + +} // namespace Shader \ No newline at end of file diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 549464580..1ef0d82d8 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -81,9 +81,9 @@ void Translator::EmitScalarAlu(const GcnInst& inst) { case Opcode::S_MUL_I32: return S_MUL_I32(inst); case Opcode::S_BFE_I32: - return S_BFE_I32(inst); + return S_BFE(inst, true); case Opcode::S_BFE_U32: - return S_BFE_U32(inst); + return S_BFE(inst, false); case Opcode::S_ABSDIFF_I32: return S_ABSDIFF_I32(inst); @@ -438,30 +438,12 @@ void Translator::S_MUL_I32(const GcnInst& inst) { SetDst(inst.dst[0], ir.IMul(GetSrc(inst.src[0]), GetSrc(inst.src[1]))); } -void Translator::S_BFE_U32(const GcnInst& inst) { +void Translator::S_BFE(const GcnInst& inst, bool is_signed) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 offset{ir.BitwiseAnd(src1, ir.Imm32(0x1F))}; const IR::U32 count{ir.BitFieldExtract(src1, ir.Imm32(16), ir.Imm32(7))}; - const IR::U32 result{ir.BitFieldExtract(src0, offset, count)}; - SetDst(inst.dst[0], result); - ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); -} - -void Translator::S_BFE_I32(const GcnInst& inst) { - const IR::U32 src0{GetSrc(inst.src[0])}; - const IR::U32 src1{GetSrc(inst.src[1])}; - IR::U32 result; - - ASSERT_MSG(src1.IsImmediate(), "Unhandled S_BFE_I32 with non-immediate mask"); - u32 mask = src1.U32(); - ASSERT(mask != 0); - u32 offset = std::countr_zero(mask); - u32 count = std::popcount(mask); - mask = mask >> offset; - ASSERT_MSG((mask & (mask + 1)) == 0, "mask {} has non-adjacent bits set"); - - result = ir.BitFieldExtract(src0, ir.Imm32(offset), ir.Imm32(count), true); + const IR::U32 result{ir.BitFieldExtract(src0, offset, count, is_signed)}; SetDst(inst.dst[0], result); ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); } diff --git a/src/shader_recompiler/frontend/translate/scalar_flow.cpp b/src/shader_recompiler/frontend/translate/scalar_flow.cpp index ef8bab789..fe9a5c8e8 100644 --- a/src/shader_recompiler/frontend/translate/scalar_flow.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_flow.cpp @@ -35,7 +35,12 @@ void Translator::EmitFlowControl(u32 pc, const GcnInst& inst) { } void Translator::S_BARRIER() { - ir.Barrier(); + if (info.l_stage == LogicalStage::TessellationControl) { + // TODO: ASSERT that we're in uniform control flow + ir.TcsOutputBarrier(); + } else { + ir.Barrier(); + } } void Translator::S_GETPC_B64(u32 pc, const GcnInst& inst) { diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 72263b3bf..08612548a 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -94,7 +94,7 @@ public: void S_ASHR_I32(const GcnInst& inst); void S_BFM_B32(const GcnInst& inst); void S_MUL_I32(const GcnInst& inst); - void S_BFE_U32(const GcnInst& inst); + void S_BFE(const GcnInst& inst, bool is_signed); void S_BFE_I32(const GcnInst& inst); void S_ABSDIFF_I32(const GcnInst& inst); void 
S_NOT_B32(const GcnInst& inst); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 53db1b5b0..814af10c3 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -11,6 +11,7 @@ #include "common/types.h" #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/frontend/copy_shader.h" +#include "shader_recompiler/frontend/tessellation.h" #include "shader_recompiler/ir/attribute.h" #include "shader_recompiler/ir/passes/srt.h" #include "shader_recompiler/ir/reg.h" @@ -174,6 +175,10 @@ struct Info { PersistentSrtInfo srt_info; std::vector flattened_ud_buf; + // TODO handle indirection + IR::ScalarReg tess_consts_ptr_base = IR::ScalarReg::Max; + s32 tess_consts_dword_offset = -1; + std::span user_data; Stage stage; LogicalStage l_stage; @@ -248,6 +253,21 @@ struct Info { srt_info.walker_func(user_data.data(), flattened_ud_buf.data()); } } + + // TODO probably not needed + bool FoundTessConstantsSharp() { + return tess_consts_dword_offset >= 0; + } + + void ReadTessConstantBuffer(TessellationDataConstantBuffer& tess_constants) { + ASSERT(FoundTessConstantsSharp()); + auto buf = ReadUdReg(static_cast(tess_consts_ptr_base), + static_cast(tess_consts_dword_offset)); + VAddr tess_constants_addr = buf.base_address; + memcpy(&tess_constants, + reinterpret_cast(tess_constants_addr), + sizeof(tess_constants)); + } }; constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept { diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp index 12edb28dc..e297bda6e 100644 --- a/src/shader_recompiler/ir/attribute.cpp +++ b/src/shader_recompiler/ir/attribute.cpp @@ -118,14 +118,34 @@ std::string NameOf(Attribute attribute) { return "LocalInvocationIndex"; case Attribute::InvocationId: return "InvocationId"; - case Attribute::PackedHullInvocationInfo: - return "PackedHullInvocationInfo"; case Attribute::PatchVertices: return "PatchVertices"; case Attribute::TessellationEvaluationPointU: return "TessellationEvaluationPointU"; case Attribute::TessellationEvaluationPointV: return "TessellationEvaluationPointV"; + case Attribute::PackedHullInvocationInfo: + return "PackedHullInvocationInfo"; + case Attribute::TcsLsStride: + return "TcsLsStride"; + case Attribute::TcsCpStride: + return "TcsCpStride"; + case Attribute::TcsNumPatches: + return "TcsNumPatches"; + case Attribute::TcsOutputBase: + return "TcsOutputBase"; + case Attribute::TcsPatchConstSize: + return "TcsPatchConstSize"; + case Attribute::TcsPatchConstBase: + return "TcsPatchConstBase"; + case Attribute::TcsPatchOutputSize: + return "TcsPatchOutputSize"; + case Attribute::TcsOffChipTessellationFactorThreshold: + return "TcsOffChipTessellationFactorThreshold"; + case Attribute::TcsFirstEdgeTessFactorIndex: + return "TcsFirstEdgeTessFactorIndex"; + case Attribute::TessPatchIdInVgt: + return "TessPatchIdInVgt"; default: break; } diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h index 3d3e48923..b8d77d45c 100644 --- a/src/shader_recompiler/ir/attribute.h +++ b/src/shader_recompiler/ir/attribute.h @@ -75,11 +75,23 @@ enum class Attribute : u64 { InstanceId0 = 78, // step rate 0 InstanceId1 = 79, // step rate 1 InvocationId = 80, // TCS id in output patch and instanced geometry shader id + PatchVertices = 81, + TessellationEvaluationPointU = 82, + TessellationEvaluationPointV = 83, PackedHullInvocationInfo = - 81, // PrimitiveId (patch id) and InvocationId (output control point id) - PatchVertices 
= 82, - TessellationEvaluationPointU = 83, - TessellationEvaluationPointV = 84, + 84, // PrimitiveId (patch id) and InvocationId (output control point id) + // Probably don't need all these. + // Most should be dead after hull shader transform + TcsLsStride = 85, + TcsCpStride = 86, + TcsNumPatches = 87, + TcsOutputBase = 88, + TcsPatchConstSize = 89, + TcsPatchConstBase = 90, + TcsPatchOutputSize = 91, + TcsOffChipTessellationFactorThreshold = 92, + TcsFirstEdgeTessFactorIndex = 93, + TessPatchIdInVgt = 94, Max, }; @@ -91,6 +103,11 @@ constexpr bool IsPosition(Attribute attribute) noexcept { return attribute >= Attribute::Position0 && attribute <= Attribute::Position3; } +constexpr bool IsTessCoord(Attribute attribute) noexcept { + return attribute >= Attribute::TessellationEvaluationPointU && + attribute <= Attribute::TessellationEvaluationPointV; +} + constexpr bool IsParam(Attribute attribute) noexcept { return attribute >= Attribute::Param0 && attribute <= Attribute::Param31; } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 25cb9b2b3..6cab6aa79 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -270,8 +270,8 @@ void IREmitter::SetM0(const U32& value) { Inst(Opcode::SetM0, value); } -F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, u32 index) { - return Inst(Opcode::GetAttribute, attribute, Imm32(comp), Imm32(index)); +F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, IR::Value index) { + return Inst(Opcode::GetAttribute, attribute, Imm32(comp), index); } U32 IREmitter::GetAttributeU32(IR::Attribute attribute, u32 comp) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 00e81d65a..7af39baef 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -82,7 +82,8 @@ public: [[nodiscard]] U1 Condition(IR::Condition cond); - [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, u32 index = 0); + [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, + IR::Value index = IR::Value(u32(0u))); [[nodiscard]] U32 GetAttributeU32(Attribute attribute, u32 comp = 0); void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0); @@ -338,6 +339,7 @@ private: template T Inst(Opcode op, Args... 
args) { auto it{block->PrependNewInst(insertion_point, op, {Value{args}...})}; + it->SetParent(block); return T{Value{&*it}}; } @@ -355,6 +357,7 @@ private: u32 raw_flags{}; std::memcpy(&raw_flags, &flags.proxy, sizeof(flags.proxy)); auto it{block->PrependNewInst(insertion_point, op, {Value{args}...}, raw_flags)}; + it->SetParent(block); return T{Value{&*it}}; } }; diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 9f3ccd52f..9196350ff 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -153,6 +153,7 @@ void Inst::AddPhiOperand(Block* predecessor, const Value& value) { void Inst::Invalidate() { ClearArgs(); + ASSERT(users.list.empty()); ReplaceOpcode(Opcode::Void); } diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp index 9624ce6a5..6a27cba04 100644 --- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp @@ -294,6 +294,9 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { case IR::Opcode::IMul32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; }); return; + case IR::Opcode::UDiv32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a / b; }); + return; case IR::Opcode::FPCmpClass32: FoldCmpClass(block, inst); return; diff --git a/src/shader_recompiler/ir/passes/constant_propogation.h b/src/shader_recompiler/ir/passes/constant_propogation.h new file mode 100644 index 000000000..313a3cc6a --- /dev/null +++ b/src/shader_recompiler/ir/passes/constant_propogation.h @@ -0,0 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once \ No newline at end of file diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp index d27c75bd1..b88008372 100644 --- a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp +++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp @@ -1,10 +1,29 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include +#include "shader_recompiler/ir/breadth_first_search.h" #include "shader_recompiler/ir/ir_emitter.h" #include "shader_recompiler/ir/program.h" +// TODO delelte +#include "common/io_file.h" +#include "common/path_util.h" + namespace Shader::Optimization { +static void DumpIR(IR::Program& program, std::string phase) { + std::string s = IR::DumpProgram(program); + using namespace Common::FS; + const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; + if (!std::filesystem::exists(dump_dir)) { + std::filesystem::create_directories(dump_dir); + } + const auto filename = + fmt::format("{}_{:#018x}.{}.ir.txt", program.info.stage, program.info.pgm_hash, phase); + const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; + file.WriteString(s); +}; + /** * Tessellation shaders pass outputs to the next shader using LDS. * The Hull shader stage receives input control points stored in LDS. 
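For reference, the sketch below (not part of the patch) spells out the LDS address layout that this comment and the pass below describe, using names paraphrased from TessellationDataConstantBuffer and the sum-of-products formulas in the pass comments. Treat it as an illustration under those assumptions, not emulator code; the "attr * 16 + component" term is written exactly as it appears in the comments.

#include <cstdint>
using u32 = std::uint32_t;

// Mirrors the address formulas from the pass comments (offsets into the tessellation ring/LDS).
struct TessLayout {
    u32 ls_stride;           // bytes per input control point (m_lsStride)
    u32 cp_per_input_patch;  // input control points per patch
    u32 output_cp_stride;    // bytes per output control point (m_hsCpStride)
    u32 output_patch_stride; // output_cp_stride * num_output_cp (m_patchOutputSize)
    u32 patch_const_stride;  // bytes of patch constants per patch (m_patchConstSize)
    u32 num_patches;         // patches submitted per threadgroup (m_hsNumPatch)
};

// Input control point:
// PrimitiveId * input_cp_stride * #cp_per_input_patch + index * input_cp_stride + (attr# * 16 + component)
u32 InputCpAddr(const TessLayout& t, u32 patch_id, u32 cp_index, u32 attr, u32 component) {
    return patch_id * t.ls_stride * t.cp_per_input_patch + cp_index * t.ls_stride +
           attr * 16 + component;
}

// Output control points start at m_hsOutputBase = #patches * input_cp_stride * #cp_per_input_patch.
u32 OutputCpAddr(const TessLayout& t, u32 patch_id, u32 invocation_id, u32 attr, u32 component) {
    const u32 hs_output_base = t.num_patches * t.ls_stride * t.cp_per_input_patch;
    return hs_output_base + patch_id * t.output_patch_stride +
           invocation_id * t.output_cp_stride + attr * 16 + component;
}

// Patch constants start after all output patches (m_patchConstBase).
u32 PatchConstAddr(const TessLayout& t, u32 patch_id, u32 attr, u32 component) {
    const u32 patch_const_base = t.num_patches * t.ls_stride * t.cp_per_input_patch +
                                 t.num_patches * t.output_patch_stride;
    return patch_const_base + patch_id * t.patch_const_stride + attr * 16 + component;
}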
@@ -66,17 +85,411 @@ namespace Shader::Optimization { * Must be placed in uniform control flow */ -void HullShaderTransform(const IR::Program& program, const RuntimeInfo& runtime_info) { - LOG_INFO(Render_Vulkan, "{}", IR::DumpProgram(program)); +// Bad pattern matching attempt +template +struct MatchObject { + inline bool DoMatch(IR::Value v) { + return static_cast(this)->DoMatch(v); + } +}; + +struct MatchValue : MatchObject { + MatchValue(IR::Value& return_val_) : return_val(return_val_) {} + + inline bool DoMatch(IR::Value v) { + return_val = v; + return true; + } + +private: + IR::Value& return_val; +}; + +struct MatchIgnore : MatchObject { + MatchIgnore() {} + + inline bool DoMatch(IR::Value v) { + return true; + } +}; + +struct MatchImm : MatchObject { + MatchImm(IR::Value& v) : return_val(v) {} + + inline bool DoMatch(IR::Value v) { + if (!v.IsImmediate()) { + return false; + } + + return_val = v; + return true; + } + +private: + IR::Value& return_val; +}; + +// Specific +struct MatchAttribute : MatchObject { + MatchAttribute(IR::Attribute attribute_) : attribute(attribute_) {} + + inline bool DoMatch(IR::Value v) { + return v.Type() == IR::Type::Attribute && v.Attribute() == attribute; + } + +private: + IR::Attribute attribute; +}; + +// Specific +struct MatchU32 : MatchObject { + MatchU32(u32 imm_) : imm(imm_) {} + + inline bool DoMatch(IR::Value v) { + return v.Type() == IR::Type::U32 && v.U32() == imm; + } + +private: + u32 imm; +}; + +template +struct MatchInstObject : MatchObject> { + static_assert(sizeof...(Args) == IR::NumArgsOf(opcode)); + MatchInstObject(Args&&... args) : pattern(std::forward_as_tuple(args...)) {} + + inline bool DoMatch(IR::Value v) { + IR::Inst* inst = v.TryInstRecursive(); + if (!inst || inst->GetOpcode() != opcode) { + return false; + } + + bool matched = true; + + [&](std::index_sequence) { + ((matched = matched && std::get(pattern).DoMatch(inst->Arg(Is))), ...); + }(std::make_index_sequence{}); + + return matched; + } + +private: + using MatchArgs = std::tuple; + MatchArgs pattern; +}; + +template +auto MakeInstPattern(Args&&... 
args) { + return MatchInstObject(std::forward(args)...); +} + +struct MatchFoldImm : MatchObject { + MatchFoldImm(IR::Value& v) : return_val(v) {} + + inline bool DoMatch(IR::Value v); + +private: + IR::Value& return_val; +}; + +// Represent address as sum of products +// Input control point: +// PrimitiveId * input_cp_stride * #cp_per_input_patch + index * input_cp_stride + (attr# * 16 + +// component) +// Output control point +// #patches * input_cp_stride * #cp_per_input_patch + PrimitiveId * output_patch_stride + +// InvocationID * output_cp_stride + (attr# * 16 + component) +// Per patch output: +// #patches * input_cp_stride * #cp_per_input_patch + #patches * output_patch_stride + +// + PrimitiveId * per_patch_output_stride + (attr# * 16 + component) + +// Sort terms left to right + +namespace { + +static void InitTessConstants(IR::ScalarReg sharp_ptr_base, s32 sharp_dword_offset, + Shader::Info& info, Shader::RuntimeInfo& runtime_info, + TessellationDataConstantBuffer& tess_constants) { + info.tess_consts_ptr_base = sharp_ptr_base; + info.tess_consts_dword_offset = sharp_dword_offset; + info.ReadTessConstantBuffer(tess_constants); + if (info.l_stage == LogicalStage::TessellationControl) { + runtime_info.hs_info.InitFromTessConstants(tess_constants); + } else { + runtime_info.vs_info.InitFromTessConstants(tess_constants); + } + + return; +} + +struct TessSharpLocation { + IR::ScalarReg ptr_base; + u32 dword_off; +}; + +std::optional FindTessConstantSharp(IR::Inst* read_const_buffer) { + IR::Value sharp_ptr_base; + IR::Value sharp_dword_offset; + + IR::Value rv = IR::Value{read_const_buffer}; + IR::Value handle = read_const_buffer->Arg(0); + + if (MakeInstPattern( + MakeInstPattern(MatchImm(sharp_dword_offset)), MatchIgnore(), + MatchIgnore(), MatchIgnore()) + .DoMatch(handle)) { + return TessSharpLocation{.ptr_base = IR::ScalarReg::Max, + .dword_off = static_cast(sharp_dword_offset.ScalarReg())}; + } else if (MakeInstPattern( + MakeInstPattern( + MakeInstPattern( + MakeInstPattern(MatchImm(sharp_ptr_base)), + MatchIgnore()), + MatchImm(sharp_dword_offset)), + MatchIgnore(), MatchIgnore(), MatchIgnore()) + .DoMatch(handle)) { + return TessSharpLocation{.ptr_base = sharp_ptr_base.ScalarReg(), + .dword_off = sharp_dword_offset.U32()}; + } + UNREACHABLE_MSG("failed to match tess constants sharp buf"); + return {}; +} + +static IR::Program* g_program; // TODO delete + +enum AttributeRegion { InputCP, OutputCP, PatchConst, Unknown }; + +struct RingAddressInfo { + AttributeRegion region{}; + u32 attribute_byte_offset{}; + // For InputCP and OutputCP, offset from the start of the patch's memory (including + // attribute_byte_offset) For PatchConst, not relevant + IR::U32 offset_in_patch{IR::Value(0u)}; +}; + +class Pass { +public: + Pass(Info& info_, RuntimeInfo& runtime_info_) : info(info_), runtime_info(runtime_info_) { + InitTessConstants(info.tess_consts_ptr_base, info.tess_consts_dword_offset, info, + runtime_info, tess_constants); + } + + RingAddressInfo WalkRingAccess(IR::Inst* access, IR::IREmitter& insert_point) { + Reset(); + RingAddressInfo address_info{}; + + IR::Value addr; + switch (access->GetOpcode()) { + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::LoadSharedU128: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::WriteSharedU128: + addr = access->Arg(0); + break; + case IR::Opcode::StoreBufferU32: + case IR::Opcode::StoreBufferU32x2: + case IR::Opcode::StoreBufferU32x3: + case 
IR::Opcode::StoreBufferU32x4: + addr = access->Arg(1); + break; + default: + UNREACHABLE(); + } + + products.emplace_back(addr); + Visit(addr); + + FindIndexInfo(address_info, insert_point); + + return address_info; + } + +private: + void Reset() { + within_mul = false; + products.clear(); + } + + void Visit(IR::Value node) { + IR::Value a, b, c; + + if (MakeInstPattern(MatchValue(a), MatchValue(b)).DoMatch(node)) { + bool saved_within_mul = within_mul; + within_mul = true; + Visit(a); + Visit(b); + within_mul = saved_within_mul; + } else if (MakeInstPattern(MatchValue(a), MatchValue(b)) + .DoMatch(node)) { + if (within_mul) { + UNREACHABLE_MSG("Test"); + products.back().as_factors.emplace_back(IR::U32{node}); + } else { + products.back().as_nested_value = IR::U32{a}; + Visit(a); + products.emplace_back(b); + Visit(b); + } + } else if (MakeInstPattern(MatchValue(a), MatchImm(b)) + .DoMatch(node)) { + products.back().as_factors.emplace_back(IR::Value(u32(2 << (b.U32() - 1)))); + Visit(a); + } else if (MakeInstPattern(MatchIgnore(), MatchValue(b)) + .DoMatch(node)) { + IR::Inst* read_const_buffer = node.InstRecursive(); + IR::Value index = read_const_buffer->Arg(1); + + if (index.IsImmediate()) { + u32 offset = index.U32(); + if (offset < static_cast(IR::Attribute::TcsFirstEdgeTessFactorIndex) - + static_cast(IR::Attribute::TcsLsStride) + 1) { + IR::Attribute tess_constant_attr = static_cast( + static_cast(IR::Attribute::TcsLsStride) + offset); + IR::IREmitter ir{*read_const_buffer->GetParent(), + IR::Block::InstructionList::s_iterator_to(*read_const_buffer)}; + + ASSERT(tess_constant_attr != + IR::Attribute::TcsOffChipTessellationFactorThreshold); + IR::U32 replacement = ir.GetAttributeU32(tess_constant_attr); + + read_const_buffer->ReplaceUsesWithAndRemove(replacement); + // Unwrap the attribute from the GetAttribute Inst and push back as a factor + // (more convenient for scanning the factors later) + node = IR::Value{tess_constant_attr}; + + if (IR::Value{read_const_buffer} == products.back().as_nested_value) { + products.back().as_nested_value = replacement; + } + } + } + products.back().as_factors.emplace_back(node); + } else if (MakeInstPattern(MatchValue(a), MatchU32(0)) + .DoMatch(node)) { + products.back().as_factors.emplace_back(a); + } else if (MakeInstPattern(MatchValue(a), MatchIgnore(), + MatchIgnore()) + .DoMatch(node)) { + Visit(a); + } else if (MakeInstPattern(MatchValue(a), MatchIgnore(), + MatchIgnore()) + .DoMatch(node)) { + Visit(a); + } else if (MakeInstPattern(MatchValue(a)).DoMatch(node)) { + return Visit(a); + } else if (MakeInstPattern(MatchValue(a)).DoMatch(node)) { + return Visit(a); + } else if (node.TryInstRecursive() && + node.InstRecursive()->GetOpcode() == IR::Opcode::Phi) { + DEBUG_ASSERT(false && "Phi test"); + products.back().as_factors.emplace_back(node); + } else { + products.back().as_factors.emplace_back(node); + } + } + + void FindIndexInfo(RingAddressInfo& address_info, IR::IREmitter& ir) { + // infer which attribute base the address is indexing + // by how many addends are multiplied by TessellationDataConstantBuffer::m_hsNumPatch. 
+ // Also handle m_hsOutputBase or m_patchConstBase + u32 region_count = 0; + + // Remove addends except for the attribute offset and possibly the + // control point index calc + std::erase_if(products, [&](Product& p) { + for (IR::Value& value : p.as_factors) { + if (value.Type() == IR::Type::Attribute) { + if (value.Attribute() == IR::Attribute::TcsNumPatches || + value.Attribute() == IR::Attribute::TcsOutputBase) { + ++region_count; + return true; + } else if (value.Attribute() == IR::Attribute::TcsPatchConstBase) { + region_count += 2; + return true; + } else if (value.Attribute() == IR::Attribute::TessPatchIdInVgt) { + return true; + } + } + } + return false; + }); + + // DumpIR(*g_program, "before_crash"); + + // Look for some term with a dynamic index (should be the control point index) + for (auto i = 0; i < products.size(); i++) { + auto& factors = products[i].as_factors; + // Remember this as the index term + if (std::any_of(factors.begin(), factors.end(), [&](const IR::Value& v) { + return !v.IsImmediate() || v.Type() == IR::Type::Attribute; + })) { + address_info.offset_in_patch = + ir.IAdd(address_info.offset_in_patch, products[i].as_nested_value); + } else { + ASSERT_MSG(factors.size() == 1, "factors all const but not const folded"); + // Otherwise assume it contributes to the attribute + address_info.offset_in_patch = + ir.IAdd(address_info.offset_in_patch, IR::U32{factors[0]}); + address_info.attribute_byte_offset += factors[0].U32(); + } + } + + if (region_count == 0) { + address_info.region = AttributeRegion::InputCP; + } else if (info.l_stage == LogicalStage::TessellationControl && + runtime_info.hs_info.IsPassthrough()) { + ASSERT(region_count <= 1); + address_info.region = AttributeRegion::PatchConst; + } else { + ASSERT(region_count <= 2); + address_info.region = AttributeRegion(region_count); + } + } + + Info& info; + RuntimeInfo& runtime_info; + + TessellationDataConstantBuffer tess_constants; + bool within_mul{}; + + // One product in the sum of products making up an address + struct Product { + Product(IR::Value val_) : as_nested_value(val_), as_factors() {} + Product(const Product& other) = default; + ~Product() = default; + + // IR value used as an addend in address calc + IR::U32 as_nested_value; + // all the leaves that feed the multiplication, linear + // TODO small_vector + // boost::container::small_vector as_factors; + std::vector as_factors; + }; + + std::vector products; +}; + +} // namespace + +void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { + g_program = &program; // TODO delete + Info& info = program.info; + Pass pass(info, runtime_info); + for (IR::Block* block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { - IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; + IR::IREmitter ir{*block, + IR::Block::InstructionList::s_iterator_to(inst)}; // TODO sink this const auto opcode = inst.GetOpcode(); switch (opcode) { case IR::Opcode::StoreBufferU32: case IR::Opcode::StoreBufferU32x2: case IR::Opcode::StoreBufferU32x3: case IR::Opcode::StoreBufferU32x4: { + // TODO: rename struct + RingAddressInfo address_info = pass.WalkRingAccess(&inst, ir); + const auto info = inst.Flags(); if (!info.globally_coherent) { break; @@ -89,11 +502,30 @@ void HullShaderTransform(const IR::Program& program, const RuntimeInfo& runtime_ return ir.BitCast(IR::U32{data}); }; const u32 num_dwords = u32(opcode) - u32(IR::Opcode::StoreBufferU32) + 1; - const auto factor_idx = info.inst_offset.Value() >> 2; + const 
u32 gcn_factor_idx = + (info.inst_offset.Value() + address_info.attribute_byte_offset) >> 2; + const IR::Value data = inst.Arg(2); + auto get_factor_attr = [&](u32 gcn_factor_idx) -> IR::Patch { + ASSERT(gcn_factor_idx * 4 < runtime_info.hs_info.tess_factor_stride); + + switch (runtime_info.hs_info.tess_factor_stride) { + case 24: + return IR::PatchFactor(gcn_factor_idx); + case 16: + if (gcn_factor_idx == 3) { + return IR::Patch::TessellationLodInteriorU; + } + return IR::PatchFactor(gcn_factor_idx); + + default: + UNREACHABLE_MSG("Unhandled tess factor stride"); + } + }; + inst.Invalidate(); if (num_dwords == 1) { - ir.SetPatch(IR::PatchFactor(factor_idx), GetValue(data)); + ir.SetPatch(get_factor_attr(gcn_factor_idx), GetValue(data)); break; } auto* inst = data.TryInstRecursive(); @@ -101,13 +533,20 @@ void HullShaderTransform(const IR::Program& program, const RuntimeInfo& runtime_ inst->GetOpcode() == IR::Opcode::CompositeConstructU32x3 || inst->GetOpcode() == IR::Opcode::CompositeConstructU32x4)); for (s32 i = 0; i < num_dwords; i++) { - ir.SetPatch(IR::PatchFactor(factor_idx + i), GetValue(inst->Arg(i))); + ir.SetPatch(get_factor_attr(gcn_factor_idx + i), GetValue(inst->Arg(i))); } break; } + + // case IR::Opcode::WriteSharedU128: // TODO case IR::Opcode::WriteSharedU32: case IR::Opcode::WriteSharedU64: { - const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; + // DumpIR(program, "before_walk"); + RingAddressInfo address_info = pass.WalkRingAccess(&inst, ir); + + const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 + ? 1 + : (opcode == IR::Opcode::WriteSharedU64 ? 2 : 4); const IR::Value data = inst.Arg(1); const auto [data_lo, data_hi] = [&] -> std::pair { if (num_dwords == 1) { @@ -116,38 +555,148 @@ void HullShaderTransform(const IR::Program& program, const RuntimeInfo& runtime_ const auto* prod = data.InstRecursive(); return {IR::U32{prod->Arg(0)}, IR::U32{prod->Arg(1)}}; }(); - const IR::Inst* ds_offset = inst.Arg(0).InstRecursive(); - const u32 offset_dw = ds_offset->Arg(1).U32() >> 4; - IR::Inst* prod = ds_offset->Arg(0).TryInstRecursive(); - ASSERT(prod && (prod->GetOpcode() == IR::Opcode::IAdd32 || - prod->GetOpcode() == IR::Opcode::IMul32)); - if (prod->GetOpcode() == IR::Opcode::IAdd32) { - prod = prod->Arg(0).TryInstRecursive(); - ASSERT(prod && prod->GetOpcode() == IR::Opcode::IMul32); - } - prod = prod->Arg(0).TryInstRecursive(); - ASSERT(prod && prod->GetOpcode() == IR::Opcode::BitFieldSExtract && - prod->Arg(2).IsImmediate() && prod->Arg(2).U32() == 24); - prod = prod->Arg(0).TryInstRecursive(); - ASSERT(prod && prod->GetOpcode() == IR::Opcode::BitFieldUExtract); - const u32 bit_pos = prod->Arg(1).U32(); - const auto SetOutput = [&ir](IR::U32 value, u32 offset_dw, bool is_patch_const) { + + const auto SetOutput = [&](IR::U32 value, u32 offset_dw, + AttributeRegion output_kind) { const IR::F32 data = ir.BitCast(value); - if (!is_patch_const) { + if (output_kind == AttributeRegion::OutputCP) { const u32 param = offset_dw >> 2; const u32 comp = offset_dw & 3; + // Invocation ID array index is implicit, handled by SPIRV backend ir.SetAttribute(IR::Attribute::Param0 + param, data, comp); } else { + ASSERT(output_kind == AttributeRegion::PatchConst); ir.SetPatch(IR::PatchGeneric(offset_dw), data); } }; - ASSERT_MSG(bit_pos == 0 || bit_pos == 8, "Unknown bit extract pos {}", bit_pos); - const bool is_patch_const = bit_pos == 0; - SetOutput(data_lo, offset_dw, is_patch_const); + + u32 offset_dw = address_info.attribute_byte_offset >> 2; + 
SetOutput(data_lo, offset_dw, address_info.region); if (num_dwords > 1) { - SetOutput(data_hi, offset_dw + 1, is_patch_const); + // TODO handle WriteSharedU128 + SetOutput(data_hi, offset_dw + 1, address_info.region); } inst.Invalidate(); + + break; + } + + case IR::Opcode::LoadSharedU32: { + // case IR::Opcode::LoadSharedU64: + // case IR::Opcode::LoadSharedU128: + RingAddressInfo address_info = pass.WalkRingAccess(&inst, ir); + + ASSERT(address_info.region == AttributeRegion::InputCP || + address_info.region == AttributeRegion::OutputCP); + switch (address_info.region) { + case AttributeRegion::InputCP: { + u32 offset_dw = + (address_info.attribute_byte_offset % runtime_info.hs_info.ls_stride) >> 2; + const u32 param = offset_dw >> 2; + const u32 comp = offset_dw & 3; + IR::Value control_point_index = + ir.IDiv(IR::U32{address_info.offset_in_patch}, + ir.Imm32(runtime_info.hs_info.ls_stride)); + IR::Value get_attrib = + ir.GetAttribute(IR::Attribute::Param0 + param, comp, control_point_index); + get_attrib = ir.BitCast(IR::F32{get_attrib}); + inst.ReplaceUsesWithAndRemove(get_attrib); + break; + } + case AttributeRegion::OutputCP: { + UNREACHABLE_MSG("Unhandled output control point read"); + break; + } + default: + break; + } + } + + default: + break; + } + } + } + + if (runtime_info.hs_info.IsPassthrough()) { + // Copy input attributes to output attributes, indexed by InvocationID + // Passthrough should imply that input and output patches have same number of vertices + IR::Block* entry_block = *program.blocks.begin(); + auto it = std::ranges::find_if(entry_block->Instructions(), [](IR::Inst& inst) { + return inst.GetOpcode() == IR::Opcode::Prologue; + }); + ASSERT(it != entry_block->end()); + ++it; + ASSERT(it != entry_block->end()); + ++it; + // Prologue + // SetExec #true + // <- insert here + // ... + IR::IREmitter ir{*entry_block, it}; + + ASSERT(runtime_info.hs_info.ls_stride % 16 == 0); + u32 num_attributes = runtime_info.hs_info.ls_stride / 16; + const auto invocation_id = ir.GetAttributeU32(IR::Attribute::InvocationId); + for (u32 i = 0; i < num_attributes; i++) { + for (u32 j = 0; j < 4; j++) { + const auto input_attr = + ir.GetAttribute(IR::Attribute::Param0 + i, j, invocation_id); + // InvocationId is implicit index for output control point writes + ir.SetAttribute(IR::Attribute::Param0 + i, input_attr, j); + } + } + // TODO: wrap rest of program with if statement when passthrough? + // copy passthrough attributes ... + // if (InvocationId == 0) { + // program ... 
+ // } + } +} + +// TODO refactor +void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { + Info& info = program.info; + Pass pass(info, runtime_info); + + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto opcode = inst.GetOpcode(); + switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU32: { + // case IR::Opcode::LoadSharedU64: + // case IR::Opcode::LoadSharedU128: // TODO + RingAddressInfo address_info = pass.WalkRingAccess(&inst, ir); + + ASSERT(address_info.region == AttributeRegion::OutputCP || + address_info.region == AttributeRegion::PatchConst); + switch (address_info.region) { + case AttributeRegion::OutputCP: { + u32 offset_dw = (address_info.attribute_byte_offset % + runtime_info.vs_info.hs_output_cp_stride) >> + 2; + const u32 param = offset_dw >> 2; + const u32 comp = offset_dw & 3; + IR::Value control_point_index = + ir.IDiv(IR::U32{address_info.offset_in_patch}, + ir.Imm32(runtime_info.vs_info.hs_output_cp_stride)); + IR::Value get_attrib = + ir.GetAttribute(IR::Attribute::Param0 + param, comp, control_point_index); + get_attrib = ir.BitCast(IR::F32{get_attrib}); + inst.ReplaceUsesWithAndRemove(get_attrib); + break; + } + case AttributeRegion::PatchConst: { + u32 offset_dw = address_info.attribute_byte_offset >> 2; + IR::Value get_patch = ir.GetPatch(IR::PatchGeneric(offset_dw)); + inst.ReplaceUsesWithAndRemove(get_patch); + break; + } + default: + break; + } + break; } default: @@ -155,7 +704,140 @@ void HullShaderTransform(const IR::Program& program, const RuntimeInfo& runtime_ } } } - LOG_INFO(Render_Vulkan, "{}", IR::DumpProgram(program)); +} + +// Run before copy prop +void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) { + TessellationDataConstantBuffer tess_constants; + Shader::Info& info = program.info; + // Find the TessellationDataConstantBuffer V# + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::LoadSharedU128: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::WriteSharedU128: { + IR::Value addr = inst.Arg(0); + auto read_const_buffer = IR::BreadthFirstSearch( + addr, [](IR::Inst* maybe_tess_const) -> std::optional { + if (maybe_tess_const->GetOpcode() == IR::Opcode::ReadConstBuffer) { + return maybe_tess_const; + } + return std::nullopt; + }); + if (read_const_buffer) { + auto sharp_location = FindTessConstantSharp(read_const_buffer.value()); + if (sharp_location) { + if (info.FoundTessConstantsSharp()) { + ASSERT(static_cast(sharp_location->dword_off) == + info.tess_consts_dword_offset && + sharp_location->ptr_base == info.tess_consts_ptr_base); + } + InitTessConstants(sharp_location->ptr_base, + static_cast(sharp_location->dword_off), info, + runtime_info, tess_constants); + // break; TODO + continue; + } + } + continue; + } + default: + continue; + } + + break; + } + } + + ASSERT(info.FoundTessConstantsSharp()); + + if (info.l_stage == LogicalStage::TessellationControl) { + // Replace the BFEs on V1 (packed with patch id and output cp id) for easier pattern + // matching + for (IR::Block* block : program.blocks) { + for (auto it = block->Instructions().begin(); it != block->Instructions().end(); it++) { + IR::Inst& inst = *it; + if (MakeInstPattern( + MakeInstPattern( + 
MatchAttribute(IR::Attribute::PackedHullInvocationInfo), MatchIgnore()), + MatchU32(0), MatchU32(8)) + .DoMatch(IR::Value{&inst})) { + IR::IREmitter emit(*block, it); + IR::Value replacement = emit.GetAttributeU32(IR::Attribute::TessPatchIdInVgt); + inst.ReplaceUsesWithAndRemove(replacement); + } else if (MakeInstPattern( + MakeInstPattern( + MatchAttribute(IR::Attribute::PackedHullInvocationInfo), + MatchIgnore()), + MatchU32(8), MatchU32(5)) + .DoMatch(IR::Value{&inst})) { + IR::IREmitter ir(*block, it); + IR::Value replacement; + if (runtime_info.hs_info.IsPassthrough()) { + // Deal with annoying pattern in BB where InvocationID use makes no sense + // (in addr calculation for patchconst write) + replacement = ir.Imm32(0); + } else { + replacement = ir.GetAttributeU32(IR::Attribute::InvocationId); + } + inst.ReplaceUsesWithAndRemove(replacement); + } + } + } + } +} + +void TessellationPostprocess(IR::Program& program, RuntimeInfo& runtime_info) { + Shader::Info& info = program.info; + TessellationDataConstantBuffer tess_constants; + InitTessConstants(info.tess_consts_ptr_base, info.tess_consts_dword_offset, info, runtime_info, + tess_constants); + + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (inst.GetOpcode() == IR::Opcode::GetAttributeU32) { + switch (inst.Arg(0).Attribute()) { + case IR::Attribute::TcsLsStride: + ASSERT(info.l_stage == LogicalStage::TessellationControl); + inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.m_lsStride)); + break; + case IR::Attribute::TcsCpStride: + inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.m_hsCpStride)); + break; + case IR::Attribute::TcsNumPatches: + case IR::Attribute::TcsOutputBase: + case IR::Attribute::TcsPatchConstSize: + case IR::Attribute::TcsPatchConstBase: + case IR::Attribute::TcsPatchOutputSize: + case IR::Attribute::TcsFirstEdgeTessFactorIndex: + default: + break; + } + } + } + } + + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::LoadSharedU128: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::WriteSharedU128: + UNREACHABLE_MSG("Remaining DS instruction. {} transform failed", + info.l_stage == LogicalStage::TessellationControl ? 
"Hull" + : "Domain"); + default: + break; + } + } + } } } // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 5228006ed..57fd79d55 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -16,7 +16,11 @@ void FlattenExtendedUserdataPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); void LowerSharedMemToRegisters(IR::Program& program); -void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info); -void HullShaderTransform(const IR::Program& program, const RuntimeInfo& runtime_info); +void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, + Stage stage); +void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info); +void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); +void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); +void TessellationPostprocess(IR::Program& program, RuntimeInfo& runtime_info); } // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index 207d82e6f..d6f1efb12 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" -#include "shader_recompiler/info.h" #include "shader_recompiler/ir/ir_emitter.h" #include "shader_recompiler/ir/opcodes.h" #include "shader_recompiler/ir/program.h" @@ -10,80 +9,12 @@ #include "shader_recompiler/recompiler.h" #include "shader_recompiler/runtime_info.h" -namespace { - -// TODO clean this up. 
Maybe remove -// from https://github.com/chaotic-cx/mesa-mirror/blob/main/src/amd/compiler/README.md -// basically logical stage x hw stage permutations -enum class SwHwStagePerm { - vertex_vs, - fragment_fs, - vertex_ls, - tess_control_hs, - tess_eval_vs, - vertex_es, - geometry_gs, - gs_copy_vs, - tess_eval_es, - compute_cs, -}; - -static SwHwStagePerm GetSwHwStagePerm(Shader::Stage hw_stage, Shader::LogicalStage sw_stage) { - using namespace Shader; - switch (sw_stage) { - case LogicalStage::Fragment: - ASSERT(hw_stage == Stage::Fragment); - return SwHwStagePerm::fragment_fs; - case LogicalStage::Vertex: { - switch (hw_stage) { - case Stage::Vertex: - return SwHwStagePerm::vertex_vs; - case Stage::Export: - return SwHwStagePerm::vertex_es; - case Stage::Local: - return SwHwStagePerm::vertex_ls; - default: - UNREACHABLE(); - } - } break; - case LogicalStage::TessellationControl: - ASSERT(hw_stage == Stage::Hull); - return SwHwStagePerm::tess_control_hs; - case LogicalStage::TessellationEval: { - switch (hw_stage) { - case Stage::Vertex: - return SwHwStagePerm::tess_eval_vs; - case Stage::Export: - return SwHwStagePerm::tess_eval_es; - default: - UNREACHABLE(); - } - } - case LogicalStage::Geometry: - ASSERT(hw_stage == Stage::Geometry); - return SwHwStagePerm::geometry_gs; - case LogicalStage::GsCopy: - ASSERT(hw_stage == Stage::Vertex); - return SwHwStagePerm::gs_copy_vs; - case LogicalStage::Compute: - ASSERT(hw_stage == Stage::Compute); - return SwHwStagePerm::compute_cs; - default: - UNREACHABLE(); - } -} - -}; // namespace - namespace Shader::Optimization { -void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info) { +void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, + Stage stage) { auto& info = program.info; - Stage stage = info.stage; - LogicalStage l_stage = info.l_stage; - SwHwStagePerm stage_perm = GetSwHwStagePerm(stage, l_stage); - const auto& ForEachInstruction = [&](auto func) { for (IR::Block* block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { @@ -93,8 +24,8 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim } }; - switch (stage_perm) { - case SwHwStagePerm::vertex_ls: { + switch (stage) { + case Stage::Local: { ForEachInstruction([=](IR::IREmitter& ir, IR::Inst& inst) { const auto opcode = inst.GetOpcode(); switch (opcode) { @@ -126,7 +57,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim }); break; } - case SwHwStagePerm::vertex_es: { + case Stage::Export: { ForEachInstruction([=](IR::IREmitter& ir, IR::Inst& inst) { const auto opcode = inst.GetOpcode(); switch (opcode) { @@ -157,7 +88,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim }); break; } - case SwHwStagePerm::geometry_gs: { + case Stage::Geometry: { const auto& gs_info = runtime_info.gs_info; info.gs_copy_data = Shader::ParseCopyShader(gs_info.vs_copy); @@ -171,7 +102,7 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim } const auto shl_inst = inst.Arg(1).TryInstRecursive(); - const auto vertex_id = shl_inst->Arg(0).Resolve().U32() >> 2; + const auto vertex_id = ir.Imm32(shl_inst->Arg(0).Resolve().U32() >> 2); const auto offset = inst.Arg(1).TryInstRecursive()->Arg(1); const auto bucket = offset.Resolve().U32() / 256u; const auto attrib = bucket < 4 ? 
IR::Attribute::Position0 diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index 49d85fc28..ed1e5536a 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -16,9 +16,9 @@ #include "shader_recompiler/exception.h" #include "shader_recompiler/ir/attribute.h" #include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/patch.h" #include "shader_recompiler/ir/reg.h" #include "shader_recompiler/ir/type.h" -#include "shader_recompiler/ir/patch.h" namespace Shader::IR { diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 399b08a2a..584211602 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -32,7 +32,7 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) { } IR::Program TranslateProgram(std::span code, Pools& pools, Info& info, - const RuntimeInfo& runtime_info, const Profile& profile) { + RuntimeInfo& runtime_info, const Profile& profile) { // Ensure first instruction is expected. constexpr u32 token_mov_vcchi = 0xBEEB03FF; if (code[0] != token_mov_vcchi) { @@ -65,53 +65,54 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info // Run optimization passes const auto stage = program.info.stage; - bool dump_ir = true; - bool extra_id_removal = true; // TODO remove all this stuff auto dumpMatchingIR = [&](std::string phase) { - if (dump_ir) { - if (Config::dumpShaders()) { - std::string s = IR::DumpProgram(program); - using namespace Common::FS; - const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; - if (!std::filesystem::exists(dump_dir)) { - std::filesystem::create_directories(dump_dir); - } - const auto filename = - fmt::format("{}_{:#018x}.{}.ir.txt", info.stage, info.pgm_hash, phase); - const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; - file.WriteString(s); + if (Config::dumpShaders()) { + std::string s = IR::DumpProgram(program); + using namespace Common::FS; + const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; + if (!std::filesystem::exists(dump_dir)) { + std::filesystem::create_directories(dump_dir); } + const auto filename = + fmt::format("{}_{:#018x}.{}.ir.txt", info.stage, info.pgm_hash, phase); + const auto file = IOFile{dump_dir / filename, FileAccessMode::Write}; + file.WriteString(s); } }; + dumpMatchingIR("init"); + Shader::Optimization::SsaRewritePass(program.post_order_blocks); - if (extra_id_removal) { - Shader::Optimization::IdentityRemovalPass(program.blocks); - } + Shader::Optimization::IdentityRemovalPass(program.blocks); + // Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + dumpMatchingIR("post_ssa"); if (stage == Stage::Hull) { + Shader::Optimization::TessellationPreprocess(program, runtime_info); + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); dumpMatchingIR("pre_hull"); Shader::Optimization::HullShaderTransform(program, runtime_info); dumpMatchingIR("post_hull"); + Shader::Optimization::TessellationPostprocess(program, runtime_info); + } else if (info.l_stage == LogicalStage::TessellationEval) { + Shader::Optimization::TessellationPreprocess(program, runtime_info); + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + dumpMatchingIR("pre_domain"); + Shader::Optimization::DomainShaderTransform(program, runtime_info); + dumpMatchingIR("post_domain"); + Shader::Optimization::TessellationPostprocess(program, runtime_info); } 
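The tessellation branch above settles into a fixed ordering: preprocess, constant-fold, stage transform, postprocess. Below is a minimal sketch of that schedule pulled out into a standalone helper, assuming the pass declarations used above are all in scope via ir_passes.h; the wrapper function itself is hypothetical and not part of this patch.

#include "shader_recompiler/ir/passes/ir_passes.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/runtime_info.h"

// Hypothetical helper mirroring the ordering used in TranslateProgram above:
// preprocess, then constant propagation, then the stage-specific transform,
// then postprocess.
static void RunTessellationPasses(Shader::IR::Program& program,
                                  Shader::RuntimeInfo& runtime_info, bool is_hull) {
    using namespace Shader::Optimization;
    TessellationPreprocess(program, runtime_info);
    ConstantPropagationPass(program.post_order_blocks);
    if (is_hull) {
        HullShaderTransform(program, runtime_info);
    } else {
        DomainShaderTransform(program, runtime_info);
    }
    TessellationPostprocess(program, runtime_info);
}

Running ConstantPropagationPass before the transforms appears intended to fold the attribute and address arithmetic that the Hull/Domain transforms pattern-match on; that reading is an inference from the ordering, not something the patch states.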
Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); - if (extra_id_removal) { - Shader::Optimization::IdentityRemovalPass(program.blocks); - } - dumpMatchingIR("pre_ring"); - Shader::Optimization::RingAccessElimination(program, runtime_info); - if (extra_id_removal) { - Shader::Optimization::IdentityRemovalPass(program.blocks); - } - dumpMatchingIR("post_ring"); + Shader::Optimization::RingAccessElimination(program, runtime_info, stage); if (stage != Stage::Compute) { Shader::Optimization::LowerSharedMemToRegisters(program); } - Shader::Optimization::RingAccessElimination(program, runtime_info, program.info.stage); + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::FlattenExtendedUserdataPass(program); Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::IdentityRemovalPass(program.blocks); Shader::Optimization::DeadCodeEliminationPass(program); Shader::Optimization::CollectShaderInfoPass(program); + dumpMatchingIR("final"); return program; } diff --git a/src/shader_recompiler/recompiler.h b/src/shader_recompiler/recompiler.h index f8acf6c9e..8180c29b3 100644 --- a/src/shader_recompiler/recompiler.h +++ b/src/shader_recompiler/recompiler.h @@ -28,6 +28,6 @@ struct Pools { }; [[nodiscard]] IR::Program TranslateProgram(std::span code, Pools& pools, Info& info, - const RuntimeInfo& runtime_info, const Profile& profile); + RuntimeInfo& runtime_info, const Profile& profile); } // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 808e734ac..290528dae 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -7,6 +7,7 @@ #include #include #include "common/types.h" +#include "shader_recompiler/frontend/tessellation.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/amdgpu/types.h" @@ -74,30 +75,56 @@ struct VertexRuntimeInfo { u32 num_outputs; std::array outputs; bool emulate_depth_negative_one_to_one{}; + // Domain AmdGpu::TessellationType tess_type; AmdGpu::TessellationTopology tess_topology; AmdGpu::TessellationPartitioning tess_partitioning; + u32 hs_output_cp_stride{}; bool operator==(const VertexRuntimeInfo& other) const noexcept { return emulate_depth_negative_one_to_one == other.emulate_depth_negative_one_to_one && tess_type == other.tess_type && tess_topology == other.tess_topology && - tess_partitioning == other.tess_partitioning; + tess_partitioning == other.tess_partitioning && + hs_output_cp_stride == other.hs_output_cp_stride; + } + + void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) { + hs_output_cp_stride = tess_constants.m_hsCpStride; } }; struct HullRuntimeInfo { + // from registers u32 output_control_points; - // trying to debug TODO probably delete this - u32 input_control_points; - u32 num_patches; - u32 num_instances; - u64 tess_factor_memory_base; - AmdGpu::TessellationType tess_type; - AmdGpu::TessellationTopology tess_topology; - AmdGpu::TessellationPartitioning tess_partitioning; - bool operator==(const HullRuntimeInfo& other) const noexcept { - return output_control_points == other.output_control_points; + // from HullStateConstants in HsProgram (TODO dont rely on this) + u32 tess_factor_stride; + + // from tess constants buffer + u32 ls_stride; + u32 hs_output_cp_stride; + u32 hs_num_patch; + u32 hs_output_base; + u32 patch_const_size; + u32 patch_const_base; + u32 patch_output_size; + u32 first_edge_tess_factor_index; + + auto 
operator<=>(const HullRuntimeInfo&) const noexcept = default; + + bool IsPassthrough() { + return hs_output_base == 0; + }; + + void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) { + ls_stride = tess_constants.m_lsStride; + hs_output_cp_stride = tess_constants.m_hsCpStride; + hs_num_patch = tess_constants.m_hsNumPatch; + hs_output_base = tess_constants.m_hsOutputBase; + patch_const_size = tess_constants.m_patchConstSize; + patch_const_base = tess_constants.m_patchConstBase; + patch_output_size = tess_constants.m_patchOutputSize; + first_edge_tess_factor_index = tess_constants.m_firstEdgeTessFactorIndex; } }; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index f9dbf71d3..908a4c73b 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -143,6 +143,22 @@ struct Liverpool { } }; + struct HsStageRegisters { + u32 vgt_tf_param; + u32 vgt_hos_max_tess_level; + u32 vgt_hos_min_tess_level; + }; + + struct HsConstants { + u32 num_input_cp; + u32 num_output_cp; + u32 num_patch_const; + u32 cp_stride; + u32 num_threads; + u32 tess_factor_stride; + u32 first_edge_tess_factor_index; + }; + struct ComputeProgram { u32 dispatch_initiator; u32 dim_x; @@ -974,7 +990,8 @@ struct Liverpool { BitField<2, 1, u32> hs_en; BitField<3, 2, u32> es_en; BitField<5, 1, u32> gs_en; - BitField<6, 1, u32> vs_en; + BitField<6, 2, u32> vs_en; + BitField<8, 24, u32> dynamic_hs; // TODO testing bool IsStageEnabled(u32 stage) const { switch (stage) { @@ -1145,7 +1162,11 @@ struct Liverpool { ShaderProgram es_program; INSERT_PADDING_WORDS(0x2C); ShaderProgram hs_program; - INSERT_PADDING_WORDS(0x2C); + // TODO delete. These don't actually correspond to real registers, but I'll stash them + // here to debug + HsStageRegisters hs_registers; + HsConstants hs_constants; + INSERT_PADDING_WORDS(0x2D48 - 0x2d08 - 20 - 3 - 7); ShaderProgram ls_program; INSERT_PADDING_WORDS(0xA4); ComputeProgram cs_program; @@ -1432,6 +1453,8 @@ static_assert(GFX6_3D_REG_INDEX(vs_program.user_data) == 0x2C4C); static_assert(GFX6_3D_REG_INDEX(gs_program) == 0x2C88); static_assert(GFX6_3D_REG_INDEX(es_program) == 0x2CC8); static_assert(GFX6_3D_REG_INDEX(hs_program) == 0x2D08); +static_assert(GFX6_3D_REG_INDEX(hs_registers) == 0x2D1C); +static_assert(GFX6_3D_REG_INDEX(hs_constants) == 0x2D1F); static_assert(GFX6_3D_REG_INDEX(ls_program) == 0x2D48); static_assert(GFX6_3D_REG_INDEX(cs_program) == 0x2E00); static_assert(GFX6_3D_REG_INDEX(cs_program.dim_z) == 0x2E03); diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 4904b9d1c..144fec934 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -30,6 +30,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul const vk::Device device = instance.GetDevice(); std::ranges::copy(infos, stages.begin()); BuildDescSetLayout(); + const bool uses_tessellation = stages[u32(LogicalStage::TessellationControl)]; const vk::PushConstantRange push_constants = { .stageFlags = gp_stage_flags, @@ -107,8 +108,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul "Primitive restart index other than -1 is not supported yet"); const vk::PipelineTessellationStateCreateInfo tessellation_state = { - // TODO how to handle optional member of graphics key when dynamic state not supported? - //.patchControlPoints = key. 
+ .patchControlPoints = key.patch_control_points, }; const vk::PipelineRasterizationStateCreateInfo raster_state = { @@ -173,8 +173,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul } else { dynamic_states.push_back(vk::DynamicState::eVertexInputBindingStrideEXT); } - ASSERT(instance.IsPatchControlPointsDynamicState()); // TODO remove - if (instance.IsPatchControlPointsDynamicState()) { + if (uses_tessellation && instance.IsPatchControlPointsDynamicState()) { dynamic_states.push_back(vk::DynamicState::ePatchControlPointsEXT); } @@ -326,8 +325,9 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul .pStages = shader_stages.data(), .pVertexInputState = !instance.IsVertexInputDynamicState() ? &vertex_input_info : nullptr, .pInputAssemblyState = &input_assembly, - .pTessellationState = - !instance.IsPatchControlPointsDynamicState() ? &tessellation_state : nullptr, + .pTessellationState = (uses_tessellation && !instance.IsPatchControlPointsDynamicState()) + ? &tessellation_state + : nullptr, .pViewportState = &viewport_info, .pRasterizationState = &raster_state, .pMultisampleState = &multisampling, diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 99588cb3b..444c8517e 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -52,6 +52,7 @@ struct GraphicsPipelineKey { std::array blend_controls; std::array write_masks; std::array vertex_buffer_formats; + u32 patch_control_points; bool operator==(const GraphicsPipelineKey& key) const noexcept { return std::memcmp(this, &key, sizeof(key)) == 0; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 009e9a42e..c25e5cd6c 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -258,7 +258,8 @@ bool Instance::CreateDevice() { add_extension(VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME); vertex_input_dynamic_state = add_extension(VK_EXT_VERTEX_INPUT_DYNAMIC_STATE_EXTENSION_NAME); fragment_shader_barycentric = add_extension(VK_KHR_FRAGMENT_SHADER_BARYCENTRIC_EXTENSION_NAME); - extended_dynamic_state_2 = add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); + const bool extended_dynamic_state_2 = + add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_2_EXTENSION_NAME); // The next two extensions are required to be available together in order to support write masks color_write_en = add_extension(VK_EXT_COLOR_WRITE_ENABLE_EXTENSION_NAME); @@ -328,6 +329,7 @@ bool Instance::CreateDevice() { .imageCubeArray = features.imageCubeArray, .independentBlend = features.independentBlend, .geometryShader = features.geometryShader, + .tessellationShader = features.tessellationShader, .logicOp = features.logicOp, .depthBiasClamp = features.depthBiasClamp, .fillModeNonSolid = features.fillModeNonSolid, @@ -379,6 +381,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceExtendedDynamicStateFeaturesEXT{ .extendedDynamicState = true, }, + vk::PhysicalDeviceExtendedDynamicState2FeaturesEXT{ + .extendedDynamicState2PatchControlPoints = true, + }, vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT{ .extendedDynamicState3ColorWriteMask = true, }, @@ -454,6 +459,16 @@ bool Instance::CreateDevice() { if (!legacy_vertex_attributes) { device_chain.unlink(); } + if (extended_dynamic_state_2) { + patch_control_points_dynamic_state = + 
feature_chain.get() + .extendedDynamicState2PatchControlPoints; + device_chain.get() + .extendedDynamicState2PatchControlPoints = patch_control_points_dynamic_state; + } else { + patch_control_points_dynamic_state = false; + device_chain.unlink(); + } auto [device_result, dev] = physical_device.createDeviceUnique(device_chain.get()); if (device_result != vk::Result::eSuccess) { diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h index 844e1e6c0..98a5f6289 100644 --- a/src/video_core/renderer_vulkan/vk_instance.h +++ b/src/video_core/renderer_vulkan/vk_instance.h @@ -134,7 +134,7 @@ public: } bool IsPatchControlPointsDynamicState() const { - return extended_dynamic_state_2; + return patch_control_points_dynamic_state; } /// Returns true when the nullDescriptor feature of VK_EXT_robustness2 is supported. @@ -337,7 +337,7 @@ private: bool debug_utils_supported{}; bool has_nsight_graphics{}; bool has_renderdoc{}; - bool extended_dynamic_state_2{}; + bool patch_control_points_dynamic_state{}; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index c1d937059..1ca460b42 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -97,15 +97,15 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_ } case Stage::Hull: { BuildCommon(regs.hs_program); - info.hs_info.output_control_points = regs.ls_hs_config.hs_output_control_points.Value(); - info.hs_info.input_control_points = regs.ls_hs_config.hs_input_control_points; - info.hs_info.num_patches = regs.ls_hs_config.num_patches; - // Suspicious about this in apparently "passthrough" hull shader. 
Probably not releva - info.hs_info.num_instances = regs.num_instances.NumInstances(); - info.hs_info.tess_factor_memory_base = regs.vgt_tf_memory_base.MemoryBase(); - info.hs_info.tess_type = regs.tess_config.type; - info.hs_info.tess_topology = regs.tess_config.topology; - info.hs_info.tess_partitioning = regs.tess_config.partitioning; + // TODO: ls_hs_config.output_control_points seems to be == 1 when doing passthrough + // instead of the real number which matches the input patch topology + // info.hs_info.output_control_points = regs.ls_hs_config.hs_output_control_points.Value(); + + // TODO dont rely on HullStateConstants + info.hs_info.output_control_points = regs.hs_constants.num_output_cp; + info.hs_info.tess_factor_stride = regs.hs_constants.tess_factor_stride; + + // We need to initialize most hs_info fields after finding the V# with tess constants break; } case Stage::Export: { @@ -244,27 +244,6 @@ const ComputePipeline* PipelineCache::GetComputePipeline() { return it->second.get(); } -bool ShouldSkipShader(u64 shader_hash, const char* shader_type) { - static std::vector skip_hashes = { - 0xbc234799 /* passthrough */, - 0x8453cd1c /* passthrough */, - 0xd67db0ef /* passthrough */, - 0x34121ac6 /* passthrough*/, - 0xa26750c1 /* passthrough, warp */, - 0xbb88db5f /* passthrough */, - 0x90c6fb05 /* passthrough */, - 0x9fd272d7 /* forbidden woods (not PS) */, - 0x2807dd6c /* forbidden woods, down elevator (not PS) */, - 0x627ac5b9 /* ayyylmao*, passthrough */, - 0xb5fb5174 /* rom (not PS) */, - }; - if (std::ranges::contains(skip_hashes, shader_hash)) { - LOG_WARNING(Render_Vulkan, "Skipped {} shader hash {:#x}.", shader_type, shader_hash); - return true; - } - return false; -} - bool PipelineCache::RefreshGraphicsKey() { std::memset(&graphics_key, 0, sizeof(GraphicsPipelineKey)); @@ -321,6 +300,11 @@ bool PipelineCache::RefreshGraphicsKey() { key.mrt_swizzles.fill(Liverpool::ColorBuffer::SwapMode::Standard); key.vertex_buffer_formats.fill(vk::Format::eUndefined); + key.patch_control_points = 0; + if (regs.stage_enable.hs_en.Value() && !instance.IsPatchControlPointsDynamicState()) { + key.patch_control_points = regs.ls_hs_config.hs_input_control_points.Value(); + } + // First pass of bindings check to idenitfy formats and swizzles and pass them to rhe shader // recompiler. 
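When the patch-control-points dynamic state is available, key.patch_control_points stays 0 and the count has to be supplied at command-record time instead. The recording side is not part of this hunk, so the following is only a sketch: the helper name and the reuse of hs_input_control_points mirror the key setup above and are assumptions, while setPatchControlPointsEXT itself is the standard Vulkan-Hpp entry point for VK_EXT_extended_dynamic_state2.

#include <vulkan/vulkan.hpp>

// Sketch only: record-time counterpart of the baked pipeline state above.
// Supplies the patch size dynamically when the extension path is taken.
static void SetDynamicPatchControlPoints(vk::CommandBuffer cmdbuf,
                                         uint32_t hs_input_control_points,
                                         bool patch_control_points_dynamic_state) {
    if (patch_control_points_dynamic_state && hs_input_control_points != 0) {
        cmdbuf.setPatchControlPointsEXT(hs_input_control_points);
    }
}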
for (auto cb = 0u; cb < Liverpool::NumColorBuffers; ++cb) { @@ -373,10 +357,6 @@ bool PipelineCache::RefreshGraphicsKey() { return false; } - if (ShouldSkipShader(bininfo->shader_hash, "graphics")) { - return false; - } - auto params = Liverpool::GetParams(*pgm); std::optional fetch_shader_; std::tie(infos[stage_out_idx], modules[stage_out_idx], fetch_shader_, @@ -497,8 +477,7 @@ bool PipelineCache::RefreshComputeKey() { return true; } -vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, - const Shader::RuntimeInfo& runtime_info, +vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, Shader::RuntimeInfo& runtime_info, std::span code, size_t perm_idx, Shader::Backend::Bindings& binding) { LOG_INFO(Render_Vulkan, "Compiling {} shader {:#x} {}", info.stage, info.pgm_hash, @@ -532,7 +511,7 @@ vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, PipelineCache::Result PipelineCache::GetProgram(Stage stage, LogicalStage l_stage, Shader::ShaderParams params, Shader::Backend::Bindings& binding) { - const auto runtime_info = BuildRuntimeInfo(stage, l_stage); + auto runtime_info = BuildRuntimeInfo(stage, l_stage); auto [it_pgm, new_program] = program_cache.try_emplace(params.hash); if (new_program) { it_pgm.value() = std::make_unique(stage, l_stage, params); @@ -548,6 +527,15 @@ PipelineCache::Result PipelineCache::GetProgram(Stage stage, LogicalStage l_stag auto& program = it_pgm.value(); auto& info = program->info; info.RefreshFlatBuf(); + if (l_stage == LogicalStage::TessellationControl || l_stage == LogicalStage::TessellationEval) { + Shader::TessellationDataConstantBuffer tess_constants; + info.ReadTessConstantBuffer(tess_constants); + if (l_stage == LogicalStage::TessellationControl) { + runtime_info.hs_info.InitFromTessConstants(tess_constants); + } else { + runtime_info.vs_info.InitFromTessConstants(tess_constants); + } + } const auto spec = Shader::StageSpecialization(info, runtime_info, profile, binding); size_t perm_idx = program->modules.size(); vk::ShaderModule module{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index d5170dd15..ec4406448 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -73,7 +73,7 @@ private: std::string_view ext); std::optional> GetShaderPatch(u64 hash, Shader::Stage stage, size_t perm_idx, std::string_view ext); - vk::ShaderModule CompileModule(Shader::Info& info, const Shader::RuntimeInfo& runtime_info, + vk::ShaderModule CompileModule(Shader::Info& info, Shader::RuntimeInfo& runtime_info, std::span code, size_t perm_idx, Shader::Backend::Bindings& binding); Shader::RuntimeInfo BuildRuntimeInfo(Shader::Stage stage, Shader::LogicalStage l_stage); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 9e7a333de..39f45a8a3 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -50,9 +50,9 @@ void Rasterizer::CpSync() { bool Rasterizer::FilterDraw() { const auto& regs = liverpool->regs; // Tessellation is unsupported so skip the draw to avoid locking up the driver. - if (regs.primitive_type == AmdGpu::PrimitiveType::PatchPrimitive) { - return false; - } + // if (regs.primitive_type == AmdGpu::PrimitiveType::PatchPrimitive) { + // return false; + // } // There are several cases (e.g. 
FCE, FMask/HTile decompression) where we don't need to do an
    // actual draw hence can skip pipeline creation.
    if (regs.color_control.mode == Liverpool::ColorControl::OperationMode::EliminateFastClear) {
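For reference on the tess-constants plumbing earlier in this patch (ReadTessConstantBuffer feeding the InitFromTessConstants helpers), here is a self-contained sketch of how the copied strides and bases are typically consumed. The packed per-patch layout assumed below is an illustration only; this patch does not specify the addressing, and the field meanings in the comments are assumptions.

#include <cstdint>

// Mirrors the fields this patch copies out of TessellationDataConstantBuffer
// into HullRuntimeInfo. Comments describe the assumed meaning of each field.
struct TessConstantsSketch {
    uint32_t ls_stride;             // assumed: bytes per LS output vertex
    uint32_t hs_output_cp_stride;   // assumed: bytes per HS output control point
    uint32_t hs_num_patch;          // assumed: patches per threadgroup
    uint32_t hs_output_base;        // assumed: byte offset where HS outputs start
    uint32_t patch_const_size;      // assumed: bytes of patch constants per patch
    uint32_t patch_const_base;      // assumed: byte offset where patch constants start
    uint32_t output_control_points; // taken from HullRuntimeInfo, not the constant buffer
};

// Assumed layout: HS control-point outputs packed per patch starting at hs_output_base.
constexpr uint32_t HsOutputCpAddr(const TessConstantsSketch& t, uint32_t patch, uint32_t cp) {
    return t.hs_output_base + (patch * t.output_control_points + cp) * t.hs_output_cp_stride;
}

// Assumed layout: one patch-constant block of patch_const_size bytes per patch.
constexpr uint32_t PatchConstAddr(const TessConstantsSketch& t, uint32_t patch) {
    return t.patch_const_base + patch * t.patch_const_size;
}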