mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2025-12-09 05:08:43 +00:00
The DB_SHADER_CONTROL register has several enable flags which must be set before certain depth exports are enabled. This commit adds logic to respect the values in this register when performing depth exports, which fixes the regression in earlier versions of KNACK. I've also renamed DepthBufferControl to DepthShaderControl, since that's closer to the official name for the register.
198 lines
7.9 KiB
C++
198 lines
7.9 KiB
C++
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
#include "shader_recompiler/frontend/translate/translate.h"
|
|
#include "shader_recompiler/ir/position.h"
|
|
#include "shader_recompiler/ir/reinterpret.h"
|
|
#include "shader_recompiler/runtime_info.h"
|
|
|
|
namespace Shader::Gcn {
|
|
|
|
/// Returns the number format to use when unpacking the 16-bit packed
/// components of a compressed export with the given export format.
/// Unsupported formats are unreachable.
static AmdGpu::NumberFormat NumberFormatCompressed(
    AmdGpu::Liverpool::ShaderExportFormat export_format) {
    using ExportFormat = AmdGpu::Liverpool::ShaderExportFormat;
    using NumberFormat = AmdGpu::NumberFormat;
    if (export_format == ExportFormat::ABGR_FP16) {
        return NumberFormat::Float;
    }
    if (export_format == ExportFormat::ABGR_UNORM16) {
        return NumberFormat::Unorm;
    }
    if (export_format == ExportFormat::ABGR_SNORM16) {
        return NumberFormat::Snorm;
    }
    if (export_format == ExportFormat::ABGR_UINT16) {
        return NumberFormat::Uint;
    }
    if (export_format == ExportFormat::ABGR_SINT16) {
        return NumberFormat::Sint;
    }
    UNREACHABLE_MSG("Unimplemented compressed export format {}",
                    static_cast<u32>(export_format));
}
|
|
|
|
/// Restricts the instruction's component-enable mask to the components
/// actually carried by an uncompressed (32-bit per component) export format.
/// Bit i of the mask corresponds to component i (R=0, G=1, B=2, A=3).
static u32 MaskFromExportFormat(u8 mask, AmdGpu::Liverpool::ShaderExportFormat export_format) {
    using ExportFormat = AmdGpu::Liverpool::ShaderExportFormat;
    u32 format_mask{};
    switch (export_format) {
    case ExportFormat::R_32:
        // Red only
        format_mask = 0b0001;
        break;
    case ExportFormat::GR_32:
        // Red and Green only
        format_mask = 0b0011;
        break;
    case ExportFormat::AR_32:
        // Red and Alpha only
        format_mask = 0b1001;
        break;
    case ExportFormat::ABGR_32:
        // All components
        format_mask = 0b1111;
        break;
    default:
        UNREACHABLE_MSG("Unimplemented uncompressed export format {}",
                        static_cast<u32>(export_format));
    }
    return mask & format_mask;
}
|
|
|
|
/// Translates an EXP instruction targeting a color render target (MRT).
/// Reads the enabled components from VGPRs (unpacking 16-bit pairs when the
/// instruction is compressed), applies the color buffer's swizzle and number
/// conversion, then writes them to the MRT attribute.
void Translator::ExportRenderTarget(const GcnInst& inst) {
    const auto& exp = inst.control.exp;
    const IR::Attribute mrt{exp.target};
    // Record that this shader writes this MRT so later passes know about it.
    info.mrt_mask |= 1u << static_cast<u8>(mrt);

    // Dual source blending uses MRT1 for exporting src1
    u32 color_buffer_idx = static_cast<u32>(mrt) - static_cast<u32>(IR::Attribute::RenderTarget0);
    if (runtime_info.fs_info.dual_source_blending && mrt == IR::Attribute::RenderTarget1) {
        // src1 shares color buffer 0's format/swizzle state.
        color_buffer_idx = 0;
    }

    const auto color_buffer = runtime_info.fs_info.color_buffers[color_buffer_idx];
    if (color_buffer.export_format == AmdGpu::Liverpool::ShaderExportFormat::Zero || exp.en == 0) {
        // No export
        return;
    }

    // Component values gathered from the instruction; unset entries stay empty
    // and are skipped during the final swizzle/export loop.
    std::array<IR::F32, 4> components{};
    if (exp.compr) {
        // Components are float16 packed into a VGPR
        const auto num_format = NumberFormatCompressed(color_buffer.export_format);
        // Export R, G
        if (exp.en & 1) {
            const IR::Value unpacked_value =
                ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[0].code)));
            components[0] = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
            components[1] = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
        }
        // Export B, A
        if ((exp.en >> 2) & 1) {
            const IR::Value unpacked_value =
                ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[1].code)));
            components[2] = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
            components[3] = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
        }
    } else {
        // Components are float32 into separate VGPRS
        // Restrict the enable mask to components the export format carries.
        u32 mask = MaskFromExportFormat(exp.en, color_buffer.export_format);
        for (u32 i = 0; i < 4; i++, mask >>= 1) {
            if ((mask & 1) == 0) {
                continue;
            }
            components[i] = ir.GetVectorReg<IR::F32>(IR::VectorReg(inst.src[i].code));
        }
    }

    // Metal seems to have an issue where 8-bit unorm/snorm/sRGB outputs to render target
    // need a bias applied to round correctly; detect and set the flag for that here.
    const auto needs_unorm_fixup = profile.needs_unorm_fixup &&
                                   (color_buffer.num_format == AmdGpu::NumberFormat::Unorm ||
                                    color_buffer.num_format == AmdGpu::NumberFormat::Snorm ||
                                    color_buffer.num_format == AmdGpu::NumberFormat::Srgb) &&
                                   (color_buffer.data_format == AmdGpu::DataFormat::Format8 ||
                                    color_buffer.data_format == AmdGpu::DataFormat::Format8_8 ||
                                    color_buffer.data_format == AmdGpu::DataFormat::Format8_8_8_8);

    // Swizzle components and export
    for (u32 i = 0; i < 4; ++i) {
        const auto swizzled_comp = components[color_buffer.swizzle.Map(i)];
        if (swizzled_comp.IsEmpty()) {
            continue;
        }
        auto converted = ApplyWriteNumberConversion(ir, swizzled_comp, color_buffer.num_conversion);
        if (needs_unorm_fixup) {
            // FIXME: Fix-up for GPUs where float-to-unorm rounding is off from expected.
            converted = ir.FPSub(converted, ir.Imm32(1.f / 127500.f));
        }
        ir.SetAttribute(mrt, converted, i);
    }
}
|
|
|
|
/// Translates an EXP instruction targeting MRTZ (depth/stencil/sample-mask
/// export). Gathers the enabled components and writes them to the matching
/// built-in attributes: Depth, StencilRef, SampleMask.
void Translator::ExportDepth(const GcnInst& inst) {
    const auto& exp = inst.control.exp;
    if (exp.en == 0) {
        // No export
        return;
    }

    // Components gathered from the instruction; unset entries stay empty and
    // are skipped in the final attribute-write loop.
    std::array<IR::F32, 4> components{};
    if (exp.compr) {
        // Components are float16 packed into a VGPR
        const auto num_format = NumberFormatCompressed(runtime_info.fs_info.z_export_format);
        // NOTE(review): unlike the uncompressed path below, mrtz_mask (from
        // DB_SHADER_CONTROL) is not applied here — confirm whether the
        // compressed path should also respect it.
        // Export R, G
        if (exp.en & 1) {
            const IR::Value unpacked_value =
                ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[0].code)));
            components[0] = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
            components[1] = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
        }
        // Export B, A
        if ((exp.en >> 2) & 1) {
            const IR::Value unpacked_value =
                ir.Unpack2x16(num_format, ir.GetVectorReg(IR::VectorReg(inst.src[1].code)));
            components[2] = IR::F32{ir.CompositeExtract(unpacked_value, 0)};
            // Component 3 maps to the Null attribute below, so its value is
            // intentionally never extracted.
            // components[3] = IR::F32{ir.CompositeExtract(unpacked_value, 1)};
        }
    } else {
        // Components are float32 into separate VGPRS
        // Mask the instruction's enable bits by both the export format and the
        // mrtz enables so only permitted depth/stencil exports are performed.
        u32 mask = MaskFromExportFormat(exp.en & runtime_info.fs_info.mrtz_mask,
                                        runtime_info.fs_info.z_export_format);
        for (u32 i = 0; i < 4; i++, mask >>= 1) {
            if ((mask & 1) == 0) {
                continue;
            }
            components[i] = ir.GetVectorReg<IR::F32>(IR::VectorReg(inst.src[i].code));
        }
    }

    // MRTZ component i writes the corresponding built-in; slot 3 is unused.
    static constexpr std::array MrtzBuiltins = {IR::Attribute::Depth, IR::Attribute::StencilRef,
                                                IR::Attribute::SampleMask, IR::Attribute::Null};
    for (u32 i = 0; i < 4; ++i) {
        if (components[i].IsEmpty()) {
            continue;
        }
        ir.SetAttribute(MrtzBuiltins[i], components[i]);
    }
}
|
|
|
|
void Translator::EmitExport(const GcnInst& inst) {
|
|
if (info.stage == Stage::Fragment && inst.control.exp.vm) {
|
|
ir.Discard(ir.LogicalNot(ir.GetExec()));
|
|
}
|
|
|
|
const IR::Attribute attrib{inst.control.exp.target};
|
|
if (IR::IsMrt(attrib)) {
|
|
return ExportRenderTarget(inst);
|
|
}
|
|
if (attrib == IR::Attribute::Depth) {
|
|
return ExportDepth(inst);
|
|
}
|
|
|
|
ASSERT_MSG(!inst.control.exp.compr, "Compressed exports only supported for render targets");
|
|
|
|
u32 mask = inst.control.exp.en;
|
|
for (u32 i = 0; i < 4; i++, mask >>= 1) {
|
|
if ((mask & 1) == 0) {
|
|
continue;
|
|
}
|
|
const auto value = ir.GetVectorReg<IR::F32>(IR::VectorReg(inst.src[i].code));
|
|
if (IsPosition(attrib)) {
|
|
IR::ExportPosition(ir, runtime_info.vs_info, attrib, i, value);
|
|
} else {
|
|
ir.SetAttribute(attrib, value, i);
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace Shader::Gcn
|